diff --git a/libs/util/include/psemek/util/statistics.hpp b/libs/util/include/psemek/util/statistics.hpp index ee1950f3..e17afd1c 100644 --- a/libs/util/include/psemek/util/statistics.hpp +++ b/libs/util/include/psemek/util/statistics.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -79,6 +80,32 @@ namespace psemek::util return variance_; } + T stddev() const + { + return std::sqrt(variance_); + } + + T percentile(double p, T const & min, T const & max) const + { + // Assume truncated normal distribution in the range [min, max] + // which is the maximum-entropy distribution on this range + // with specified mean and variance + // See + // https://en.wikipedia.org/wiki/Maximum_entropy_probability_distribution#Other_examples + // https://en.wikipedia.org/wiki/Truncated_normal_distribution + + float const mu = mean(); + float const sigma = stddev(); + + if (sigma == T{0}) + return mu; + + float const alpha = (min - mu) / sigma; + float const beta = (max - mu) / sigma; + + return mu + sigma * detail::normal_cdf_inv(std::lerp(detail::normal_cdf(alpha), detail::normal_cdf(beta), p)); + } + friend base_statistics merge(base_statistics const & s1, base_statistics const & s2) { // See https://stackoverflow.com/questions/1480626/merging-two-statistical-result-sets @@ -115,7 +142,7 @@ namespace psemek::util std::size_t count() const { return base_.count(); } T mean() const { return base_.mean(); } T variance() const { return base_.variance(); } - T stddev() const { return std::sqrt(variance()); } + T stddev() const { return base_.stddev(); } T min() const { return min_; } T max() const { return max_; } @@ -149,19 +176,7 @@ namespace psemek::util template T statistics_lite::percentile(double p) const { - // Assume truncated normal distribution in the range [min, max] - // which is the maximum-entropy distributioon on this range - // with specified mean and variance - // See - // https://en.wikipedia.org/wiki/Maximum_entropy_probability_distribution#Other_examples - // https://en.wikipedia.org/wiki/Truncated_normal_distribution - - float const mu = mean(); - float const sigma = stddev(); - float const alpha = (min_ - mu) / sigma; - float const beta = (max_ - mu) / sigma; - - return mu + sigma * detail::normal_cdf_inv(std::lerp(detail::normal_cdf(alpha), detail::normal_cdf(beta), p)); + return base_.percentile(p, min_, max_); } template @@ -174,25 +189,124 @@ namespace psemek::util return result; } + template + struct statistics_log_bucket + : statistics_lite + { + // Only accepts non-negative values + void push(T const & value, std::size_t count = 1); + + T percentile(double p) const; + + statistics_lite & lite() { return *this; } + statistics_lite const & lite() const { return *this; } + + template + friend statistics_log_bucket merge(statistics_log_bucket const & s1, statistics_log_bucket const & s2); + + private: + static constexpr T bucket_size = Precision / std::log(2.0); + + std::size_t zero_count_ = 0; + util::spatial_array, 1, int> buckets_; + mutable util::spatial_array prefix_count_; + }; + + template + void statistics_log_bucket::push(T const & value, std::size_t count) + { + lite().push(value, count); + if (value == T{0}) [[unlikely]] + ++zero_count_; + else + { + int const bucket = std::floor(std::log2(value) / bucket_size); + buckets_.get(bucket).push(value, count); + } + + prefix_count_.clear(); + } + + template + T statistics_log_bucket::percentile(double p) const + { + if (prefix_count_.empty()) + { + std::size_t count = zero_count_; + + prefix_count_.get(buckets_.min(0)) = count; + for (int bucket = buckets_.min(0); bucket < buckets_.max(0); ++bucket) + { + count += buckets_.at(bucket).count(); + prefix_count_.get(bucket + 1) = count; + } + } + + std::size_t index = std::min(this->count() - 1, this->count() * p); + + auto it = std::lower_bound(prefix_count_.begin(), prefix_count_.end(), index); + + if (it == prefix_count_.begin()) + return T{0}; + + --it; + + int bucket = buckets_.min(0) + (it - prefix_count_.begin()); + + double min = std::exp2(bucket * bucket_size); + double max = std::exp2((bucket + 1) * bucket_size); + double q = double(index - *it) / (*(it + 1) - *it); + + return buckets_.at(bucket).percentile(q, min, max); + } + + template + std::ostream & operator << (std::ostream & os, statistics_log_bucket const & s) + { + os << s.lite(); + return os; + } + + template + statistics_log_bucket merge(statistics_log_bucket const & s1, statistics_log_bucket const & s2) + { + statistics_log_bucket result; + result.lite() = merge(s1.lite(), s2.lite()); + + for (int bucket = s1.buckets_.min(0); bucket < s1.buckets_.max(0); ++bucket) + { + auto const & source = s1.buckets_.at(bucket); + auto & target = result.buckets_.get(bucket); + target = merge(target, source); + } + + for (int bucket = s2.buckets_.min(0); bucket < s2.buckets_.max(0); ++bucket) + { + auto const & source = s2.buckets_.at(bucket); + auto & target = result.buckets_.get(bucket); + target = merge(target, source); + } + + return result; + } + template struct statistics : statistics_lite { void push(T const & value, std::size_t count = 1); - std::size_t count() const { return values_.size(); } - T percentile(double p) const; template friend statistics merge(statistics const & s1, statistics const & s2); + statistics_lite & lite() { return *this; } + statistics_lite const & lite() const { return *this; } + private: mutable std::vector values_; mutable bool sorted_ = true; - - statistics_lite & lite() { return *this; } - statistics_lite const & lite() const { return *this; } }; template @@ -218,7 +332,7 @@ namespace psemek::util template std::ostream & operator << (std::ostream & os, statistics const & s) { - os << "mean = " << s.mean() << ", dev = " << s.stddev() << ", range = [" << s.min() << " .. " << s.max() << "]"; + os << s.lite(); return os; }