Add log-bucketing statistics in util

This commit is contained in:
Nikita Lisitsa 2024-06-03 23:25:29 +03:00
parent 6c5815ff76
commit 30877401a3

View file

@ -1,5 +1,6 @@
#pragma once
#include <psemek/util/spatial_array.hpp>
#include <boost/math/special_functions/erf.hpp>
#include <iostream>
@ -79,6 +80,32 @@ namespace psemek::util
return variance_;
}
T stddev() const
{
return std::sqrt(variance_);
}
T percentile(double p, T const & min, T const & max) const
{
// Assume truncated normal distribution in the range [min, max]
// which is the maximum-entropy distribution on this range
// with specified mean and variance
// See
// https://en.wikipedia.org/wiki/Maximum_entropy_probability_distribution#Other_examples
// https://en.wikipedia.org/wiki/Truncated_normal_distribution
float const mu = mean();
float const sigma = stddev();
if (sigma == T{0})
return mu;
float const alpha = (min - mu) / sigma;
float const beta = (max - mu) / sigma;
return mu + sigma * detail::normal_cdf_inv(std::lerp(detail::normal_cdf(alpha), detail::normal_cdf(beta), p));
}
friend base_statistics merge(base_statistics const & s1, base_statistics const & s2)
{
// See https://stackoverflow.com/questions/1480626/merging-two-statistical-result-sets
@ -115,7 +142,7 @@ namespace psemek::util
std::size_t count() const { return base_.count(); }
T mean() const { return base_.mean(); }
T variance() const { return base_.variance(); }
T stddev() const { return std::sqrt(variance()); }
T stddev() const { return base_.stddev(); }
T min() const { return min_; }
T max() const { return max_; }
@ -149,19 +176,7 @@ namespace psemek::util
template <typename T>
T statistics_lite<T>::percentile(double p) const
{
// Assume truncated normal distribution in the range [min, max]
// which is the maximum-entropy distributioon on this range
// with specified mean and variance
// See
// https://en.wikipedia.org/wiki/Maximum_entropy_probability_distribution#Other_examples
// https://en.wikipedia.org/wiki/Truncated_normal_distribution
float const mu = mean();
float const sigma = stddev();
float const alpha = (min_ - mu) / sigma;
float const beta = (max_ - mu) / sigma;
return mu + sigma * detail::normal_cdf_inv(std::lerp(detail::normal_cdf(alpha), detail::normal_cdf(beta), p));
return base_.percentile(p, min_, max_);
}
template <typename T>
@ -174,25 +189,124 @@ namespace psemek::util
return result;
}
template <typename T, T Precision>
struct statistics_log_bucket
: statistics_lite<T>
{
// Only accepts non-negative values
void push(T const & value, std::size_t count = 1);
T percentile(double p) const;
statistics_lite<T> & lite() { return *this; }
statistics_lite<T> const & lite() const { return *this; }
template <typename H, H P>
friend statistics_log_bucket<H, P> merge(statistics_log_bucket<H, P> const & s1, statistics_log_bucket<H, P> const & s2);
private:
static constexpr T bucket_size = Precision / std::log(2.0);
std::size_t zero_count_ = 0;
util::spatial_array<detail::base_statistics<T>, 1, int> buckets_;
mutable util::spatial_array<std::size_t, 1, int> prefix_count_;
};
template <typename T, T Precision>
void statistics_log_bucket<T, Precision>::push(T const & value, std::size_t count)
{
lite().push(value, count);
if (value == T{0}) [[unlikely]]
++zero_count_;
else
{
int const bucket = std::floor(std::log2(value) / bucket_size);
buckets_.get(bucket).push(value, count);
}
prefix_count_.clear();
}
template <typename T, T Precision>
T statistics_log_bucket<T, Precision>::percentile(double p) const
{
if (prefix_count_.empty())
{
std::size_t count = zero_count_;
prefix_count_.get(buckets_.min(0)) = count;
for (int bucket = buckets_.min(0); bucket < buckets_.max(0); ++bucket)
{
count += buckets_.at(bucket).count();
prefix_count_.get(bucket + 1) = count;
}
}
std::size_t index = std::min<std::size_t>(this->count() - 1, this->count() * p);
auto it = std::lower_bound(prefix_count_.begin(), prefix_count_.end(), index);
if (it == prefix_count_.begin())
return T{0};
--it;
int bucket = buckets_.min(0) + (it - prefix_count_.begin());
double min = std::exp2(bucket * bucket_size);
double max = std::exp2((bucket + 1) * bucket_size);
double q = double(index - *it) / (*(it + 1) - *it);
return buckets_.at(bucket).percentile(q, min, max);
}
template <typename T, T Precision>
std::ostream & operator << (std::ostream & os, statistics_log_bucket<T, Precision> const & s)
{
os << s.lite();
return os;
}
template <typename T, T Precision>
statistics_log_bucket<T, Precision> merge(statistics_log_bucket<T, Precision> const & s1, statistics_log_bucket<T, Precision> const & s2)
{
statistics_log_bucket<T, Precision> result;
result.lite() = merge(s1.lite(), s2.lite());
for (int bucket = s1.buckets_.min(0); bucket < s1.buckets_.max(0); ++bucket)
{
auto const & source = s1.buckets_.at(bucket);
auto & target = result.buckets_.get(bucket);
target = merge(target, source);
}
for (int bucket = s2.buckets_.min(0); bucket < s2.buckets_.max(0); ++bucket)
{
auto const & source = s2.buckets_.at(bucket);
auto & target = result.buckets_.get(bucket);
target = merge(target, source);
}
return result;
}
template <typename T>
struct statistics
: statistics_lite<T>
{
void push(T const & value, std::size_t count = 1);
std::size_t count() const { return values_.size(); }
T percentile(double p) const;
template <typename H>
friend statistics<H> merge(statistics<H> const & s1, statistics<H> const & s2);
statistics_lite<T> & lite() { return *this; }
statistics_lite<T> const & lite() const { return *this; }
private:
mutable std::vector<T> values_;
mutable bool sorted_ = true;
statistics_lite<T> & lite() { return *this; }
statistics_lite<T> const & lite() const { return *this; }
};
template <typename T>
@ -218,7 +332,7 @@ namespace psemek::util
template <typename T>
std::ostream & operator << (std::ostream & os, statistics<T> const & s)
{
os << "mean = " << s.mean() << ", dev = " << s.stddev() << ", range = [" << s.min() << " .. " << s.max() << "]";
os << s.lite();
return os;
}