Add log-bucketing statistics in util
This commit is contained in:
parent
6c5815ff76
commit
30877401a3
1 changed files with 134 additions and 20 deletions
|
|
@ -1,5 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include <psemek/util/spatial_array.hpp>
|
||||
#include <boost/math/special_functions/erf.hpp>
|
||||
|
||||
#include <iostream>
|
||||
|
|
@ -79,6 +80,32 @@ namespace psemek::util
|
|||
return variance_;
|
||||
}
|
||||
|
||||
T stddev() const
|
||||
{
|
||||
return std::sqrt(variance_);
|
||||
}
|
||||
|
||||
T percentile(double p, T const & min, T const & max) const
|
||||
{
|
||||
// Assume truncated normal distribution in the range [min, max]
|
||||
// which is the maximum-entropy distribution on this range
|
||||
// with specified mean and variance
|
||||
// See
|
||||
// https://en.wikipedia.org/wiki/Maximum_entropy_probability_distribution#Other_examples
|
||||
// https://en.wikipedia.org/wiki/Truncated_normal_distribution
|
||||
|
||||
float const mu = mean();
|
||||
float const sigma = stddev();
|
||||
|
||||
if (sigma == T{0})
|
||||
return mu;
|
||||
|
||||
float const alpha = (min - mu) / sigma;
|
||||
float const beta = (max - mu) / sigma;
|
||||
|
||||
return mu + sigma * detail::normal_cdf_inv(std::lerp(detail::normal_cdf(alpha), detail::normal_cdf(beta), p));
|
||||
}
|
||||
|
||||
friend base_statistics merge(base_statistics const & s1, base_statistics const & s2)
|
||||
{
|
||||
// See https://stackoverflow.com/questions/1480626/merging-two-statistical-result-sets
|
||||
|
|
@ -115,7 +142,7 @@ namespace psemek::util
|
|||
std::size_t count() const { return base_.count(); }
|
||||
T mean() const { return base_.mean(); }
|
||||
T variance() const { return base_.variance(); }
|
||||
T stddev() const { return std::sqrt(variance()); }
|
||||
T stddev() const { return base_.stddev(); }
|
||||
T min() const { return min_; }
|
||||
T max() const { return max_; }
|
||||
|
||||
|
|
@ -149,19 +176,7 @@ namespace psemek::util
|
|||
template <typename T>
|
||||
T statistics_lite<T>::percentile(double p) const
|
||||
{
|
||||
// Assume truncated normal distribution in the range [min, max]
|
||||
// which is the maximum-entropy distributioon on this range
|
||||
// with specified mean and variance
|
||||
// See
|
||||
// https://en.wikipedia.org/wiki/Maximum_entropy_probability_distribution#Other_examples
|
||||
// https://en.wikipedia.org/wiki/Truncated_normal_distribution
|
||||
|
||||
float const mu = mean();
|
||||
float const sigma = stddev();
|
||||
float const alpha = (min_ - mu) / sigma;
|
||||
float const beta = (max_ - mu) / sigma;
|
||||
|
||||
return mu + sigma * detail::normal_cdf_inv(std::lerp(detail::normal_cdf(alpha), detail::normal_cdf(beta), p));
|
||||
return base_.percentile(p, min_, max_);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -174,25 +189,124 @@ namespace psemek::util
|
|||
return result;
|
||||
}
|
||||
|
||||
template <typename T, T Precision>
|
||||
struct statistics_log_bucket
|
||||
: statistics_lite<T>
|
||||
{
|
||||
// Only accepts non-negative values
|
||||
void push(T const & value, std::size_t count = 1);
|
||||
|
||||
T percentile(double p) const;
|
||||
|
||||
statistics_lite<T> & lite() { return *this; }
|
||||
statistics_lite<T> const & lite() const { return *this; }
|
||||
|
||||
template <typename H, H P>
|
||||
friend statistics_log_bucket<H, P> merge(statistics_log_bucket<H, P> const & s1, statistics_log_bucket<H, P> const & s2);
|
||||
|
||||
private:
|
||||
static constexpr T bucket_size = Precision / std::log(2.0);
|
||||
|
||||
std::size_t zero_count_ = 0;
|
||||
util::spatial_array<detail::base_statistics<T>, 1, int> buckets_;
|
||||
mutable util::spatial_array<std::size_t, 1, int> prefix_count_;
|
||||
};
|
||||
|
||||
template <typename T, T Precision>
|
||||
void statistics_log_bucket<T, Precision>::push(T const & value, std::size_t count)
|
||||
{
|
||||
lite().push(value, count);
|
||||
if (value == T{0}) [[unlikely]]
|
||||
++zero_count_;
|
||||
else
|
||||
{
|
||||
int const bucket = std::floor(std::log2(value) / bucket_size);
|
||||
buckets_.get(bucket).push(value, count);
|
||||
}
|
||||
|
||||
prefix_count_.clear();
|
||||
}
|
||||
|
||||
template <typename T, T Precision>
|
||||
T statistics_log_bucket<T, Precision>::percentile(double p) const
|
||||
{
|
||||
if (prefix_count_.empty())
|
||||
{
|
||||
std::size_t count = zero_count_;
|
||||
|
||||
prefix_count_.get(buckets_.min(0)) = count;
|
||||
for (int bucket = buckets_.min(0); bucket < buckets_.max(0); ++bucket)
|
||||
{
|
||||
count += buckets_.at(bucket).count();
|
||||
prefix_count_.get(bucket + 1) = count;
|
||||
}
|
||||
}
|
||||
|
||||
std::size_t index = std::min<std::size_t>(this->count() - 1, this->count() * p);
|
||||
|
||||
auto it = std::lower_bound(prefix_count_.begin(), prefix_count_.end(), index);
|
||||
|
||||
if (it == prefix_count_.begin())
|
||||
return T{0};
|
||||
|
||||
--it;
|
||||
|
||||
int bucket = buckets_.min(0) + (it - prefix_count_.begin());
|
||||
|
||||
double min = std::exp2(bucket * bucket_size);
|
||||
double max = std::exp2((bucket + 1) * bucket_size);
|
||||
double q = double(index - *it) / (*(it + 1) - *it);
|
||||
|
||||
return buckets_.at(bucket).percentile(q, min, max);
|
||||
}
|
||||
|
||||
template <typename T, T Precision>
|
||||
std::ostream & operator << (std::ostream & os, statistics_log_bucket<T, Precision> const & s)
|
||||
{
|
||||
os << s.lite();
|
||||
return os;
|
||||
}
|
||||
|
||||
template <typename T, T Precision>
|
||||
statistics_log_bucket<T, Precision> merge(statistics_log_bucket<T, Precision> const & s1, statistics_log_bucket<T, Precision> const & s2)
|
||||
{
|
||||
statistics_log_bucket<T, Precision> result;
|
||||
result.lite() = merge(s1.lite(), s2.lite());
|
||||
|
||||
for (int bucket = s1.buckets_.min(0); bucket < s1.buckets_.max(0); ++bucket)
|
||||
{
|
||||
auto const & source = s1.buckets_.at(bucket);
|
||||
auto & target = result.buckets_.get(bucket);
|
||||
target = merge(target, source);
|
||||
}
|
||||
|
||||
for (int bucket = s2.buckets_.min(0); bucket < s2.buckets_.max(0); ++bucket)
|
||||
{
|
||||
auto const & source = s2.buckets_.at(bucket);
|
||||
auto & target = result.buckets_.get(bucket);
|
||||
target = merge(target, source);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct statistics
|
||||
: statistics_lite<T>
|
||||
{
|
||||
void push(T const & value, std::size_t count = 1);
|
||||
|
||||
std::size_t count() const { return values_.size(); }
|
||||
|
||||
T percentile(double p) const;
|
||||
|
||||
template <typename H>
|
||||
friend statistics<H> merge(statistics<H> const & s1, statistics<H> const & s2);
|
||||
|
||||
statistics_lite<T> & lite() { return *this; }
|
||||
statistics_lite<T> const & lite() const { return *this; }
|
||||
|
||||
private:
|
||||
mutable std::vector<T> values_;
|
||||
mutable bool sorted_ = true;
|
||||
|
||||
statistics_lite<T> & lite() { return *this; }
|
||||
statistics_lite<T> const & lite() const { return *this; }
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -218,7 +332,7 @@ namespace psemek::util
|
|||
template <typename T>
|
||||
std::ostream & operator << (std::ostream & os, statistics<T> const & s)
|
||||
{
|
||||
os << "mean = " << s.mean() << ", dev = " << s.stddev() << ", range = [" << s.min() << " .. " << s.max() << "]";
|
||||
os << s.lite();
|
||||
return os;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue