Add log-bucketing statistics in util
This commit is contained in:
parent
6c5815ff76
commit
30877401a3
1 changed files with 134 additions and 20 deletions
|
|
@ -1,5 +1,6 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <psemek/util/spatial_array.hpp>
|
||||||
#include <boost/math/special_functions/erf.hpp>
|
#include <boost/math/special_functions/erf.hpp>
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
@ -79,6 +80,32 @@ namespace psemek::util
|
||||||
return variance_;
|
return variance_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
T stddev() const
|
||||||
|
{
|
||||||
|
return std::sqrt(variance_);
|
||||||
|
}
|
||||||
|
|
||||||
|
T percentile(double p, T const & min, T const & max) const
|
||||||
|
{
|
||||||
|
// Assume truncated normal distribution in the range [min, max]
|
||||||
|
// which is the maximum-entropy distribution on this range
|
||||||
|
// with specified mean and variance
|
||||||
|
// See
|
||||||
|
// https://en.wikipedia.org/wiki/Maximum_entropy_probability_distribution#Other_examples
|
||||||
|
// https://en.wikipedia.org/wiki/Truncated_normal_distribution
|
||||||
|
|
||||||
|
float const mu = mean();
|
||||||
|
float const sigma = stddev();
|
||||||
|
|
||||||
|
if (sigma == T{0})
|
||||||
|
return mu;
|
||||||
|
|
||||||
|
float const alpha = (min - mu) / sigma;
|
||||||
|
float const beta = (max - mu) / sigma;
|
||||||
|
|
||||||
|
return mu + sigma * detail::normal_cdf_inv(std::lerp(detail::normal_cdf(alpha), detail::normal_cdf(beta), p));
|
||||||
|
}
|
||||||
|
|
||||||
friend base_statistics merge(base_statistics const & s1, base_statistics const & s2)
|
friend base_statistics merge(base_statistics const & s1, base_statistics const & s2)
|
||||||
{
|
{
|
||||||
// See https://stackoverflow.com/questions/1480626/merging-two-statistical-result-sets
|
// See https://stackoverflow.com/questions/1480626/merging-two-statistical-result-sets
|
||||||
|
|
@ -115,7 +142,7 @@ namespace psemek::util
|
||||||
std::size_t count() const { return base_.count(); }
|
std::size_t count() const { return base_.count(); }
|
||||||
T mean() const { return base_.mean(); }
|
T mean() const { return base_.mean(); }
|
||||||
T variance() const { return base_.variance(); }
|
T variance() const { return base_.variance(); }
|
||||||
T stddev() const { return std::sqrt(variance()); }
|
T stddev() const { return base_.stddev(); }
|
||||||
T min() const { return min_; }
|
T min() const { return min_; }
|
||||||
T max() const { return max_; }
|
T max() const { return max_; }
|
||||||
|
|
||||||
|
|
@ -149,19 +176,7 @@ namespace psemek::util
|
||||||
template <typename T>
|
template <typename T>
|
||||||
T statistics_lite<T>::percentile(double p) const
|
T statistics_lite<T>::percentile(double p) const
|
||||||
{
|
{
|
||||||
// Assume truncated normal distribution in the range [min, max]
|
return base_.percentile(p, min_, max_);
|
||||||
// which is the maximum-entropy distributioon on this range
|
|
||||||
// with specified mean and variance
|
|
||||||
// See
|
|
||||||
// https://en.wikipedia.org/wiki/Maximum_entropy_probability_distribution#Other_examples
|
|
||||||
// https://en.wikipedia.org/wiki/Truncated_normal_distribution
|
|
||||||
|
|
||||||
float const mu = mean();
|
|
||||||
float const sigma = stddev();
|
|
||||||
float const alpha = (min_ - mu) / sigma;
|
|
||||||
float const beta = (max_ - mu) / sigma;
|
|
||||||
|
|
||||||
return mu + sigma * detail::normal_cdf_inv(std::lerp(detail::normal_cdf(alpha), detail::normal_cdf(beta), p));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
|
@ -174,25 +189,124 @@ namespace psemek::util
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T, T Precision>
|
||||||
|
struct statistics_log_bucket
|
||||||
|
: statistics_lite<T>
|
||||||
|
{
|
||||||
|
// Only accepts non-negative values
|
||||||
|
void push(T const & value, std::size_t count = 1);
|
||||||
|
|
||||||
|
T percentile(double p) const;
|
||||||
|
|
||||||
|
statistics_lite<T> & lite() { return *this; }
|
||||||
|
statistics_lite<T> const & lite() const { return *this; }
|
||||||
|
|
||||||
|
template <typename H, H P>
|
||||||
|
friend statistics_log_bucket<H, P> merge(statistics_log_bucket<H, P> const & s1, statistics_log_bucket<H, P> const & s2);
|
||||||
|
|
||||||
|
private:
|
||||||
|
static constexpr T bucket_size = Precision / std::log(2.0);
|
||||||
|
|
||||||
|
std::size_t zero_count_ = 0;
|
||||||
|
util::spatial_array<detail::base_statistics<T>, 1, int> buckets_;
|
||||||
|
mutable util::spatial_array<std::size_t, 1, int> prefix_count_;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T, T Precision>
|
||||||
|
void statistics_log_bucket<T, Precision>::push(T const & value, std::size_t count)
|
||||||
|
{
|
||||||
|
lite().push(value, count);
|
||||||
|
if (value == T{0}) [[unlikely]]
|
||||||
|
++zero_count_;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int const bucket = std::floor(std::log2(value) / bucket_size);
|
||||||
|
buckets_.get(bucket).push(value, count);
|
||||||
|
}
|
||||||
|
|
||||||
|
prefix_count_.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, T Precision>
|
||||||
|
T statistics_log_bucket<T, Precision>::percentile(double p) const
|
||||||
|
{
|
||||||
|
if (prefix_count_.empty())
|
||||||
|
{
|
||||||
|
std::size_t count = zero_count_;
|
||||||
|
|
||||||
|
prefix_count_.get(buckets_.min(0)) = count;
|
||||||
|
for (int bucket = buckets_.min(0); bucket < buckets_.max(0); ++bucket)
|
||||||
|
{
|
||||||
|
count += buckets_.at(bucket).count();
|
||||||
|
prefix_count_.get(bucket + 1) = count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::size_t index = std::min<std::size_t>(this->count() - 1, this->count() * p);
|
||||||
|
|
||||||
|
auto it = std::lower_bound(prefix_count_.begin(), prefix_count_.end(), index);
|
||||||
|
|
||||||
|
if (it == prefix_count_.begin())
|
||||||
|
return T{0};
|
||||||
|
|
||||||
|
--it;
|
||||||
|
|
||||||
|
int bucket = buckets_.min(0) + (it - prefix_count_.begin());
|
||||||
|
|
||||||
|
double min = std::exp2(bucket * bucket_size);
|
||||||
|
double max = std::exp2((bucket + 1) * bucket_size);
|
||||||
|
double q = double(index - *it) / (*(it + 1) - *it);
|
||||||
|
|
||||||
|
return buckets_.at(bucket).percentile(q, min, max);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, T Precision>
|
||||||
|
std::ostream & operator << (std::ostream & os, statistics_log_bucket<T, Precision> const & s)
|
||||||
|
{
|
||||||
|
os << s.lite();
|
||||||
|
return os;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, T Precision>
|
||||||
|
statistics_log_bucket<T, Precision> merge(statistics_log_bucket<T, Precision> const & s1, statistics_log_bucket<T, Precision> const & s2)
|
||||||
|
{
|
||||||
|
statistics_log_bucket<T, Precision> result;
|
||||||
|
result.lite() = merge(s1.lite(), s2.lite());
|
||||||
|
|
||||||
|
for (int bucket = s1.buckets_.min(0); bucket < s1.buckets_.max(0); ++bucket)
|
||||||
|
{
|
||||||
|
auto const & source = s1.buckets_.at(bucket);
|
||||||
|
auto & target = result.buckets_.get(bucket);
|
||||||
|
target = merge(target, source);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int bucket = s2.buckets_.min(0); bucket < s2.buckets_.max(0); ++bucket)
|
||||||
|
{
|
||||||
|
auto const & source = s2.buckets_.at(bucket);
|
||||||
|
auto & target = result.buckets_.get(bucket);
|
||||||
|
target = merge(target, source);
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
struct statistics
|
struct statistics
|
||||||
: statistics_lite<T>
|
: statistics_lite<T>
|
||||||
{
|
{
|
||||||
void push(T const & value, std::size_t count = 1);
|
void push(T const & value, std::size_t count = 1);
|
||||||
|
|
||||||
std::size_t count() const { return values_.size(); }
|
|
||||||
|
|
||||||
T percentile(double p) const;
|
T percentile(double p) const;
|
||||||
|
|
||||||
template <typename H>
|
template <typename H>
|
||||||
friend statistics<H> merge(statistics<H> const & s1, statistics<H> const & s2);
|
friend statistics<H> merge(statistics<H> const & s1, statistics<H> const & s2);
|
||||||
|
|
||||||
|
statistics_lite<T> & lite() { return *this; }
|
||||||
|
statistics_lite<T> const & lite() const { return *this; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
mutable std::vector<T> values_;
|
mutable std::vector<T> values_;
|
||||||
mutable bool sorted_ = true;
|
mutable bool sorted_ = true;
|
||||||
|
|
||||||
statistics_lite<T> & lite() { return *this; }
|
|
||||||
statistics_lite<T> const & lite() const { return *this; }
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
|
@ -218,7 +332,7 @@ namespace psemek::util
|
||||||
template <typename T>
|
template <typename T>
|
||||||
std::ostream & operator << (std::ostream & os, statistics<T> const & s)
|
std::ostream & operator << (std::ostream & os, statistics<T> const & s)
|
||||||
{
|
{
|
||||||
os << "mean = " << s.mean() << ", dev = " << s.stddev() << ", range = [" << s.min() << " .. " << s.max() << "]";
|
os << s.lite();
|
||||||
return os;
|
return os;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue