|
| 1 | +// This module implements a Zipfian distribution generator. |
| 2 | +// |
| 3 | +// Based on https://github.com/jonhoo/rust-zipf. |
| 4 | + |
| 5 | +use rand::Rng; |
| 6 | + |
| 7 | +/// Random number generator that generates Zipf-distributed random numbers using rejection |
| 8 | +/// inversion. |
| 9 | +#[derive(Clone, Copy)] |
| 10 | +pub struct ZipfDistribution { |
| 11 | + /// Number of elements |
| 12 | + num_elements: f64, |
| 13 | + /// Exponent parameter of the distribution |
| 14 | + exponent: f64, |
| 15 | + /// `hIntegral(1.5) - 1}` |
| 16 | + h_integral_x1: f64, |
| 17 | + /// `hIntegral(num_elements + 0.5)}` |
| 18 | + h_integral_num_elements: f64, |
| 19 | + /// `2 - hIntegralInverse(hIntegral(2.5) - h(2)}` |
| 20 | + s: f64, |
| 21 | +} |
| 22 | + |
| 23 | +impl ZipfDistribution { |
| 24 | + /// Creates a new [Zipf-distributed](https://en.wikipedia.org/wiki/Zipf's_law) |
| 25 | + /// random number generator. |
| 26 | + /// |
| 27 | + /// Note that both the number of elements and the exponent must be greater than 0. |
| 28 | + pub fn new(num_elements: usize, exponent: f64) -> Result<Self, ()> { |
| 29 | + if num_elements == 0 { |
| 30 | + return Err(()); |
| 31 | + } |
| 32 | + if exponent <= 0f64 { |
| 33 | + return Err(()); |
| 34 | + } |
| 35 | + |
| 36 | + let z = ZipfDistribution { |
| 37 | + num_elements: num_elements as f64, |
| 38 | + exponent, |
| 39 | + h_integral_x1: ZipfDistribution::h_integral(1.5, exponent) - 1f64, |
| 40 | + h_integral_num_elements: ZipfDistribution::h_integral( |
| 41 | + num_elements as f64 + 0.5, |
| 42 | + exponent, |
| 43 | + ), |
| 44 | + s: 2f64 |
| 45 | + - ZipfDistribution::h_integral_inv( |
| 46 | + ZipfDistribution::h_integral(2.5, exponent) |
| 47 | + - ZipfDistribution::h(2f64, exponent), |
| 48 | + exponent, |
| 49 | + ), |
| 50 | + }; |
| 51 | + |
| 52 | + // populate cache |
| 53 | + |
| 54 | + Ok(z) |
| 55 | + } |
| 56 | +} |
| 57 | + |
| 58 | +impl ZipfDistribution { |
| 59 | + fn next<R: Rng + ?Sized>(&self, rng: &mut R) -> usize { |
| 60 | + // The paper describes an algorithm for exponents larger than 1 (Algorithm ZRI). |
| 61 | + // |
| 62 | + // The original method uses |
| 63 | + // H(x) = (v + x)^(1 - q) / (1 - q) |
| 64 | + // as the integral of the hat function. |
| 65 | + // |
| 66 | + // This function is undefined for q = 1, which is the reason for the limitation of the |
| 67 | + // exponent. |
| 68 | + // |
| 69 | + // If instead the integral function |
| 70 | + // H(x) = ((v + x)^(1 - q) - 1) / (1 - q) |
| 71 | + // is used, for which a meaningful limit exists for q = 1, the method works for all |
| 72 | + // positive exponents. |
| 73 | + // |
| 74 | + // The following implementation uses v = 0 and generates integral number in the range [1, |
| 75 | + // num_elements]. This is different to the original method where v is defined to |
| 76 | + // be positive and numbers are taken from [0, i_max]. This explains why the implementation |
| 77 | + // looks slightly different. |
| 78 | + |
| 79 | + let hnum = self.h_integral_num_elements; |
| 80 | + |
| 81 | + loop { |
| 82 | + use std::cmp; |
| 83 | + let u: f64 = hnum + rng.gen::<f64>() * (self.h_integral_x1 - hnum); |
| 84 | + // u is uniformly distributed in (h_integral_x1, h_integral_num_elements] |
| 85 | + |
| 86 | + let x: f64 = ZipfDistribution::h_integral_inv(u, self.exponent); |
| 87 | + |
| 88 | + // Limit k to the range [1, num_elements] if it would be outside |
| 89 | + // due to numerical inaccuracies. |
| 90 | + let k64 = x.max(1.0).min(self.num_elements); |
| 91 | + // float -> integer rounds towards zero, so we add 0.5 |
| 92 | + // to prevent bias towards k == 1 |
| 93 | + let k = cmp::max(1, (k64 + 0.5) as usize); |
| 94 | + |
| 95 | + // Here, the distribution of k is given by: |
| 96 | + // |
| 97 | + // P(k = 1) = C * (hIntegral(1.5) - h_integral_x1) = C |
| 98 | + // P(k = m) = C * (hIntegral(m + 1/2) - hIntegral(m - 1/2)) for m >= 2 |
| 99 | + // |
| 100 | + // where C = 1 / (h_integral_num_elements - h_integral_x1) |
| 101 | + if k64 - x <= self.s |
| 102 | + || u >= ZipfDistribution::h_integral(k64 + 0.5, self.exponent) |
| 103 | + - ZipfDistribution::h(k64, self.exponent) |
| 104 | + { |
| 105 | + // Case k = 1: |
| 106 | + // |
| 107 | + // The right inequality is always true, because replacing k by 1 gives |
| 108 | + // u >= hIntegral(1.5) - h(1) = h_integral_x1 and u is taken from |
| 109 | + // (h_integral_x1, h_integral_num_elements]. |
| 110 | + // |
| 111 | + // Therefore, the acceptance rate for k = 1 is P(accepted | k = 1) = 1 |
| 112 | + // and the probability that 1 is returned as random value is |
| 113 | + // P(k = 1 and accepted) = P(accepted | k = 1) * P(k = 1) = C = C / 1^exponent |
| 114 | + // |
| 115 | + // Case k >= 2: |
| 116 | + // |
| 117 | + // The left inequality (k - x <= s) is just a short cut |
| 118 | + // to avoid the more expensive evaluation of the right inequality |
| 119 | + // (u >= hIntegral(k + 0.5) - h(k)) in many cases. |
| 120 | + // |
| 121 | + // If the left inequality is true, the right inequality is also true: |
| 122 | + // Theorem 2 in the paper is valid for all positive exponents, because |
| 123 | + // the requirements h'(x) = -exponent/x^(exponent + 1) < 0 and |
| 124 | + // (-1/hInverse'(x))'' = (1+1/exponent) * x^(1/exponent-1) >= 0 |
| 125 | + // are both fulfilled. |
| 126 | + // Therefore, f(x) = x - hIntegralInverse(hIntegral(x + 0.5) - h(x)) |
| 127 | + // is a non-decreasing function. If k - x <= s holds, |
| 128 | + // k - x <= s + f(k) - f(2) is obviously also true which is equivalent to |
| 129 | + // -x <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)), |
| 130 | + // -hIntegralInverse(u) <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)), |
| 131 | + // and finally u >= hIntegral(k + 0.5) - h(k). |
| 132 | + // |
| 133 | + // Hence, the right inequality determines the acceptance rate: |
| 134 | + // P(accepted | k = m) = h(m) / (hIntegrated(m+1/2) - hIntegrated(m-1/2)) |
| 135 | + // The probability that m is returned is given by |
| 136 | + // P(k = m and accepted) = P(accepted | k = m) * P(k = m) |
| 137 | + // = C * h(m) = C / m^exponent. |
| 138 | + // |
| 139 | + // In both cases the probabilities are proportional to the probability mass |
| 140 | + // function of the Zipf distribution. |
| 141 | + |
| 142 | + return k; |
| 143 | + } |
| 144 | + } |
| 145 | + } |
| 146 | +} |
| 147 | + |
| 148 | +impl rand::distributions::Distribution<usize> for ZipfDistribution { |
| 149 | + fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> usize { |
| 150 | + self.next(rng) |
| 151 | + } |
| 152 | +} |
| 153 | + |
| 154 | +use std::fmt; |
| 155 | +impl fmt::Debug for ZipfDistribution { |
| 156 | + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { |
| 157 | + f.debug_struct("ZipfDistribution") |
| 158 | + .field("e", &self.exponent) |
| 159 | + .field("n", &self.num_elements) |
| 160 | + .finish() |
| 161 | + } |
| 162 | +} |
| 163 | + |
| 164 | +impl ZipfDistribution { |
| 165 | + /// Computes `H(x)`, defined as |
| 166 | + /// |
| 167 | + /// - `(x^(1 - exponent) - 1) / (1 - exponent)`, if `exponent != 1` |
| 168 | + /// - `log(x)`, if `exponent == 1` |
| 169 | + /// |
| 170 | + /// `H(x)` is an integral function of `h(x)`, the derivative of `H(x)` is `h(x)`. |
| 171 | + fn h_integral(x: f64, exponent: f64) -> f64 { |
| 172 | + let log_x = x.ln(); |
| 173 | + helper2((1f64 - exponent) * log_x) * log_x |
| 174 | + } |
| 175 | + |
| 176 | + /// Computes `h(x) = 1 / x^exponent` |
| 177 | + fn h(x: f64, exponent: f64) -> f64 { |
| 178 | + (-exponent * x.ln()).exp() |
| 179 | + } |
| 180 | + |
| 181 | + /// The inverse function of `H(x)`. |
| 182 | + /// Returns the `y` for which `H(y) = x`. |
| 183 | + fn h_integral_inv(x: f64, exponent: f64) -> f64 { |
| 184 | + let mut t: f64 = x * (1f64 - exponent); |
| 185 | + if t < -1f64 { |
| 186 | + // Limit value to the range [-1, +inf). |
| 187 | + // t could be smaller than -1 in some rare cases due to numerical errors. |
| 188 | + t = -1f64; |
| 189 | + } |
| 190 | + (helper1(t) * x).exp() |
| 191 | + } |
| 192 | +} |
| 193 | + |
| 194 | +/// Helper function that calculates `log(1 + x) / x`. |
| 195 | +/// A Taylor series expansion is used, if x is close to 0. |
| 196 | +fn helper1(x: f64) -> f64 { |
| 197 | + if x.abs() > 1e-8 { x.ln_1p() / x } else { 1f64 - x * (0.5 - x * (1.0 / 3.0 - 0.25 * x)) } |
| 198 | +} |
| 199 | + |
| 200 | +/// Helper function to calculate `(exp(x) - 1) / x`. |
| 201 | +/// A Taylor series expansion is used, if x is close to 0. |
| 202 | +fn helper2(x: f64) -> f64 { |
| 203 | + if x.abs() > 1e-8 { |
| 204 | + x.exp_m1() / x |
| 205 | + } else { |
| 206 | + 1f64 + x * 0.5 * (1f64 + x * 1.0 / 3.0 * (1f64 + 0.25 * x)) |
| 207 | + } |
| 208 | +} |
0 commit comments