Skip to content

Commit bef2465

Browse files
authored
Merge pull request #2 from lightstep/jmacd/godocex
Clean up the godoc
2 parents 0f1df4e + 68f1fab commit bef2465

7 files changed

+62
-14
lines changed

doc.go

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// Copyright 2019, LightStep Inc.
2+
3+
/*
4+
Package varopt is an implementation of VarOpt, an unbiased weighted
5+
sampling algorithm described in the paper "Stream sampling for
6+
variance-optimal estimation of subset sums"
7+
https://arxiv.org/pdf/0803.0473.pdf (2008), by Edith Cohen, Nick
8+
Duffield, Haim Kaplan, Carsten Lund, and Mikkel Thorup.
9+
10+
VarOpt is a reservoir-type sampler that maintains a fixed-size sample
11+
and provides a mechanism for merging unequal-weight samples.
12+
13+
This package also includes a simple reservoir sampling algorithm,
14+
often useful in conjunction with weighted reservoir sampling, using
15+
Algorithm R from "Random sampling with a
16+
reservoir", https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R
17+
(1985), by Jeffrey Vitter.
18+
19+
See https://github.com/lightstep/varopt/blob/master/README.md for
20+
more detail.
21+
*/
22+
package varopt

frequency_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ var colors = []curve{
3535
// While the number of expected points per second is uniform, the
3636
// output sample weights are expected to match the original
3737
// frequencies.
38-
func ExampleFrequency() {
38+
func ExampleVaropt_GetOriginalWeight() {
3939
// Number of points.
4040
const totalCount = 1e6
4141

simple.go

+14-7
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,20 @@ type Simple struct {
1313
capacity int
1414
observed int
1515
buffer []Sample
16+
rnd *rand.Rand
1617
}
1718

18-
func NewSimple(capacity int) *Simple {
19+
// NewSimple returns a simple reservoir sampler with given capacity
20+
// (i.e., reservoir size) and random number generator.
21+
func NewSimple(capacity int, rnd *rand.Rand) *Simple {
1922
return &Simple{
2023
capacity: capacity,
24+
rnd: rnd,
2125
}
2226
}
2327

28+
// Add considers a new observation for the sample. Items have unit
29+
// weight.
2430
func (s *Simple) Add(span Sample) {
2531
s.observed++
2632

@@ -34,28 +40,29 @@ func (s *Simple) Add(span Sample) {
3440
}
3541

3642
// Give this a capacity/observed chance of replacing an existing entry.
37-
index := rand.Intn(s.observed)
43+
index := s.rnd.Intn(s.observed)
3844
if index < s.capacity {
3945
s.buffer[index] = span
4046
}
4147
}
4248

49+
// Get returns the i'th selected item from the sample.
4350
func (s *Simple) Get(i int) Sample {
4451
return s.buffer[i]
4552
}
4653

54+
// Size returns the number of items in the sample. If the reservoir is
55+
// full, Size() equals Capacity().
4756
func (s *Simple) Size() int {
4857
return len(s.buffer)
4958
}
5059

60+
// Weight returns the adjusted weight of each item in the sample.
5161
func (s *Simple) Weight() float64 {
5262
return float64(s.observed) / float64(s.Size())
5363
}
5464

55-
func (s *Simple) Prob() float64 {
56-
return 1 / s.Weight()
57-
}
58-
59-
func (s *Simple) Observed() int {
65+
// Count returns the number of items that were observed.
66+
func (s *Simple) Count() int {
6067
return s.observed
6168
}

simple_test.go

+4-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
package varopt_test
44

55
import (
6+
"math/rand"
67
"testing"
78

89
"github.com/lightstep/varopt"
@@ -19,7 +20,9 @@ func TestSimple(t *testing.T) {
1920
epsilon = 0.01
2021
)
2122

22-
ss := varopt.NewSimple(sampleSize)
23+
rnd := rand.New(rand.NewSource(17167))
24+
25+
ss := varopt.NewSimple(sampleSize, rnd)
2326

2427
psum := 0.
2528
for i := 0; i < popSize; i++ {

varopt.go

+18
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ type Varopt struct {
3636
totalWeight float64
3737
}
3838

39+
// Sample is an empty interface that represents a sample item.
40+
// Sampling algorithms treat these as opaque, as their weight is
41+
// passed in separately.
3942
type Sample interface{}
4043

4144
type vsample struct {
@@ -45,13 +48,16 @@ type vsample struct {
4548

4649
type largeHeap []vsample
4750

51+
// New returns a new Varopt sampler with given capacity (i.e.,
52+
// reservoir size) and random number generator.
4853
func New(capacity int, rnd *rand.Rand) *Varopt {
4954
return &Varopt{
5055
capacity: capacity,
5156
rnd: rnd,
5257
}
5358
}
5459

60+
// Add considers a new observation for the sample with given weight.
5561
func (s *Varopt) Add(sample Sample, weight float64) {
5662
individual := vsample{
5763
sample: sample,
@@ -131,6 +137,9 @@ func (s *Varopt) Get(i int) (Sample, float64) {
131137
return s.T[i-len(s.L)].sample, s.tau
132138
}
133139

140+
// GetOriginalWeight returns the original input weight of the sample
141+
// item that was passed to Add(). This can be useful for computing a
142+
// frequency from the adjusted sample weight.
134143
func (s *Varopt) GetOriginalWeight(i int) float64 {
135144
if i < len(s.L) {
136145
return s.L[i].weight
@@ -139,22 +148,31 @@ func (s *Varopt) GetOriginalWeight(i int) float64 {
139148
return s.T[i-len(s.L)].weight
140149
}
141150

151+
// Capacity returns the size of the reservoir. This is the maximum
152+
// size of the sample.
142153
func (s *Varopt) Capacity() int {
143154
return s.capacity
144155
}
145156

157+
// Size returns the current number of items in the sample. If the
158+
// reservoir is full, this returns Capacity().
146159
func (s *Varopt) Size() int {
147160
return len(s.L) + len(s.T)
148161
}
149162

163+
// TotalWeight returns the sum of weights that were passed to Add().
150164
func (s *Varopt) TotalWeight() float64 {
151165
return s.totalWeight
152166
}
153167

168+
// TotalCount returns the number of calls to Add().
154169
func (s *Varopt) TotalCount() int {
155170
return s.totalCount
156171
}
157172

173+
// Tau returns the current large-weight threshold. Weights larger
174+
// than Tau() carry their exact weight in the sample. See the VarOpt
175+
// paper for details.
158176
func (s *Varopt) Tau() float64 {
159177
return s.tau
160178
}

varopt_test.go

+2-4
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,7 @@ const (
2323
sampleProb = 0.001
2424
sampleSize int = popSize * sampleProb
2525

26-
// TODO epsilon is somewhat variable b/c we're using the
27-
// static rand w/o a fixed seed for the test.
28-
epsilon = 0.06
26+
epsilon = 0.08
2927
)
3028

3129
func TestUnbiased(t *testing.T) {
@@ -108,7 +106,7 @@ func testUnbiased(t *testing.T, bbr, bsr float64) {
108106

109107
for _, blockList := range blockLists {
110108
for _, block := range blockList {
111-
simple := varopt.NewSimple(sampleSize)
109+
simple := varopt.NewSimple(sampleSize, rnd)
112110

113111
for _, s := range block {
114112
simple.Add(s)

weighted_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ type packet struct {
1616
protocol string
1717
}
1818

19-
func ExampleWeighted() {
19+
func ExampleNew() {
2020
const totalPackets = 1e6
2121
const sampleRatio = 0.01
2222

0 commit comments

Comments
 (0)