Initial checkin

jmacd · jmacd · commit cc959e4e6355 · 2019-11-03T08:50:12.000-08:00
diff --git a/README.md b/README.md
@@ -0,0 +1,3 @@
+This is an implementation of VarOpt, an unbiased weighted sampling
+algorithm described in the paper [Stream sampling for variance-optimal
+estimation of subset sums](https://arxiv.org/pdf/0803.0473.pdf).
diff --git a/varopt.go b/varopt.go
@@ -0,0 +1,181 @@
+// Stream sampling for variance-optimal estimation of subset sums
+// Edith Cohen, Nick Duffield, Haim Kaplan, Carsten Lund, Mikkel Thorup
+// 2008
+// https://arxiv.org/pdf/0803.0473.pdf
+
+package varopt
+
+import (
+	"container/heap"
+	"fmt"
+	"math/rand"
+)
+
+// Varopt is the state of a VarOpt weighted sampler with a fixed
+// capacity. Create one with NewVaropt or InitVaropt and feed it
+// through Add.
+type Varopt struct {
+	// Large-weight items, kept as a min-heap ordered by weight. Add
+	// places every item here until the sampler is full, and afterwards
+	// any new item whose weight exceeds tau.
+	L largeHeap
+
+	// Light-weight items. Get reports tau as the adjusted weight of
+	// each of these; the weight originally passed to Add is still
+	// stored alongside the sample.
+	T []vsample
+
+	// Temporary buffer of candidates used inside Add; it is emptied
+	// again before Add returns.
+	X []vsample
+
+	// Current threshold, recomputed by Add once the sampler is full.
+	tau float64
+
+	// Maximum number of items retained; Add keeps every item while
+	// Size() is below this value.
+	capacity int
+
+	// Number of items and sum of weights accepted by Add so far.
+	totalCount  int
+	totalWeight float64
+}
+
+// vsample pairs a sample with the weight it was given when it was
+// passed to Add.
+type vsample struct {
+	sample Sample
+	weight float64
+}
+
+// largeHeap is a slice of vsample used with container/heap. Its Less
+// method orders by ascending weight, so index 0 holds the smallest
+// weight once the heap invariant is established.
+type largeHeap []vsample
+
+// NewVaropt allocates a sampler that retains at most capacity items
+// and returns a pointer to it. The state is the same as the value
+// produced by InitVaropt(capacity).
+func NewVaropt(capacity int) *Varopt {
+	sampler := new(Varopt)
+	*sampler = InitVaropt(capacity)
+	return sampler
+}
+
+func InitVaropt(capacity int) Varopt {
+	return Varopt{
+		capacity: capacity,
+	}
+}
+
+// Add offers sample, carrying the given weight, to the reservoir. It
+// follows Algorithm 1 of the VarOpt paper cited at the top of this
+// file.
+//
+// Add panics if weight is not strictly positive. Until Size() reaches
+// the configured capacity every sample is retained. After that, each
+// call forms a candidate set out of the new item and any large items
+// that no longer exceed the threshold, recomputes tau, and discards
+// exactly one item so that Size() is unchanged.
+func (s *Varopt) Add(sample Sample, weight float64) {
+	individual := vsample{
+		sample: sample,
+		weight: weight,
+	}
+
+	if weight <= 0 {
+		panic(fmt.Sprint("Invalid weight <= 0: ", weight))
+	}
+
+	s.totalCount++
+	s.totalWeight += weight
+
+	// Below capacity: keep everything in the heap.
+	if s.Size() < s.capacity {
+		heap.Push(&s.L, individual)
+		return
+	}
+
+	// The X <- {} step from the paper is not done here, but rather at
+	// the bottom of the function, where X is emptied for the next call.
+
+	// W accumulates the combined weight of T (every member counts as
+	// tau) plus the weight of the candidates collected in X.
+	W := s.tau * float64(len(s.T))
+
+	if weight > s.tau {
+		heap.Push(&s.L, individual)
+	} else {
+		s.X = append(s.X, individual)
+		W += weight
+	}
+
+	// Move the smallest large items into X for as long as they would
+	// not exceed the threshold implied by the current W.
+	for len(s.L) > 0 && W >= float64(len(s.T)+len(s.X)-1)*s.L[0].weight {
+		h := heap.Pop(&s.L).(vsample)
+		s.X = append(s.X, h)
+		W += h.weight
+	}
+
+	s.tau = W / float64(len(s.T)+len(s.X)-1)
+	r := s.uniform()
+	d := 0
+
+	// Walk X, spending r on each candidate's drop probability
+	// 1 - w/tau. If r goes negative, the candidate just visited is the
+	// one to drop.
+	for d < len(s.X) && r >= 0 {
+		wxd := s.X[d].weight
+		r -= (1 - wxd/s.tau)
+		d++
+	}
+	if r < 0 || len(s.T) == 0 {
+		// d was already advanced past the chosen candidate, so the
+		// victim is X[d-1], not X[d]. The empty-T case covers r not
+		// ending up negative (rounding, or a non-finite tau) when
+		// there is nothing in T to evict: rand.Intn(0) would panic, so
+		// the last candidate visited is dropped instead.
+		last := len(s.X) - 1
+		s.X[d-1], s.X[last] = s.X[last], s.X[d-1]
+		s.X = s.X[:last]
+	} else {
+		// No candidate was chosen: evict a uniformly random member
+		// of T.
+		ti := rand.Intn(len(s.T))
+		s.T[ti], s.T[len(s.T)-1] = s.T[len(s.T)-1], s.T[ti]
+		s.T = s.T[:len(s.T)-1]
+	}
+	s.T = append(s.T, s.X...)
+	s.X = s.X[:0]
+}
+
+// uniform draws from rand.Float64 until the result is non-zero and
+// returns that value, so the caller never sees exactly 0.
+func (s *Varopt) uniform() float64 {
+	r := rand.Float64()
+	for r == 0.0 {
+		r = rand.Float64()
+	}
+	return r
+}
+
+// Get returns the i'th retained sample together with its adjusted
+// weight. Indexes below len(L) address the large items, whose
+// adjusted weight is the weight they were added with; the remaining
+// indexes address the light items in T, whose adjusted weight is the
+// current threshold tau. To obtain a sample's original weight (i.e.
+// what was passed to Add), use GetOriginalWeight(i).
+func (s *Varopt) Get(i int) (Sample, float64) {
+	large := len(s.L)
+	if i >= large {
+		return s.T[i-large].sample, s.tau
+	}
+
+	item := s.L[i]
+	return item.sample, item.weight
+}
+
+// GetOriginalWeight returns the weight that was passed to Add for the
+// i'th retained sample, using the same indexing as Get: large items
+// first, then the light items in T.
+func (s *Varopt) GetOriginalWeight(i int) float64 {
+	large := len(s.L)
+	if i >= large {
+		return s.T[i-large].weight
+	}
+
+	return s.L[i].weight
+}
+
+// Capacity returns the capacity the sampler was constructed with.
+func (s *Varopt) Capacity() int {
+	return s.capacity
+}
+
+// Size returns the number of samples currently retained, counting
+// both the large items in L and the light items in T. Valid indexes
+// for Get and GetOriginalWeight are 0 through Size()-1.
+func (s *Varopt) Size() int {
+	return len(s.L) + len(s.T)
+}
+
+// TotalWeight returns the sum of the weights of every item accepted
+// by Add, whether or not the item is still retained.
+func (s *Varopt) TotalWeight() float64 {
+	return s.totalWeight
+}
+
+// TotalCount returns the number of items accepted by Add, whether or
+// not they are still retained.
+func (s *Varopt) TotalCount() int {
+	return s.totalCount
+}
+
+// Tau returns the current threshold, which is also the adjusted
+// weight Get reports for the light items. It is zero until Add has
+// been called on a full sampler.
+func (s *Varopt) Tau() float64 {
+	return s.tau
+}
+
+// Len returns the number of items in the heap.
+func (b largeHeap) Len() int {
+	return len(b)
+}
+
+// Swap exchanges the items at indexes i and j.
+func (b largeHeap) Swap(i, j int) {
+	b[i], b[j] = b[j], b[i]
+}
+
+// Less reports whether item i has a strictly smaller weight than item
+// j, which makes container/heap treat largeHeap as a min-heap on
+// weight.
+func (b largeHeap) Less(i, j int) bool {
+	return b[i].weight < b[j].weight
+}
+
+// Push appends x, which must be a vsample, to the end of the slice.
+// It is meant to be called by container/heap rather than directly.
+func (b *largeHeap) Push(x interface{}) {
+	item := x.(vsample)
+	*b = append(*b, item)
+}
+
+// Pop removes the final element of the slice and returns it. It is
+// meant to be called by container/heap rather than directly.
+func (b *largeHeap) Pop() interface{} {
+	items := *b
+	last := len(items) - 1
+	item := items[last]
+	*b = items[:last]
+	return item
+}
diff --git a/varopt_test.go b/varopt_test.go
@@ -0,0 +1,146 @@
+package varopt_test
+
+import (
+	"math"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+// The population is split into blocks of two unequal sizes ("small"
+// and "big"). It contains odd and even numbers in equal amount, and
+// last digits 0-9 in equal amount.
+//
+// Much like simple_test.go, we will test the mean is correct and,
+// because unbiased, also the odd/even and last-digit-0-9 groupings
+// will be balanced.
+//
+//   - numBlocks is the total number of blocks (big plus small).
+//   - popSize is the number of items in the population.
+//   - sampleProb is the fraction of the population to sample.
+//   - sampleSize (popSize * sampleProb) is the capacity handed to both
+//     NewSimple and NewVaropt.
+//   - epsilon is the relative error tolerated by require.InEpsilon.
+const (
+	numBlocks      = 100
+	popSize        = 1e7
+	sampleProb     = 0.001
+	sampleSize int = popSize * sampleProb
+
+	// TODO epsilon is somewhat variable b/c we're using the
+	// static rand w/o a fixed seed for the test.
+	epsilon = 0.06
+)
+
+func TestUnbiased(t *testing.T) {
+	var (
+		// Ratio of big blocks to small blocks
+		bigBlockRatios = []float64{0.1, 0.3, 0.5, 0.7, 0.9, 1.0}
+		// Ratio of big block size to small block size
+		bigSizeRatios = []float64{0.1, 0.2, 0.4}
+	)
+
+	for _, bbr := range bigBlockRatios {
+		for _, bsr := range bigSizeRatios {
+			testUnbiased(t, bbr, bsr)
+		}
+	}
+}
+
+// testUnbiased checks that a Varopt sample built from per-block
+// samples estimates the population sum without bias.
+//
+// The population is the integers [0, popSize), laid out as numSmall
+// small blocks followed by numBig big blocks. Each block is reduced
+// with NewSimple(sampleSize), and every item that sampler retains is
+// added to a Varopt sampler with the block's simple.Weight(). The
+// weighted sum of the final Varopt sample must be within epsilon of
+// the true sum, and the odd and even counts must be within epsilon of
+// each other. The whole procedure runs twice: big blocks first, then
+// small blocks first.
+//
+// bbr is the fraction of numBlocks that are big blocks; bsr is the
+// ratio smallSize/bigSize (bigSize is computed as smallSize / bsr).
+//
+// NOTE(review): Sample and NewSimple are not defined in this file;
+// NewSimple is presumably the uniform sampler covered by
+// simple_test.go -- confirm.
+func testUnbiased(t *testing.T, bbr, bsr float64) {
+	var (
+		numBig   = int(numBlocks * bbr)
+		numSmall = numBlocks - numBig
+
+		// Total population measured in units of one small block, so
+		// that popSize / factor is the small block size.
+		factor = float64(numBig)/bsr + float64(numSmall)
+
+		smallSize = int(popSize / factor)
+		bigSize   = int(float64(smallSize) / bsr)
+
+		// Items left over after integer truncation; they are added to
+		// the first big block below.
+		extra = popSize - bigSize*numBig - smallSize*numSmall
+	)
+
+	population := make([]Sample, popSize)
+
+	// psum is the exact sum of the population, the value the sample
+	// is expected to estimate.
+	psum := 0.0
+
+	for i := range population {
+		population[i] = i
+		psum += float64(i)
+	}
+
+	// Note: We're leaving the data unshuffled (in ascending order) to
+	// prove lack of bias
+	// rand.Shuffle(len(population), func(i, j int) {
+	// 	population[i], population[j] = population[j], population[i]
+	// })
+
+	smallBlocks := make([][]Sample, numSmall)
+	bigBlocks := make([][]Sample, numBig)
+
+	for i := 0; i < numSmall; i++ {
+		smallBlocks[i] = make([]Sample, smallSize)
+	}
+	for i := 0; i < numBig; i++ {
+		if i == 0 {
+			bigBlocks[0] = make([]Sample, bigSize+extra)
+		} else {
+			bigBlocks[i] = make([]Sample, bigSize)
+		}
+	}
+
+	// Deal the population out in order: small blocks take the lowest
+	// values, big blocks the rest.
+	pos := 0
+	for i := 0; i < numSmall; i++ {
+		for j := 0; j < len(smallBlocks[i]); j++ {
+			smallBlocks[i][j] = population[pos]
+			pos++
+		}
+	}
+	for i := 0; i < numBig; i++ {
+		for j := 0; j < len(bigBlocks[i]); j++ {
+			bigBlocks[i][j] = population[pos]
+			pos++
+		}
+	}
+	// Every population item must have landed in exactly one block.
+	require.Equal(t, len(population), pos)
+
+	// NOTE(review): maxDiff is accumulated below but never asserted
+	// on or logged.
+	maxDiff := 0.0
+
+	// Each argument is one ordering of the block lists; a fresh Varopt
+	// sampler is built and checked for each ordering.
+	func(allBlockLists ...[][][]Sample) {
+		for _, blockLists := range allBlockLists {
+			varopt := NewVaropt(sampleSize)
+
+			for _, blockList := range blockLists {
+				for _, block := range blockList {
+					simple := NewSimple(sampleSize)
+
+					for _, s := range block {
+						simple.Add(s)
+					}
+
+					// Every item retained for this block enters the
+					// Varopt sampler with the same weight.
+					weight := simple.Weight()
+					for i := 0; i < simple.Size(); i++ {
+						varopt.Add(simple.Get(i), weight)
+					}
+				}
+			}
+
+			vsum := 0.0
+			odd := 0.0
+			even := 0.0
+
+			for i := 0; i < varopt.Size(); i++ {
+				v, w := varopt.Get(i)
+				vi := v.(int)
+				if vi%2 == 0 {
+					even++
+				} else {
+					odd++
+				}
+
+				// w is the adjusted weight, so vsum estimates psum.
+				vsum += w * float64(vi)
+			}
+
+			diff := math.Abs(vsum-psum) / psum
+			maxDiff = math.Max(maxDiff, diff)
+
+			// NOTE(review): testify's InEpsilon takes (expected,
+			// actual); here the estimate vsum is in the expected
+			// position -- confirm this is intended.
+			require.InEpsilon(t, vsum, psum, epsilon)
+			require.InEpsilon(t, odd, even, epsilon)
+		}
+	}(
+		[][][]Sample{bigBlocks, smallBlocks},
+		[][][]Sample{smallBlocks, bigBlocks},
+	)
+}

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+This is an implementation of VarOpt, an unbiased weighted sampling`
	`2`	`+algorithm described in the paper [Stream sampling for variance-optimal`
	`3`	`+estimation of subset sums](https://arxiv.org/pdf/0803.0473.pdf).`