From d83922c94c120a5396486e916bce4cc1fb03f9b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cyrille=20Ch=C3=A9p=C3=A9lov=20=28TP12=29?= Date: Mon, 17 Oct 2016 17:03:40 +0200 Subject: [PATCH] Add Ring[BigDecimal], modeled after Ring[BigInt] (Added a custom generator for BigDecimals, to reduce somewhat the risk of underflow) --- .../algebird/benchmark/CMSBenchmark.scala | 21 +++++++++++++++++-- .../algebird/benchmark/TopCMSBenchmark.scala | 20 +++++++++++++++++- .../com/twitter/algebird/CMSHasher.scala | 16 ++++++++++++++ .../scala/com/twitter/algebird/Group.scala | 1 + .../scala/com/twitter/algebird/Monoid.scala | 1 + .../scala/com/twitter/algebird/Ring.scala | 2 ++ .../com/twitter/algebird/Semigroup.scala | 1 + .../com/twitter/algebird/BaseProperties.scala | 12 +++++++++-- .../com/twitter/algebird/BatchedTest.scala | 5 +++++ .../twitter/algebird/CountMinSketchTest.scala | 14 +++++++++++++ .../com/twitter/algebird/FromIntLike.scala | 4 ++++ 11 files changed, 92 insertions(+), 5 deletions(-) diff --git a/algebird-benchmark/src/main/scala/com/twitter/algebird/benchmark/CMSBenchmark.scala b/algebird-benchmark/src/main/scala/com/twitter/algebird/benchmark/CMSBenchmark.scala index 10256d25e..02d59909f 100644 --- a/algebird-benchmark/src/main/scala/com/twitter/algebird/benchmark/CMSBenchmark.scala +++ b/algebird-benchmark/src/main/scala/com/twitter/algebird/benchmark/CMSBenchmark.scala @@ -13,7 +13,6 @@ import CMSFunctions.generateHashes * We benchmark different `K` types as well as different input data streams. */ object CMSBenchmark { - import CMSHasherImplicits.CMSHasherBigInt @State(Scope.Benchmark) @@ -36,24 +35,34 @@ object CMSBenchmark { var smallLongs: Vector[Long] = _ var smallBigInts: Vector[BigInt] = _ var largeBigInts: Vector[BigInt] = _ + var smallBigDecimals: Vector[BigDecimal] = _ + var largeBigDecimals: Vector[BigDecimal] = _ var largeStrings: Vector[String] = _ // need to initialize later because we don't have `eps` and `delta` yet. 
var longMonoid: CMSMonoid[Long] = _ var bigIntMonoid: CMSMonoid[BigInt] = _ + var bigDecimalMonoid: CMSMonoid[BigDecimal] = _ var stringMonoid: CMSMonoid[String] = _ @Setup(Level.Trial) def setup(): Unit = { longMonoid = CMS.monoid[Long](eps, delta, Seed) bigIntMonoid = CMS.monoid[BigInt](eps, delta, Seed) + bigDecimalMonoid = CMS.monoid[BigDecimal](eps, delta, Seed) stringMonoid = CMS.monoid[String](eps, delta, Seed) val bitsPerChar = 16 largeStrings = (1 to size).map(i => nextString(MaxBits / bitsPerChar)).toVector - largeBigInts = largeStrings.map(s => BigInt(s.getBytes)).toVector + largeBigInts = largeStrings.map(s => BigInt(s.getBytes)) + largeBigDecimals = largeStrings.map(s => { + val md = (s.head % 256) - 128 + BigDecimal(BigInt(s.tail.getBytes)) * BigDecimal(10).pow(md) + }) + smallLongs = (1 to size).map(_.toLong).toVector smallBigInts = (1 to size).map(BigInt(_)).toVector + smallBigDecimals = (1 to size).map(BigDecimal(_) + BigDecimal(10).pow(-size)).toVector } } @@ -77,6 +86,14 @@ class CMSBenchmark { def sumLargeBigIntCms(st: CMSState): CMS[BigInt] = sumCmsVector(st.largeBigInts, st.bigIntMonoid) + @Benchmark + def sumSmallBigDecimalCms(st: CMSState): CMS[BigDecimal] = + sumCmsVector(st.smallBigDecimals, st.bigDecimalMonoid) + + @Benchmark + def sumLargeBigDecimalCms(st: CMSState): CMS[BigDecimal] = + sumCmsVector(st.largeBigDecimals, st.bigDecimalMonoid) + @Benchmark def sumLargeStringCms(st: CMSState): CMS[String] = sumCmsVector(st.largeStrings, st.stringMonoid) diff --git a/algebird-benchmark/src/main/scala/com/twitter/algebird/benchmark/TopCMSBenchmark.scala b/algebird-benchmark/src/main/scala/com/twitter/algebird/benchmark/TopCMSBenchmark.scala index 9fb80a6aa..e059e7889 100644 --- a/algebird-benchmark/src/main/scala/com/twitter/algebird/benchmark/TopCMSBenchmark.scala +++ b/algebird-benchmark/src/main/scala/com/twitter/algebird/benchmark/TopCMSBenchmark.scala @@ -36,23 +36,33 @@ object TopCMSBenchmark { var smallLongs: Vector[Long] = _ var 
smallBigInts: Vector[BigInt] = _ var largeBigInts: Vector[BigInt] = _ + var smallBigDecimals: Vector[BigDecimal] = _ + var largeBigDecimals: Vector[BigDecimal] = _ var largeStrings: Vector[String] = _ var cmsLongMonoid: TopPctCMSMonoid[Long] = _ var cmsBigIntMonoid: TopPctCMSMonoid[BigInt] = _ + var cmsBigDecimalMonoid: TopPctCMSMonoid[BigDecimal] = _ var cmsStringMonoid: TopPctCMSMonoid[String] = _ @Setup(Level.Trial) def setup(): Unit = { cmsLongMonoid = TopPctCMS.monoid[Long](eps, delta, Seed, pct) cmsBigIntMonoid = TopPctCMS.monoid[BigInt](eps, delta, Seed, pct) + cmsBigDecimalMonoid = TopPctCMS.monoid[BigDecimal](eps, delta, Seed, pct) cmsStringMonoid = TopPctCMS.monoid[String](eps, delta, Seed, pct) val bitsPerChar = 16 largeStrings = (1 to size).map(i => nextString(MaxBits / bitsPerChar)).toVector - largeBigInts = largeStrings.map(s => BigInt(s.getBytes)).toVector + largeBigInts = largeStrings.map(s => BigInt(s.getBytes)) + largeBigDecimals = largeStrings.map(s => { + val md = (s.head % 256) - 128 + BigDecimal(BigInt(s.tail.getBytes)) * BigDecimal(10).pow(md) + }) + smallLongs = (1 to size).map(_.toLong).toVector smallBigInts = (1 to size).map(BigInt(_)).toVector + smallBigDecimals = (1 to size).map(BigDecimal(_) + BigDecimal(10).pow(-size)).toVector } } @@ -75,6 +85,14 @@ class TopCMSBenchmark { def sumLargeBigIntTopCms(st: CMSState) = sumTopCmsVector(st.largeBigInts, st.cmsBigIntMonoid) + @Benchmark + def sumSmallBigDecimalTopCms(st: CMSState) = + sumTopCmsVector(st.smallBigDecimals, st.cmsBigDecimalMonoid) + + @Benchmark + def sumLargeBigDecimalTopCms(st: CMSState) = + sumTopCmsVector(st.largeBigDecimals, st.cmsBigDecimalMonoid) + @Benchmark def sumLargeStringTopCms(st: CMSState) = sumTopCmsVector(st.largeStrings, st.cmsStringMonoid) diff --git a/algebird-core/src/main/scala/com/twitter/algebird/CMSHasher.scala b/algebird-core/src/main/scala/com/twitter/algebird/CMSHasher.scala index e2e4bf3d7..d42b8b327 100644 --- 
a/algebird-core/src/main/scala/com/twitter/algebird/CMSHasher.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/CMSHasher.scala @@ -138,4 +138,20 @@ object CMSHasher { override def hash(a: Int, b: Int, width: Int)(x: Array[Byte]): Int = hashBytes(a, b, width)(x) } + // Note: CMSHasher[BigInt] not provided here but in CMSHasherImplicits for legacy support reasons. New hashers + // should come here. + + implicit object CMSHasherBigDecimal extends CMSHasher[BigDecimal] { + override def hash(a: Int, b: Int, width: Int)(x: BigDecimal): Int = { + // Canonicalise first: BigDecimal equality ignores trailing zeros (1.0 == 1.00), so equal keys must hash alike. + val canonical = if (x.signum == 0) java.math.BigDecimal.ZERO else x.underlying.stripTrailingZeros + val uh = scala.util.hashing.MurmurHash3.arrayHash(canonical.unscaledValue.toByteArray, a) + val hash = scala.util.hashing.MurmurHash3.productHash((uh, canonical.scale), a) + // We only want positive integers for the subsequent modulo. This method mimics Java's Hashtable + // implementation. The Java code uses `0x7FFFFFFF` for the bit-wise AND, which is equal to Int.MaxValue. + val positiveHash = hash & Int.MaxValue + positiveHash % width + } + } + } diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Group.scala b/algebird-core/src/main/scala/com/twitter/algebird/Group.scala index 016010a60..14332fdda 100755 --- a/algebird-core/src/main/scala/com/twitter/algebird/Group.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Group.scala @@ -108,6 +108,7 @@ object Group extends GeneratedGroupImplicits with ProductGroups { implicit val jshortGroup: Group[JShort] = JShortRing implicit val longGroup: Group[Long] = LongRing implicit val bigIntGroup: Group[BigInt] = BigIntRing + implicit val bigDecimalGroup: Group[BigDecimal] = BigDecimalRing implicit val jlongGroup: Group[JLong] = JLongRing implicit val floatGroup: Group[Float] = FloatField implicit val jfloatGroup: Group[JFloat] = JFloatField diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Monoid.scala b/algebird-core/src/main/scala/com/twitter/algebird/Monoid.scala index 1fffa0a6e..ac0c8e9b1 100755 --- 
a/algebird-core/src/main/scala/com/twitter/algebird/Monoid.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Monoid.scala @@ -270,6 +270,7 @@ object Monoid extends GeneratedMonoidImplicits with ProductMonoids { implicit val shortMonoid: Monoid[Short] = ShortRing implicit val jshortMonoid: Monoid[JShort] = JShortRing implicit val bigIntMonoid: Monoid[BigInt] = BigIntRing + implicit val bigDecimalMonoid: Monoid[BigDecimal] = BigDecimalRing implicit val longMonoid: Monoid[Long] = LongRing implicit val jlongMonoid: Monoid[JLong] = JLongRing implicit val floatMonoid: Monoid[Float] = FloatField diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Ring.scala b/algebird-core/src/main/scala/com/twitter/algebird/Ring.scala index 5aee55566..4657211de 100755 --- a/algebird-core/src/main/scala/com/twitter/algebird/Ring.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Ring.scala @@ -113,6 +113,7 @@ object LongRing extends Ring[Long] { } object BigIntRing extends NumericRing[BigInt] +object BigDecimalRing extends NumericRing[BigDecimal] object Ring extends GeneratedRingImplicits with ProductRings { // This pattern is really useful for typeclasses @@ -139,6 +140,7 @@ object Ring extends GeneratedRingImplicits with ProductRings { implicit val jshortRing: Ring[JShort] = JShortRing implicit val longRing: Ring[Long] = LongRing implicit val bigIntRing: Ring[BigInt] = BigIntRing + implicit val bigDecimalRing: Ring[BigDecimal] = BigDecimalRing implicit val jlongRing: Ring[JLong] = JLongRing implicit val floatRing: Ring[Float] = FloatField implicit val jfloatRing: Ring[JFloat] = JFloatField diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Semigroup.scala b/algebird-core/src/main/scala/com/twitter/algebird/Semigroup.scala index 56bd3067e..6b7565c48 100755 --- a/algebird-core/src/main/scala/com/twitter/algebird/Semigroup.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/Semigroup.scala @@ -148,6 +148,7 @@ object Semigroup extends 
GeneratedSemigroupImplicits with ProductSemigroups { implicit val jshortSemigroup: Semigroup[JShort] = JShortRing implicit val longSemigroup: Semigroup[Long] = LongRing implicit val bigIntSemigroup: Semigroup[BigInt] = BigIntRing + implicit val bigDecimalSemigroup: Semigroup[BigDecimal] = BigDecimalRing implicit val jlongSemigroup: Semigroup[JLong] = JLongRing implicit val floatSemigroup: Semigroup[Float] = FloatField implicit val jfloatSemigroup: Semigroup[JFloat] = JFloatField diff --git a/algebird-test/src/main/scala/com/twitter/algebird/BaseProperties.scala b/algebird-test/src/main/scala/com/twitter/algebird/BaseProperties.scala index 6594d8c8d..f501bce7f 100644 --- a/algebird-test/src/main/scala/com/twitter/algebird/BaseProperties.scala +++ b/algebird-test/src/main/scala/com/twitter/algebird/BaseProperties.scala @@ -16,9 +16,9 @@ limitations under the License. package com.twitter.algebird -import org.scalacheck.Arbitrary -import org.scalacheck.Prop +import org.scalacheck.{Arbitrary, Gen, Prop} import org.scalacheck.Prop.forAll + import scala.math.Equiv /** @@ -26,6 +26,14 @@ import scala.math.Equiv */ object BaseProperties { + val arbReasonableBigDecimals: Arbitrary[BigDecimal] = Arbitrary( + for { + scale <- Gen.choose(-128, +128) + base <- implicitly[Arbitrary[BigInt]].arbitrary + } yield { + (BigDecimal(base) * BigDecimal(10).pow(scale)) + }) + def defaultEq[T](t0: T, t1: T) = t0 == t1 trait HigherEq[M[_]] { diff --git a/algebird-test/src/test/scala/com/twitter/algebird/BatchedTest.scala b/algebird-test/src/test/scala/com/twitter/algebird/BatchedTest.scala index b0974c8ec..29cb85803 100644 --- a/algebird-test/src/test/scala/com/twitter/algebird/BatchedTest.scala +++ b/algebird-test/src/test/scala/com/twitter/algebird/BatchedTest.scala @@ -27,6 +27,7 @@ import Helpers.arbitraryBatched class BatchedLaws extends CheckProperties with Matchers with PropertyChecks { import BaseProperties._ + implicit val arbitraryBigDecimalsHere = 
BaseProperties.arbReasonableBigDecimals def testBatchedMonoid[A: Arbitrary: Monoid](name: String, size: Int): Unit = { implicit val m: Monoid[Batched[A]] = Batched.compactingMonoid[A](size) @@ -43,6 +44,10 @@ class BatchedLaws extends CheckProperties with Matchers with PropertyChecks { testBatchedMonoid[BigInt]("BigInt", 10) testBatchedMonoid[BigInt]("BigInt", 100) testBatchedMonoid[BigInt]("BigInt", 1000000) + testBatchedMonoid[BigDecimal]("BigDecimal", 1) + testBatchedMonoid[BigDecimal]("BigDecimal", 10) + testBatchedMonoid[BigDecimal]("BigDecimal", 100) + testBatchedMonoid[BigDecimal]("BigDecimal", 1000000) testBatchedMonoid[String]("String", 1) testBatchedMonoid[String]("String", 10) testBatchedMonoid[String]("String", 100) diff --git a/algebird-test/src/test/scala/com/twitter/algebird/CountMinSketchTest.scala b/algebird-test/src/test/scala/com/twitter/algebird/CountMinSketchTest.scala index 5a7cf86de..4086e4c34 100644 --- a/algebird-test/src/test/scala/com/twitter/algebird/CountMinSketchTest.scala +++ b/algebird-test/src/test/scala/com/twitter/algebird/CountMinSketchTest.scala @@ -57,6 +57,12 @@ class CmsLaws extends PropSpec with PropertyChecks with Matchers { check(monoidLawsEquiv[CMS[BigInt]]) } + property("CountMinSketch[BigDecimal] is a Monoid") { + implicit val cmsMonoid = CMS.monoid[BigDecimal](EPS, DELTA, SEED) + implicit val cmsGen = createArbitrary[BigDecimal](cmsMonoid) + check(monoidLawsEquiv[CMS[BigDecimal]]) + } + property("CountMinSketch[String] is a Monoid") { implicit val cmsMonoid = CMS.monoid[String](EPS, DELTA, SEED) implicit val cmsGen = createArbitrary[String](cmsMonoid) @@ -111,6 +117,12 @@ class TopPctCmsLaws extends PropSpec with PropertyChecks with Matchers { monoidLaws[TopCMS[BigInt]] } + property("TopPctCms[BigDecimal] is a Monoid") { + implicit val cmsMonoid = TopPctCMS.monoid[BigDecimal](EPS, DELTA, SEED, HEAVY_HITTERS_PCT) + implicit val cmsGen = createArbitrary[BigDecimal](cmsMonoid) + monoidLaws[TopCMS[BigDecimal]] + } + 
property("TopPctCms[String] is a Monoid") { implicit val cmsMonoid = TopPctCMS.monoid[String](EPS, DELTA, SEED, HEAVY_HITTERS_PCT) implicit val cmsGen = createArbitrary[String](cmsMonoid) @@ -273,6 +285,7 @@ class CMSShortTest extends CMSTest[Short] class CMSIntTest extends CMSTest[Int] class CMSLongTest extends CMSTest[Long] class CMSBigIntTest extends CMSTest[BigInt] +class CMSBigDecimalTest extends CMSTest[BigDecimal] class CMSStringTest extends CMSTest[String] class CMSBytesTest extends CMSTest[Bytes] @@ -964,6 +977,7 @@ class CMSHasherShortSpec extends CMSHasherSpec[Short] class CMSHasherIntSpec extends CMSHasherSpec[Int] class CMSHasherLongSpec extends CMSHasherSpec[Long] class CMSHasherBigIntSpec extends CMSHasherSpec[BigInt] +class CMSHasherBigDecimalSpec extends CMSHasherSpec[BigDecimal] class CMSHasherStringSpec extends CMSHasherSpec[String] class CMSHasherBytesSpec extends CMSHasherSpec[Bytes] diff --git a/algebird-test/src/test/scala/com/twitter/algebird/FromIntLike.scala b/algebird-test/src/test/scala/com/twitter/algebird/FromIntLike.scala index b8b509d2a..235296990 100644 --- a/algebird-test/src/test/scala/com/twitter/algebird/FromIntLike.scala +++ b/algebird-test/src/test/scala/com/twitter/algebird/FromIntLike.scala @@ -36,6 +36,10 @@ object FromIntLike { override def fromInt(x: Int): BigInt = BigInt(x) } + implicit object FromIntBigDecimal extends FromIntLike[BigDecimal] { + override def fromInt(x: Int): BigDecimal = BigDecimal(x) + } + implicit object FromIntString extends FromIntLike[String] { override def fromInt(x: Int): String = x.toString }