twitter · johnynek · Dec 13, 2013 · Dec 9, 2013 · Dec 9, 2013 · Dec 11, 2013
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala b/algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala
@@ -333,7 +333,7 @@ case class BFHash(numHashes: Int, width: Int, seed: Long = 0L) extends Function1
       }else
         digested
 
-      Stream.cons(d(0) % width, nextHash(bytes, k - 1, d.drop(1)))
+      Stream.cons(math.abs(d(0) % width), nextHash(bytes, k - 1, d.drop(1)))
     }
   }
 }

diff --git a/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala b/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala
@@ -24,9 +24,68 @@ object BloomFilterLaws extends Properties("BloomFilter") {
   property("BloomFilter is a Monoid") = monoidLaws[BF]
 }
 
+object BFHashIndices extends Properties("BFHash") {
+  import org.scalacheck.Prop.forAll
+
+  val NUM_HASHES = 10
+  val WIDTH = 4752800
+
+  val SEED = 1
 
-class BloomFilterTest extends Specification {
+  implicit val bfHashIndices: Arbitrary[Stream[Int]] =
+    Arbitrary {
+      for {
+        hashes <- choose(1, 10)
+        width <- choose(100, 5000000)
+        v <- choose(0, 100000)
+      } yield {BFHash(hashes, width, SEED).apply(v.toString)}
+    }
+
+  property("Indices are non negative") = forAll{ hashIndices: Stream[Int] => hashIndices.forall(_ >= 0)} 
+
+  /**
+   * Copy of BFHash as it was before the negative-index fix, kept so the old and new versions can be compared.
+   */
+  case class NegativeBFHash(numHashes: Int, width: Int, seed: Long = 0L) extends Function1[String, Iterable[Int]]{
+    val size = numHashes
+
+    def apply(s: String) = nextHash(s.getBytes, numHashes)
+
+    private def splitLong(x: Long) = {
+      val upper = math.abs(x >> 32).toInt
+      val lower = math.abs((x << 32) >> 32).toInt
+      (upper, lower)
+    }
 
+    private def nextHash(bytes: Array[Byte], k: Int, digested: Seq[Int] = Seq.empty): Stream[Int] = {
+      if(k == 0)
+        Stream.empty
+      else{
+        val d = if(digested.isEmpty){
+          val (a, b) = MurmurHash128(k)(bytes)
+          val (x1, x2) = splitLong(a)
+          val (x3, x4) = splitLong(b)
+          Seq(x1, x2, x3, x4)
+        }else
+          digested
+
+        Stream.cons(d(0) % width, nextHash(bytes, k - 1, d.drop(1)))
+      }
+    }
+  }
+
+  val negativeBFHash = NegativeBFHash(NUM_HASHES, WIDTH, SEED)
+  val bfHash = BFHash(NUM_HASHES, WIDTH, SEED)
+
+  property("Indices of the two versions of BFHashes are the same, unless the first one contains negative index") = forAll{ long: Long =>
+    val s = long.toString
+    val indices = negativeBFHash.apply(s)
+    indices == bfHash.apply(s) || indices.exists(_ < 0)
+  }
+}
+
+
+class BloomFilterTest extends Specification {
 
   val SEED = 1
   val RAND = new scala.util.Random
@@ -119,5 +178,18 @@ class BloomFilterTest extends Specification {
       val bytesAfterSizeCalled = new String(serialize(bf))
       bytesBeforeSizeCalled mustEqual bytesAfterSizeCalled
     }
+
+    /**
+     * Regression test: this failed before the fix for https://github.com/twitter/algebird/issues/229
+     */
+    "not have negative hash values" in {
+      val NUM_HASHES = 2
+      val WIDTH = 4752800
+      val bfHash = BFHash(NUM_HASHES, WIDTH, SEED)
+      val s = "7024497610539761509"
+      val index = bfHash.apply(s).head
+
+      index must be_>=(0)
+    }
   }
 }
-Original file line number
+Diff line change
@@ Expand Up @@
           }else
             digested
-          Stream.cons(d(0) % width, nextHash(bytes, k - 1, d.drop(1)))
+          Stream.cons(math.abs(d(0) % width), nextHash(bytes, k - 1, d.drop(1)))
         }
       }
     }
@@ Expand Down @@