-
Notifications
You must be signed in to change notification settings - Fork 346
Make BFHash take on non negative values only, issue #229 #243
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,9 +24,68 @@ object BloomFilterLaws extends Properties("BloomFilter") { | |
property("BloomFilter is a Monoid") = monoidLaws[BF] | ||
} | ||
|
||
object BFHashIndices extends Properties("BFHash") { | ||
import org.scalacheck.Prop.forAll | ||
|
||
val NUM_HASHES = 10 | ||
val WIDTH = 4752800 | ||
|
||
val SEED = 1 | ||
|
||
class BloomFilterTest extends Specification { | ||
implicit val bfHashIndices: Arbitrary[Stream[Int]] = | ||
Arbitrary { | ||
for { | ||
hashes <- choose(1, 10) | ||
width <- choose(100, 5000000) | ||
v <- choose(0, 100000) | ||
} yield {BFHash(hashes, width, SEED).apply(v.toString)} | ||
} | ||
|
||
property("Indices are non negative") = forAll{ hashIndices: Stream[Int] => hashIndices.forall(_ >= 0)} | ||
|
||
/** | ||
* This is the version of the BFHash before the negative values fix | ||
*/ | ||
case class NegativeBFHash(numHashes: Int, width: Int, seed: Long = 0L) extends Function1[String, Iterable[Int]]{ | ||
val size = numHashes | ||
|
||
def apply(s: String) = nextHash(s.getBytes, numHashes) | ||
|
||
private def splitLong(x: Long) = { | ||
val upper = math.abs(x >> 32).toInt | ||
val lower = math.abs((x << 32) >> 32).toInt | ||
(upper, lower) | ||
} | ||
|
||
private def nextHash(bytes: Array[Byte], k: Int, digested: Seq[Int] = Seq.empty): Stream[Int] = { | ||
if(k == 0) | ||
Stream.empty | ||
else{ | ||
val d = if(digested.isEmpty){ | ||
val (a, b) = MurmurHash128(k)(bytes) | ||
val (x1, x2) = splitLong(a) | ||
val (x3, x4) = splitLong(b) | ||
Seq(x1, x2, x3, x4) | ||
}else | ||
digested | ||
|
||
Stream.cons(d(0) % width, nextHash(bytes, k - 1, d.drop(1))) | ||
} | ||
} | ||
} | ||
|
||
val negativeBFHash = NegativeBFHash(NUM_HASHES, WIDTH, SEED) | ||
val bfHash = BFHash(NUM_HASHES, WIDTH, SEED) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use scalacheck to generate these rather than being fixed magic values There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, ALSO have fixed magic values. These were found to have the bug, so let's keep them to prevent regression. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well for the Specification section rather than the scalacheck one we should try generate the exact problem case, not sure it would make any benefit here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is a test for the problem case in the Specification already. |
||
|
||
property("Indices of the two versions of BFHashes are the same, unless the first one contains negative index") = forAll{ long: Long => | ||
val s = long.toString | ||
val indices = negativeBFHash.apply(s) | ||
indices == bfHash.apply(s) || indices.exists(_ < 0) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given how rare the bug was without the or in place here has it ever thrown for you? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's thrown exactly when (x >> 32) happens to be the Integer.MIN_VALUE, and then its absolute value is negative. |
||
} | ||
} | ||
|
||
|
||
class BloomFilterTest extends Specification { | ||
|
||
val SEED = 1 | ||
val RAND = new scala.util.Random | ||
|
@@ -119,5 +178,18 @@ class BloomFilterTest extends Specification { | |
val bytesAfterSizeCalled = new String(serialize(bf)) | ||
bytesBeforeSizeCalled mustEqual bytesAfterSizeCalled | ||
} | ||
|
||
/** | ||
* this test failed before the fix for https://github.com/twitter/algebird/issues/229 | ||
*/ | ||
"not have negative hash values" in { | ||
val NUM_HASHES = 2 | ||
val WIDTH = 4752800 | ||
val bfHash = BFHash(NUM_HASHES, WIDTH, SEED) | ||
val s = "7024497610539761509" | ||
val index = bfHash.apply(s).head | ||
|
||
index must be_>=(0) | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about moving the extra abs into the splitLong function so all the abs/uglyness is together?
Also adding a comment beside it as to why we have two math.abs might be good too?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moving abs into the splitLong wouldn't help because of the Integer.MIN_VALUE issue.
I'd rather use @david206's solution.