|
12 | 12 | from gzip import GzipFile
|
13 | 13 | from io import BytesIO, UnsupportedOperation
|
14 | 14 | from distutils.version import StrictVersion
|
| 15 | +import hashlib |
| 16 | +import time |
15 | 17 |
|
16 | 18 | from numpy.compat.py3k import asstr, asbytes
|
17 | 19 | from ..openers import (Opener,
|
18 | 20 | ImageOpener,
|
19 | 21 | HAVE_INDEXED_GZIP,
|
20 | 22 | BZ2File,
|
| 23 | + DeterministicGzipFile, |
21 | 24 | )
|
22 | 25 | from ..tmpdirs import InTemporaryDirectory
|
23 | 26 | from ..volumeutils import BinOpener
|
@@ -367,3 +370,126 @@ def test_iter():
|
367 | 370 | lobj = Opener(Lunk(''))
|
368 | 371 | with pytest.raises(TypeError):
|
369 | 372 | list(lobj)
|
| 373 | + |
| 374 | + |
def md5sum(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and hashed in fixed-size chunks, so
    its contents never have to be held in memory all at once.

    Parameters
    ----------
    fname : str or path-like
        Path of the file to hash.

    Returns
    -------
    str
        Hex digest of the file's bytes.
    """
    digest = hashlib.md5()
    with open(fname, "rb") as fobj:
        # iter() with a sentinel stops at the first empty read, i.e. EOF
        for chunk in iter(lambda: fobj.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
| 378 | + |
| 379 | + |
def test_DeterministicGzipFile():
    """Compare DeterministicGzipFile output with plain GzipFile references.

    Reference archives are written with ``GzipFile`` using an empty
    filename and an explicit (or mocked) mtime; the checksums of files
    written through ``DeterministicGzipFile`` are then compared to them.
    """
    payload = b"Hello, I'd like to have an argument."

    def write_reference(**gz_kwargs):
        # Write "ref.gz" with an empty embedded filename and return its md5
        with open("ref.gz", "wb") as raw:
            with GzipFile(filename="", mode="wb", fileobj=raw, **gz_kwargs) as gz:
                gz.write(payload)
        return md5sum("ref.gz")

    with InTemporaryDirectory():
        # Reference with neither a filename nor a timestamp
        anon_chksum = write_reference(mtime=0)

        with DeterministicGzipFile("default.gz", "wb") as dgz:
            wrapped = dgz.myfileobj
            dgz.write(payload)
        # Guard against GzipFile changing its internals: close() is
        # expected to close myfileobj as well
        assert wrapped.closed

        assert md5sum("default.gz") == anon_chksum

        # Reference with no filename but the current timestamp
        now = time.time()
        now_chksum = write_reference(mtime=now)

        with DeterministicGzipFile("now.gz", "wb", mtime=now) as dgz:
            dgz.write(payload)

        assert md5sum("now.gz") == now_chksum

        # The two classes differ in what they do when mtime is omitted
        with mock.patch("time.time") as clock:
            clock.return_value = now

            # Plain GzipFile falls back to time.time()
            assert write_reference() == now_chksum

            # DeterministicGzipFile falls back to 0
            with DeterministicGzipFile("now.gz", "wb") as dgz:
                dgz.write(payload)
            assert md5sum("now.gz") == anon_chksum

        # GzipFile output depends on the filename it was given ...
        with GzipFile("filenameA.gz", mode="wb", mtime=0) as gz:
            gz.write(payload)
        fnameA_chksum = md5sum("filenameA.gz")
        assert fnameA_chksum != anon_chksum

        with DeterministicGzipFile("filenameA.gz", "wb") as dgz:
            dgz.write(payload)

        # ... whereas DeterministicGzipFile output does not
        assert md5sum("filenameA.gz") == anon_chksum
| 437 | + |
| 438 | + |
def test_DeterministicGzipFile_fileobj():
    """Check DeterministicGzipFile output when an open file object is passed.

    A reference archive is written with ``GzipFile`` using an empty
    filename and ``mtime=0``.  Writing the same message through
    ``DeterministicGzipFile`` wrapped around an already-open file must
    give the same checksum whether the filename argument is empty,
    omitted, or set to a real name.
    """
    with InTemporaryDirectory():
        msg = b"Hello, I'd like to have an argument."
        with open("ref.gz", "wb") as fobj:
            with GzipFile(filename="", mode="wb", fileobj=fobj, mtime=0) as gzobj:
                gzobj.write(msg)
        ref_chksum = md5sum("ref.gz")

        # Explicit empty filename
        with open("test.gz", "wb") as fobj:
            with DeterministicGzipFile(filename="", mode="wb", fileobj=fobj) as gzobj:
                gzobj.write(msg)
        # Each comparison needs an assert; a bare expression is evaluated
        # and discarded, so a mismatch would go unnoticed
        assert md5sum("test.gz") == ref_chksum

        # Filename omitted entirely
        with open("test.gz", "wb") as fobj:
            with DeterministicGzipFile(fileobj=fobj, mode="wb") as gzobj:
                gzobj.write(msg)
        assert md5sum("test.gz") == ref_chksum

        # Real filename supplied alongside the file object
        with open("test.gz", "wb") as fobj:
            with DeterministicGzipFile(filename="test.gz", mode="wb", fileobj=fobj) as gzobj:
                gzobj.write(msg)
        assert md5sum("test.gz") == ref_chksum
| 461 | + |
| 462 | + |
def test_bitwise_determinism():
    """Check that gzip files written through Opener are byte-for-byte stable.

    Files written under different names and at different (mocked) times
    must hash the same as an anonymous, zero-mtime reference.  With an
    explicit mtime, the output must still not depend on the filename.
    """
    payload = b"Hello, I'd like to have an argument."

    with InTemporaryDirectory():
        # Canonical reference: empty filename, zero mtime.  compresslevel=1
        # is meant to be the level Opener uses by default
        # NOTE(review): confirm against the openers module.
        with open("ref.gz", "wb") as raw:
            with GzipFile(filename="", mode="wb", compresslevel=1,
                          fileobj=raw, mtime=0) as gz:
                gz.write(payload)
        anon_chksum = md5sum("ref.gz")

        # Two filenames written one mocked second apart
        now = time.time()
        with mock.patch("time.time", return_value=now) as clock:
            for offset, name in enumerate(("a.gz", "b.gz")):
                clock.return_value = now + offset
                with Opener(name, "wb") as out:
                    out.write(payload)

        assert md5sum("a.gz") == anon_chksum
        assert md5sum("b.gz") == anon_chksum

        # An explicit mtime is still honoured, but the filename must not
        # end up in the output
        stamped = ("filenameA.gz", "filenameB.gz")
        for name in stamped:
            with Opener(name, "wb", mtime=0xCAFE10C0) as out:
                out.write(payload)
        fnameA_chksum, fnameB_chksum = [md5sum(name) for name in stamped]
        assert fnameA_chksum == fnameB_chksum
        assert fnameB_chksum != anon_chksum
0 commit comments