Skip to content

Commit ea68c4e

Browse files
authored
Merge pull request #1024 from effigies/fix/gzip_compression_options
ENH: Create gzip header deterministically by default
2 parents 44a1052 + ff5efe4 commit ea68c4e

File tree

2 files changed

+147
-7
lines changed

2 files changed

+147
-7
lines changed

nibabel/openers.py

+21-7
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,24 @@
4242
HAVE_INDEXED_GZIP = False
4343

4444

45-
def _gzip_open(filename, mode='rb', compresslevel=9, keep_open=False):
45+
class DeterministicGzipFile(gzip.GzipFile):
    """ Deterministic variant of GzipFile

    This writer does not add filename information to the header, and defaults
    to a modification time (``mtime``) of 0 seconds.

    Parameters
    ----------
    filename : str or None, optional
        Path opened when `fileobj` is not given. It is only used to open the
        file; an empty filename is always passed on to ``GzipFile``.
    mode : str or None, optional
        File mode. ``'b'`` is appended when missing. When no `fileobj` is
        given and `mode` is empty, the file is opened with ``'rb'``.
    compresslevel : int, optional
        Passed through to ``GzipFile``.
    fileobj : file-like or None, optional
        Already-open file object to wrap. When given, `filename` is not used.
    mtime : int or float, optional
        Passed through to ``GzipFile``; defaults to 0.
    """
    def __init__(self, filename=None, mode=None, compresslevel=9, fileobj=None, mtime=0):
        # These two guards are copied from
        # https://github.com/python/cpython/blob/6ab65c6/Lib/gzip.py#L171-L174
        if mode and 'b' not in mode:
            mode += 'b'
        opened_here = fileobj is None
        if opened_here:
            # Storing the handle as ``myfileobj`` lets GzipFile.close() close it
            fileobj = self.myfileobj = open(filename, mode or 'rb')
        try:
            super().__init__(filename="", mode=mode, compresslevel=compresslevel,
                             fileobj=fileobj, mtime=mtime)
        except BaseException:
            # Do not leak the handle we opened if the parent constructor fails
            if opened_here:
                self.myfileobj = None
                fileobj.close()
            raise
60+
61+
62+
def _gzip_open(filename, mode='rb', compresslevel=9, mtime=0, keep_open=False):
4663

4764
# use indexed_gzip if possible for faster read access. If keep_open ==
4865
# True, we tell IndexedGzipFile to keep the file handle open. Otherwise
@@ -52,7 +69,7 @@ def _gzip_open(filename, mode='rb', compresslevel=9, keep_open=False):
5269

5370
# Fall-back to built-in GzipFile
5471
else:
55-
gzip_file = gzip.GzipFile(filename, mode, compresslevel)
72+
gzip_file = DeterministicGzipFile(filename, mode, compresslevel, mtime=mtime)
5673

5774
return gzip_file
5875

@@ -83,7 +100,7 @@ class Opener(object):
83100
passed to opening method when `fileish` is str. Change of defaults as
84101
for \*args
85102
"""
86-
gz_def = (_gzip_open, ('mode', 'compresslevel', 'keep_open'))
103+
gz_def = (_gzip_open, ('mode', 'compresslevel', 'mtime', 'keep_open'))
87104
bz2_def = (BZ2File, ('mode', 'buffering', 'compresslevel'))
88105
zstd_def = (_zstd_open, ('mode', 'level_or_option', 'zstd_dict'))
89106
compress_ext_map = {
@@ -163,10 +180,7 @@ def name(self):
163180
self._name will be None if object was created with a fileobj, otherwise
164181
it will be the filename.
165182
"""
166-
try:
167-
return self.fobj.name
168-
except AttributeError:
169-
return self._name
183+
return self._name
170184

171185
@property
172186
def mode(self):

nibabel/tests/test_openers.py

+126
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,15 @@
1212
from gzip import GzipFile
1313
from io import BytesIO, UnsupportedOperation
1414
from distutils.version import StrictVersion
15+
import hashlib
16+
import time
1517

1618
from numpy.compat.py3k import asstr, asbytes
1719
from ..openers import (Opener,
1820
ImageOpener,
1921
HAVE_INDEXED_GZIP,
2022
BZ2File,
23+
DeterministicGzipFile,
2124
)
2225
from ..tmpdirs import InTemporaryDirectory
2326
from ..volumeutils import BinOpener
@@ -367,3 +370,126 @@ def test_iter():
367370
lobj = Opener(Lunk(''))
368371
with pytest.raises(TypeError):
369372
list(lobj)
373+
374+
375+
def md5sum(fname):
    """ Return the hexadecimal MD5 digest of the bytes stored in file `fname` """
    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        digest.update(stream.read())
    return digest.hexdigest()
378+
379+
380+
def test_DeterministicGzipFile():
    """ Compare DeterministicGzipFile output against reference GzipFile output

    Covers the default ``mtime`` of 0, an explicit ``mtime``, independence
    from ``time.time()`` and independence from the file name.
    """
    payload = b"Hello, I'd like to have an argument."
    with InTemporaryDirectory():
        # Reference: no filename, no mtime
        with open("ref.gz", "wb") as raw, \
                GzipFile(filename="", mode="wb", fileobj=raw, mtime=0) as ref_gz:
            ref_gz.write(payload)
        anon_chksum = md5sum("ref.gz")

        with DeterministicGzipFile("default.gz", "wb") as det_gz:
            owned_handle = det_gz.myfileobj
            det_gz.write(payload)
        # GzipFile.close() is expected to close myfileobj; this guards against
        # a change in GzipFile's internal implementation
        assert owned_handle.closed

        assert md5sum("default.gz") == anon_chksum

        # Reference: no filename, current mtime
        now = time.time()
        with open("ref.gz", "wb") as raw, \
                GzipFile(filename="", mode="wb", fileobj=raw, mtime=now) as ref_gz:
            ref_gz.write(payload)
        now_chksum = md5sum("ref.gz")

        with DeterministicGzipFile("now.gz", "wb", mtime=now) as det_gz:
            det_gz.write(payload)

        assert md5sum("now.gz") == now_chksum

        # Change in default behavior
        with mock.patch("time.time") as fake_time:
            fake_time.return_value = now

            # GzipFile falls back to time.time() when mtime is not given
            with open("ref.gz", "wb") as raw, \
                    GzipFile(filename="", mode="wb", fileobj=raw) as ref_gz:
                ref_gz.write(payload)
            assert md5sum("ref.gz") == now_chksum

            # DeterministicGzipFile falls back to 0
            with DeterministicGzipFile("now.gz", "wb") as det_gz:
                det_gz.write(payload)
            assert md5sum("now.gz") == anon_chksum

        # GzipFile output depends on the filename ...
        with GzipFile("filenameA.gz", mode="wb", mtime=0) as ref_gz:
            ref_gz.write(payload)
        fnameA_chksum = md5sum("filenameA.gz")
        assert fnameA_chksum != anon_chksum

        # ... while DeterministicGzipFile output does not
        with DeterministicGzipFile("filenameA.gz", "wb") as det_gz:
            det_gz.write(payload)

        assert md5sum("filenameA.gz") == anon_chksum
437+
438+
439+
def test_DeterministicGzipFile_fileobj():
    """ Check DeterministicGzipFile output when wrapping an open file object

    Each variant (empty filename, no filename, explicit filename) must produce
    the same bytes as a reference ``GzipFile`` written with an empty filename
    and ``mtime=0``.
    """
    with InTemporaryDirectory():
        msg = b"Hello, I'd like to have an argument."
        with open("ref.gz", "wb") as fobj:
            with GzipFile(filename="", mode="wb", fileobj=fobj, mtime=0) as gzobj:
                gzobj.write(msg)
        ref_chksum = md5sum("ref.gz")

        # Empty filename alongside fileobj
        with open("test.gz", "wb") as fobj:
            with DeterministicGzipFile(filename="", mode="wb", fileobj=fobj) as gzobj:
                gzobj.write(msg)
        # The comparison must be asserted; a bare expression checks nothing
        assert md5sum("test.gz") == ref_chksum

        # No filename at all
        with open("test.gz", "wb") as fobj:
            with DeterministicGzipFile(fileobj=fobj, mode="wb") as gzobj:
                gzobj.write(msg)
        assert md5sum("test.gz") == ref_chksum

        # Explicit filename alongside fileobj
        with open("test.gz", "wb") as fobj:
            with DeterministicGzipFile(filename="test.gz", mode="wb", fileobj=fobj) as gzobj:
                gzobj.write(msg)
        assert md5sum("test.gz") == ref_chksum
461+
462+
463+
def test_bitwise_determinism():
    """ Check that gzip files written through Opener are bitwise reproducible

    Output must not depend on the wall-clock time or on the file name, while
    an explicitly requested ``mtime`` must still change the output.
    """
    payload = b"Hello, I'd like to have an argument."
    with InTemporaryDirectory():
        # Canonical reference: no filename, no mtime.
        # compresslevel=1 is explicit here; the assertions below rely on
        # Opener writing gzip at this level when none is requested.
        with open("ref.gz", "wb") as raw, \
                GzipFile(filename="", mode="wb",
                         compresslevel=1, fileobj=raw,
                         mtime=0) as ref_gz:
            ref_gz.write(payload)
        anon_chksum = md5sum("ref.gz")

        # Different times, different filenames
        now = time.time()
        with mock.patch("time.time") as fake_time:
            for out_name, stamp in (("a.gz", now), ("b.gz", now + 1)):
                fake_time.return_value = stamp
                with Opener(out_name, "wb") as writer:
                    writer.write(payload)

        assert md5sum("a.gz") == anon_chksum
        assert md5sum("b.gz") == anon_chksum

        # Users can still set mtime, but filenames will not be embedded
        with Opener("filenameA.gz", "wb", mtime=0xCAFE10C0) as writer:
            writer.write(payload)
        with Opener("filenameB.gz", "wb", mtime=0xCAFE10C0) as writer:
            writer.write(payload)
        fnameA_chksum = md5sum("filenameA.gz")
        fnameB_chksum = md5sum("filenameB.gz")
        assert fnameA_chksum == fnameB_chksum != anon_chksum

0 commit comments

Comments
 (0)