Skip to content

Commit b1b8dfd

Browse files
ttaylorrpeff
authored andcommitted
finalize_object_file(): implement collision check
We've had "FIXME!!! Collision check here ?" in finalize_object_file() since aac1794 (Improve sha1 object file writing., 2005-05-03). That is, when we try to write a file with the same name, we assume the on-disk contents are the same and blindly throw away the new copy. One of the reasons we never implemented this is because the files it moves are all named after the cryptographic hash of their contents (either loose objects, or packs which have their hash in the name these days). So we are unlikely to see such a collision by accident. And even though there are weaknesses in sha1, we assume they are mitigated by our use of sha1dc. So while it's a theoretical concern now, it hasn't been a priority. However, if we start using weaker hashes for pack checksums and names, this will become a practical concern. So in preparation, let's actually implement a byte-for-byte collision check. The new check will cause the write of new differing content to be a failure, rather than a silent noop, and we'll retain the temporary file on disk. If there's no collision present, we'll clean up the temporary file as usual after either rename()-ing or link()-ing it into place. Note that this may cause some extra computation when the files are in fact identical, but this should happen rarely. Loose objects are exempt from this check, and the collision check may be skipped by calling the _flags variant of this function with the FOF_SKIP_COLLISION_CHECK bit set. This is done for a couple of reasons: - We don't treat the hash of the loose object file's contents as a checksum, since the same loose object can be stored using different bytes on disk (e.g., when adjusting core.compression, using a different version of zlib, etc.). This is fundamentally different from cases where finalize_object_file() is operating over a file which uses the hash value as a checksum of the contents. In other words, a pair of identical loose objects can be stored using different bytes on disk, and that should not be treated as a collision. - We already use the path of the loose object as its hash value / object name, so checking for collisions at the content level doesn't add anything. Adding a content-level collision check would have to happen at a higher level than in finalize_object_file(), since (avoiding race conditions) writing an object loose which already exists in the repository will prevent us from even reaching finalize_object_file() via the object freshening code. There is a collision check in index-pack via its `check_collision()` function, but there isn't an analogous function in unpack-objects, which just feeds the result to write_object_file(). So skipping the collision check here does not change for better or worse the hardness of loose object writes. As a small note related to the latter bullet point above, we must teach the tmp-objdir routines to similarly skip the content-level collision checks when calling migrate_one() on a loose object file, which we do by setting the FOF_SKIP_COLLISION_CHECK bit when we are inside of a loose object shard. Co-authored-by: Jeff King <[email protected]> Signed-off-by: Jeff King <[email protected]> Helped-by: Elijah Newren <[email protected]> Signed-off-by: Taylor Blau <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 9ca7c2c commit b1b8dfd

File tree

3 files changed

+89
-10
lines changed

3 files changed

+89
-10
lines changed

object-file.c

+64-3
Original file line numberDiff line numberDiff line change
@@ -1899,10 +1899,67 @@ static void write_object_file_prepare_literally(const struct git_hash_algo *algo
18991899
hash_object_body(algo, &c, buf, len, oid, hdr, hdrlen);
19001900
}
19011901

1902+
static int check_collision(const char *filename_a, const char *filename_b)
1903+
{
1904+
char buf_a[4096], buf_b[4096];
1905+
int fd_a = -1, fd_b = -1;
1906+
int ret = 0;
1907+
1908+
fd_a = open(filename_a, O_RDONLY);
1909+
if (fd_a < 0) {
1910+
ret = error_errno(_("unable to open %s"), filename_a);
1911+
goto out;
1912+
}
1913+
1914+
fd_b = open(filename_b, O_RDONLY);
1915+
if (fd_b < 0) {
1916+
ret = error_errno(_("unable to open %s"), filename_b);
1917+
goto out;
1918+
}
1919+
1920+
while (1) {
1921+
ssize_t sz_a, sz_b;
1922+
1923+
sz_a = read_in_full(fd_a, buf_a, sizeof(buf_a));
1924+
if (sz_a < 0) {
1925+
ret = error_errno(_("unable to read %s"), filename_a);
1926+
goto out;
1927+
}
1928+
1929+
sz_b = read_in_full(fd_b, buf_b, sizeof(buf_b));
1930+
if (sz_b < 0) {
1931+
ret = error_errno(_("unable to read %s"), filename_b);
1932+
goto out;
1933+
}
1934+
1935+
if (sz_a != sz_b || memcmp(buf_a, buf_b, sz_a)) {
1936+
ret = error(_("files '%s' and '%s' differ in contents"),
1937+
filename_a, filename_b);
1938+
goto out;
1939+
}
1940+
1941+
if (sz_a < sizeof(buf_a))
1942+
break;
1943+
}
1944+
1945+
out:
1946+
if (fd_a > -1)
1947+
close(fd_a);
1948+
if (fd_b > -1)
1949+
close(fd_b);
1950+
return ret;
1951+
}
1952+
19021953
/*
19031954
* Move the just written object into its final resting place.
19041955
*/
19051956
int finalize_object_file(const char *tmpfile, const char *filename)
1957+
{
1958+
return finalize_object_file_flags(tmpfile, filename, 0);
1959+
}
1960+
1961+
int finalize_object_file_flags(const char *tmpfile, const char *filename,
1962+
enum finalize_object_file_flags flags)
19061963
{
19071964
struct stat st;
19081965
int ret = 0;
@@ -1941,7 +1998,9 @@ int finalize_object_file(const char *tmpfile, const char *filename)
19411998
errno = saved_errno;
19421999
return error_errno(_("unable to write file %s"), filename);
19432000
}
1944-
/* FIXME!!! Collision check here ? */
2001+
if (!(flags & FOF_SKIP_COLLISION_CHECK) &&
2002+
check_collision(tmpfile, filename))
2003+
return -1;
19452004
unlink_or_warn(tmpfile);
19462005
}
19472006

@@ -2195,7 +2254,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
21952254
warning_errno(_("failed utime() on %s"), tmp_file.buf);
21962255
}
21972256

2198-
return finalize_object_file(tmp_file.buf, filename.buf);
2257+
return finalize_object_file_flags(tmp_file.buf, filename.buf,
2258+
FOF_SKIP_COLLISION_CHECK);
21992259
}
22002260

22012261
static int freshen_loose_object(const struct object_id *oid)
@@ -2317,7 +2377,8 @@ int stream_loose_object(struct input_stream *in_stream, size_t len,
23172377
strbuf_release(&dir);
23182378
}
23192379

2320-
err = finalize_object_file(tmp_file.buf, filename.buf);
2380+
err = finalize_object_file_flags(tmp_file.buf, filename.buf,
2381+
FOF_SKIP_COLLISION_CHECK);
23212382
if (!err && compat)
23222383
err = repo_add_loose_object_map(the_repository, oid, &compat_oid);
23232384
cleanup:

object-file.h

+6
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,13 @@ int check_object_signature(struct repository *r, const struct object_id *oid,
117117
*/
118118
int stream_object_signature(struct repository *r, const struct object_id *oid);
119119

120+
enum finalize_object_file_flags {
121+
FOF_SKIP_COLLISION_CHECK = 1,
122+
};
123+
120124
int finalize_object_file(const char *tmpfile, const char *filename);
125+
int finalize_object_file_flags(const char *tmpfile, const char *filename,
126+
enum finalize_object_file_flags flags);
121127

122128
/* Helper to check and "touch" a file */
123129
int check_and_freshen_file(const char *fn, int freshen);

tmp-objdir.c

+19-7
Original file line numberDiff line numberDiff line change
@@ -204,9 +204,11 @@ static int read_dir_paths(struct string_list *out, const char *path)
204204
return 0;
205205
}
206206

207-
static int migrate_paths(struct strbuf *src, struct strbuf *dst);
207+
static int migrate_paths(struct strbuf *src, struct strbuf *dst,
208+
enum finalize_object_file_flags flags);
208209

209-
static int migrate_one(struct strbuf *src, struct strbuf *dst)
210+
static int migrate_one(struct strbuf *src, struct strbuf *dst,
211+
enum finalize_object_file_flags flags)
210212
{
211213
struct stat st;
212214

@@ -218,12 +220,18 @@ static int migrate_one(struct strbuf *src, struct strbuf *dst)
218220
return -1;
219221
} else if (errno != EEXIST)
220222
return -1;
221-
return migrate_paths(src, dst);
223+
return migrate_paths(src, dst, flags);
222224
}
223-
return finalize_object_file(src->buf, dst->buf);
225+
return finalize_object_file_flags(src->buf, dst->buf, flags);
224226
}
225227

226-
static int migrate_paths(struct strbuf *src, struct strbuf *dst)
228+
static int is_loose_object_shard(const char *name)
229+
{
230+
return strlen(name) == 2 && isxdigit(name[0]) && isxdigit(name[1]);
231+
}
232+
233+
static int migrate_paths(struct strbuf *src, struct strbuf *dst,
234+
enum finalize_object_file_flags flags)
227235
{
228236
size_t src_len = src->len, dst_len = dst->len;
229237
struct string_list paths = STRING_LIST_INIT_DUP;
@@ -237,11 +245,15 @@ static int migrate_paths(struct strbuf *src, struct strbuf *dst)
237245

238246
for (i = 0; i < paths.nr; i++) {
239247
const char *name = paths.items[i].string;
248+
enum finalize_object_file_flags flags_copy = flags;
240249

241250
strbuf_addf(src, "/%s", name);
242251
strbuf_addf(dst, "/%s", name);
243252

244-
ret |= migrate_one(src, dst);
253+
if (is_loose_object_shard(name))
254+
flags_copy |= FOF_SKIP_COLLISION_CHECK;
255+
256+
ret |= migrate_one(src, dst, flags_copy);
245257

246258
strbuf_setlen(src, src_len);
247259
strbuf_setlen(dst, dst_len);
@@ -269,7 +281,7 @@ int tmp_objdir_migrate(struct tmp_objdir *t)
269281
strbuf_addbuf(&src, &t->path);
270282
strbuf_addstr(&dst, get_object_directory());
271283

272-
ret = migrate_paths(&src, &dst);
284+
ret = migrate_paths(&src, &dst, 0);
273285

274286
strbuf_release(&src);
275287
strbuf_release(&dst);

0 commit comments

Comments
 (0)