From ff9e25d40ff1f2a2942eac7b3dd108ca739f362a Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 13 Jan 2017 12:58:16 -0500 Subject: [PATCH 01/11] sha1_file: add read_loose_object() function commit f6371f9210418f1beabc85b097e2a3470aeeb54d upstream. It's surprisingly hard to ask the sha1_file code to open a _specific_ incarnation of a loose object. Most of the functions take a sha1, and loop over the various object types (packed versus loose) and locations (local versus alternates) at a low level. However, some tools like fsck need to look at a specific file. This patch gives them a function they can use to open the loose object at a given path. The implementation unfortunately ends up repeating bits of related functions, but there's not a good way around it without some major refactoring of the whole sha1_file stack. We need to mmap the specific file, then partially read the zlib stream to know whether we're streaming or not, and then finally either stream it or copy the data to a buffer. We can do that by assembling some of the more arcane internal sha1_file functions, but we end up having to essentially reimplement unpack_sha1_file(), along with the streaming bits of check_sha1_signature(). Still, most of the ugliness is contained in the new function, and the interface is clean enough that it may be reusable (though it seems unlikely anything but git-fsck would care about opening a specific file). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano Signed-off-by: Jonathan Nieder --- cache.h | 13 ++++++ sha1_file.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 143 insertions(+), 2 deletions(-) diff --git a/cache.h b/cache.h index 61d43c6..fa53623 100644 --- a/cache.h +++ b/cache.h @@ -1113,6 +1113,19 @@ extern int finalize_object_file(const char *tmpfile, const char *filename); extern int has_sha1_pack(const unsigned char *sha1); +/* + * Open the loose object at path, check its sha1, and return the contents, + * type, and size. If the object is a blob, then "contents" may return NULL, + * to allow streaming of large blobs. + * + * Returns 0 on success, negative on error (details may be written to stderr). + */ +int read_loose_object(const char *path, + const unsigned char *expected_sha1, + enum object_type *type, + unsigned long *size, + void **contents); + /* * Return true iff we have an object named sha1, whether local or in * an alternate object database, and whether packed or loose. This diff --git a/sha1_file.c b/sha1_file.c index cb571ac..6d29840 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -1547,12 +1547,21 @@ static int open_sha1_file(const unsigned char *sha1) return -1; } -void *map_sha1_file(const unsigned char *sha1, unsigned long *size) +/* + * Map the loose object at "path" if it is not NULL, or the path found by + * searching for a loose object named "sha1". + */ +static void *map_sha1_file_1(const char *path, + const unsigned char *sha1, + unsigned long *size) { void *map; int fd; - fd = open_sha1_file(sha1); + if (path) + fd = git_open_noatime(path); + else + fd = open_sha1_file(sha1); map = NULL; if (fd >= 0) { struct stat st; @@ -1571,6 +1580,11 @@ void *map_sha1_file(const unsigned char *sha1, unsigned long *size) return map; } +void *map_sha1_file(const unsigned char *sha1, unsigned long *size) +{ + return map_sha1_file_1(NULL, sha1, size); +} + unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep) { @@ -3644,3 +3658,117 @@ int for_each_packed_object(each_packed_object_fn cb, void *data, unsigned flags) } return r ? r : pack_errors; } + +static int check_stream_sha1(git_zstream *stream, + const char *hdr, + unsigned long size, + const char *path, + const unsigned char *expected_sha1) +{ + git_SHA_CTX c; + unsigned char real_sha1[GIT_SHA1_RAWSZ]; + unsigned char buf[4096]; + unsigned long total_read; + int status = Z_OK; + + git_SHA1_Init(&c); + git_SHA1_Update(&c, hdr, stream->total_out); + + /* + * We already read some bytes into hdr, but the ones up to the NUL + * do not count against the object's content size. + */ + total_read = stream->total_out - strlen(hdr) - 1; + + /* + * This size comparison must be "<=" to read the final zlib packets; + * see the comment in unpack_sha1_rest for details. + */ + while (total_read <= size && + (status == Z_OK || status == Z_BUF_ERROR)) { + stream->next_out = buf; + stream->avail_out = sizeof(buf); + if (size - total_read < stream->avail_out) + stream->avail_out = size - total_read; + status = git_inflate(stream, Z_FINISH); + git_SHA1_Update(&c, buf, stream->next_out - buf); + total_read += stream->next_out - buf; + } + git_inflate_end(stream); + + if (status != Z_STREAM_END) { + error("corrupt loose object '%s'", sha1_to_hex(expected_sha1)); + return -1; + } + + git_SHA1_Final(real_sha1, &c); + if (hashcmp(expected_sha1, real_sha1)) { + error("sha1 mismatch for %s (expected %s)", path, + sha1_to_hex(expected_sha1)); + return -1; + } + + return 0; +} + +int read_loose_object(const char *path, + const unsigned char *expected_sha1, + enum object_type *type, + unsigned long *size, + void **contents) +{ + int ret = -1; + int fd = -1; + void *map = NULL; + unsigned long mapsize; + git_zstream stream; + char hdr[32]; + + *contents = NULL; + + map = map_sha1_file_1(path, NULL, &mapsize); + if (!map) { + error_errno("unable to mmap %s", path); + goto out; + } + + if (unpack_sha1_header(&stream, map, mapsize, hdr, sizeof(hdr)) < 0) { + error("unable to unpack header of %s", path); + goto out; + } + + *type = parse_sha1_header(hdr, size); + if (*type < 0) { + error("unable to parse header of %s", path); + git_inflate_end(&stream); + goto out; + } + + if (*type == OBJ_BLOB) { + if (check_stream_sha1(&stream, hdr, *size, path, expected_sha1) < 0) + goto out; + } else { + *contents = unpack_sha1_rest(&stream, hdr, *size, expected_sha1); + if (!*contents) { + error("unable to unpack contents of %s", path); + git_inflate_end(&stream); + goto out; + } + if (check_sha1_signature(expected_sha1, *contents, + *size, typename(*type))) { + error("sha1 mismatch for %s (expected %s)", path, + sha1_to_hex(expected_sha1)); + free(*contents); + goto out; + } + } + + ret = 0; /* everything checks out */ + +out: + if (map) + munmap(map, mapsize); + if (fd >= 0) + close(fd); + return ret; +} -- 2.14.4 From 65074f3eda3338445c7674b7d5b176844970b5de Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 13 Jan 2017 12:59:44 -0500 Subject: [PATCH 02/11] fsck: parse loose object paths directly commit c68b489e56431cf27f7719913ab09ddc62f95912 upstream. When we iterate over the list of loose objects to check, we get the actual path of each object. But we then throw it away and pass just the sha1 to fsck_sha1(), which will do a fresh lookup. Usually it would find the same object, but it may not if an object exists both as a loose and a packed object. We may end up checking the packed object twice, and never look at the loose one. In practice this isn't too terrible, because if fsck doesn't complain, it means you have at least one good copy. But since the point of fsck is to look for corruption, we should be thorough. The new read_loose_object() interface can help us get the data from disk, and then we replace parse_object() with parse_object_buffer(). As a bonus, our error messages now mention the path to a corrupted object, which should make it easier to track down errors when they do happen. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano Signed-off-by: Jonathan Nieder --- builtin/fsck.c | 46 +++++++++++++++++++++++++++++++++------------- t/t1450-fsck.sh | 16 ++++++++++++++++ 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/builtin/fsck.c b/builtin/fsck.c index 9923b10..150597c 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -341,18 +341,6 @@ static int fsck_obj(struct object *obj) return 0; } -static int fsck_sha1(const unsigned char *sha1) -{ - struct object *obj = parse_object(sha1); - if (!obj) { - errors_found |= ERROR_OBJECT; - return error("%s: object corrupt or missing", - sha1_to_hex(sha1)); - } - obj->flags |= HAS_OBJ; - return fsck_obj(obj); -} - static int fsck_obj_buffer(const unsigned char *sha1, enum object_type type, unsigned long size, void *buffer, int *eaten) { @@ -459,9 +447,41 @@ static void get_default_heads(void) } } +static struct object *parse_loose_object(const unsigned char *sha1, + const char *path) +{ + struct object *obj; + void *contents; + enum object_type type; + unsigned long size; + int eaten; + + if (read_loose_object(path, sha1, &type, &size, &contents) < 0) + return NULL; + + if (!contents && type != OBJ_BLOB) + die("BUG: read_loose_object streamed a non-blob"); + + obj = parse_object_buffer(sha1, type, size, contents, &eaten); + + if (!eaten) + free(contents); + return obj; +} + static int fsck_loose(const unsigned char *sha1, const char *path, void *data) { - if (fsck_sha1(sha1)) + struct object *obj = parse_loose_object(sha1, path); + + if (!obj) { + errors_found |= ERROR_OBJECT; + error("%s: object corrupt or missing: %s", + sha1_to_hex(sha1), path); + return 0; /* keep checking other objects */ + } + + obj->flags = HAS_OBJ; + if (fsck_obj(obj)) errors_found |= ERROR_OBJECT; return 0; } diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh index 7ee8ea0..17c4c77 100755 --- a/t/t1450-fsck.sh +++ b/t/t1450-fsck.sh @@ -523,4 +523,20 @@ test_expect_success 'fsck --connectivity-only' ' ) ' +test_expect_success 'fsck finds problems in duplicate loose objects' ' + rm -rf broken-duplicate && + git init broken-duplicate && + ( + cd broken-duplicate && + test_commit duplicate && + # no "-d" here, so we end up with duplicates + git repack && + # now corrupt the loose copy + file=$(sha1_file "$(git rev-parse HEAD)") && + rm "$file" && + echo broken >"$file" && + test_must_fail git fsck + ) +' + test_done -- 2.14.4 From 6a26b1fcf92a6c825eb72c1a55399f38af5b7b79 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 2 May 2018 16:37:09 -0400 Subject: [PATCH 03/11] index-pack: make fsck error message more specific commit db5a58c1bda5b20169b9958af1e8b05ddd178b01 upstream. If fsck reports an error, we say only "Error in object". This isn't quite as bad as it might seem, since the fsck code would have dumped some errors to stderr already. But it might help to give a little more context. The earlier output would not have even mentioned "fsck", and that may be a clue that the "fsck.*" or "*.fsckObjects" config may be relevant. Signed-off-by: Jeff King Signed-off-by: Jonathan Nieder --- builtin/index-pack.c | 2 +- builtin/unpack-objects.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index 1008d7f..81f3bc7 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -841,7 +841,7 @@ static void sha1_object(const void *data, struct object_entry *obj_entry, die(_("invalid %s"), typename(type)); if (do_fsck_object && fsck_object(obj, buf, size, &fsck_options)) - die(_("Error in object")); + die(_("fsck error in packed object")); if (fsck_walk(obj, NULL, &fsck_options)) die(_("Not all child objects of %s are reachable"), oid_to_hex(&obj->oid)); diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c index 875e7ed..7018829 100644 --- a/builtin/unpack-objects.c +++ b/builtin/unpack-objects.c @@ -205,7 +205,7 @@ static int check_object(struct object *obj, int type, void *data, struct fsck_op if (!obj_buf) die("Whoops! Cannot find object '%s'", oid_to_hex(&obj->oid)); if (fsck_object(obj, obj_buf->buffer, obj_buf->size, &fsck_options)) - die("Error in object"); + die("fsck error in packed object"); fsck_options.walk = check_object; if (fsck_walk(obj, NULL, &fsck_options)) die("Error on reachable objects of %s", oid_to_hex(&obj->oid)); -- 2.14.4 From 8ac12a6cd60b74748494beadd7750de39f727ff3 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Sun, 13 May 2018 12:35:37 -0400 Subject: [PATCH 04/11] fsck: simplify ".git" check commit ed9c3220621d634d543bc4dd998d12167dfc57d4 upstream. There's no need for us to manually check for ".git"; it's a subset of the other filesystem-specific tests. Dropping it makes our code slightly shorter. More importantly, the existing code may make a reader wonder why ".GIT" is not covered here, and whether that is a bug (it isn't, as it's also covered in the filesystem-specific tests). Signed-off-by: Jeff King Signed-off-by: Jonathan Nieder --- fsck.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fsck.c b/fsck.c index 0531545..c807b7f 100644 --- a/fsck.c +++ b/fsck.c @@ -460,9 +460,7 @@ static int fsck_tree(struct tree *item, struct fsck_options *options) has_empty_name |= !*name; has_dot |= !strcmp(name, "."); has_dotdot |= !strcmp(name, ".."); - has_dotgit |= (!strcmp(name, ".git") || - is_hfs_dotgit(name) || - is_ntfs_dotgit(name)); + has_dotgit |= is_hfs_dotgit(name) || is_ntfs_dotgit(name); has_zero_pad |= *(char *)desc.buffer == '0'; update_tree_entry(&desc); -- 2.14.4 From 27d0f7bf0b6ba57e849353a65ff73887088b6d7b Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 2 May 2018 15:44:51 -0400 Subject: [PATCH 05/11] fsck: actually fsck blob data commit 7ac4f3a007e2567f9d2492806186aa063f9a08d6 upstream. Because fscking a blob has always been a noop, we didn't bother passing around the blob data. In preparation for content-level checks, let's fix up a few things: 1. The fsck_object() function just returns success for any blob. Let's a noop fsck_blob(), which we can fill in with actual logic later. 2. The fsck_loose() function in builtin/fsck.c just threw away blob content after loading it. Let's hold onto it until after we've called fsck_object(). The easiest way to do this is to just drop the parse_loose_object() helper entirely. Incidentally, this also fixes a memory leak: if we successfully loaded the object data but did not parse it, we would have left the function without freeing it. 3. When fsck_loose() loads the object data, it does so with a custom read_loose_object() helper. This function streams any blobs, regardless of size, under the assumption that we're only checking the sha1. Instead, let's actually load blobs smaller than big_file_threshold, as the normal object-reading code-paths would do. This lets us fsck small files, and a NULL return is an indication that the blob was so big that it needed to be streamed, and we can pass that information along to fsck_blob(). Signed-off-by: Jeff King Signed-off-by: Jonathan Nieder --- builtin/fsck.c | 39 +++++++++++++++++++-------------------- fsck.c | 8 +++++++- sha1_file.c | 2 +- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/builtin/fsck.c b/builtin/fsck.c index 150597c..61ab1cd 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -299,7 +299,7 @@ static void check_connectivity(void) } } -static int fsck_obj(struct object *obj) +static int fsck_obj(struct object *obj, void *buffer, unsigned long size) { if (obj->flags & SEEN) return 0; @@ -311,7 +311,7 @@ static int fsck_obj(struct object *obj) if (fsck_walk(obj, NULL, &fsck_obj_options)) objerror(obj, "broken links"); - if (fsck_object(obj, NULL, 0, &fsck_obj_options)) + if (fsck_object(obj, buffer, size, &fsck_obj_options)) return -1; if (obj->type == OBJ_TREE) { @@ -355,7 +355,7 @@ static int fsck_obj_buffer(const unsigned char *sha1, enum object_type type, return error("%s: object corrupt or missing", sha1_to_hex(sha1)); } obj->flags = HAS_OBJ; - return fsck_obj(obj); + return fsck_obj(obj, buffer, size); } static int default_refs; @@ -447,43 +447,42 @@ static void get_default_heads(void) } } -static struct object *parse_loose_object(const unsigned char *sha1, - const char *path) +static int fsck_loose(const unsigned char *sha1, const char *path, void *data) { struct object *obj; - void *contents; enum object_type type; unsigned long size; + void *contents; int eaten; - if (read_loose_object(path, sha1, &type, &size, &contents) < 0) - return NULL; + if (read_loose_object(path, sha1, &type, &size, &contents) < 0) { + errors_found |= ERROR_OBJECT; + error("%s: object corrupt or missing: %s", + sha1_to_hex(sha1), path); + return 0; /* keep checking other objects */ + } if (!contents && type != OBJ_BLOB) die("BUG: read_loose_object streamed a non-blob"); obj = parse_object_buffer(sha1, type, size, contents, &eaten); - if (!eaten) - free(contents); - return obj; -} - -static int fsck_loose(const unsigned char *sha1, const char *path, void *data) -{ - struct object *obj = parse_loose_object(sha1, path); - if (!obj) { errors_found |= ERROR_OBJECT; - error("%s: object corrupt or missing: %s", + error("%s: object could not be parsed: %s", sha1_to_hex(sha1), path); + if (!eaten) + free(contents); return 0; /* keep checking other objects */ } obj->flags = HAS_OBJ; - if (fsck_obj(obj)) + if (fsck_obj(obj, contents, size)) errors_found |= ERROR_OBJECT; - return 0; + + if (!eaten) + free(contents); + return 0; /* keep checking other objects, even if we saw an error */ } static int fsck_cruft(const char *basename, const char *path, void *data) diff --git a/fsck.c b/fsck.c index c807b7f..a520290 100644 --- a/fsck.c +++ b/fsck.c @@ -795,6 +795,12 @@ static int fsck_tag(struct tag *tag, const char *data, return fsck_tag_buffer(tag, data, size, options); } +static int fsck_blob(struct blob *blob, const char *buf, + unsigned long size, struct fsck_options *options) +{ + return 0; +} + int fsck_object(struct object *obj, void *data, unsigned long size, struct fsck_options *options) { @@ -802,7 +808,7 @@ int fsck_object(struct object *obj, void *data, unsigned long size, return report(options, obj, FSCK_MSG_BAD_OBJECT_SHA1, "no valid object to fsck"); if (obj->type == OBJ_BLOB) - return 0; + return fsck_blob((struct blob *)obj, data, size, options); if (obj->type == OBJ_TREE) return fsck_tree((struct tree *) obj, options); if (obj->type == OBJ_COMMIT) diff --git a/sha1_file.c b/sha1_file.c index 6d29840..a091644 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -3744,7 +3744,7 @@ int read_loose_object(const char *path, goto out; } - if (*type == OBJ_BLOB) { + if (*type == OBJ_BLOB && *size > big_file_threshold) { if (check_stream_sha1(&stream, hdr, *size, path, expected_sha1) < 0) goto out; } else { -- 2.14.4 From d393612c63e6bdfd7ab54139c229be3db89c550d Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 2 May 2018 17:20:08 -0400 Subject: [PATCH 06/11] fsck: detect gitmodules files commit 159e7b080bfa5d34559467cacaa79df89a01afc0 upstream. In preparation for performing fsck checks on .gitmodules files, this commit plumbs in the actual detection of the files. Note that unlike most other fsck checks, this cannot be a property of a single object: we must know that the object is found at a ".gitmodules" path at the root tree of a commit. Since the fsck code only sees one object at a time, we have to mark the related objects to fit the puzzle together. When we see a commit we mark its tree as a root tree, and when we see a root tree with a .gitmodules file, we mark the corresponding blob to be checked. In an ideal world, we'd check the objects in topological order: commits followed by trees followed by blobs. In that case we can avoid ever loading an object twice, since all markings would be complete by the time we get to the marked objects. And indeed, if we are checking a single packfile, this is the order in which Git will generally write the objects. But we can't count on that: 1. git-fsck may show us the objects in arbitrary order (loose objects are fed in sha1 order, but we may also have multiple packs, and we process each pack fully in sequence). 2. The type ordering is just what git-pack-objects happens to write now. The pack format does not require a specific order, and it's possible that future versions of Git (or a custom version trying to fool official Git's fsck checks!) may order it differently. 3. We may not even be fscking all of the relevant objects at once. Consider pushing with transfer.fsckObjects, where one push adds a blob at path "foo", and then a second push adds the same blob at path ".gitmodules". The blob is not part of the second push at all, but we need to mark and check it. So in the general case, we need to make up to three passes over the objects: once to make sure we've seen all commits, then once to cover any trees we might have missed, and then a final pass to cover any .gitmodules blobs we found in the second pass. We can simplify things a bit by loosening the requirement that we find .gitmodules only at root trees. Technically a file like "subdir/.gitmodules" is not parsed by Git, but it's not unreasonable for us to declare that Git is aware of all ".gitmodules" files and make them eligible for checking. That lets us drop the root-tree requirement, which eliminates one pass entirely. And it makes our worst case much better: instead of potentially queueing every root tree to be re-examined, the worst case is that we queue each unique .gitmodules blob for a second look. This patch just adds the boilerplate to find .gitmodules files. The actual content checks will come in a subsequent commit. [jn: backported to 2.11.y: - passing oid->hash instead of oid to lookup_blob - using "struct hashmap" directly since "struct oidset" isn't available] Signed-off-by: Jeff King Signed-off-by: Jonathan Nieder --- fsck.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fsck.h | 7 ++++++ 2 files changed, 98 insertions(+) diff --git a/fsck.c b/fsck.c index a520290..f0a566a 100644 --- a/fsck.c +++ b/fsck.c @@ -9,6 +9,42 @@ #include "refs.h" #include "utf8.h" #include "sha1-array.h" +#include "hashmap.h" + +struct oidhash_entry { + struct hashmap_entry ent; + struct object_id oid; +}; + +static int oidhash_hashcmp(const void *va, const void *vb, + const void *vkey) +{ + const struct oidhash_entry *a = va, *b = vb; + const struct object_id *key = vkey; + return oidcmp(&a->oid, key ? key : &b->oid); +} + +static struct hashmap gitmodules_found; +static struct hashmap gitmodules_done; + +static void oidhash_insert(struct hashmap *h, const struct object_id *oid) +{ + struct oidhash_entry *e; + + if (!h->tablesize) + hashmap_init(h, oidhash_hashcmp, 0); + e = xmalloc(sizeof(*e)); + hashmap_entry_init(&e->ent, sha1hash(oid->hash)); + oidcpy(&e->oid, oid); + hashmap_add(h, e); +} + +static int oidhash_contains(struct hashmap *h, const struct object_id *oid) +{ + return h->tablesize && + !!hashmap_get_from_hash(h, sha1hash(oid->hash), oid); +} + #define FSCK_FATAL -1 #define FSCK_INFO -2 @@ -43,6 +79,7 @@ FUNC(MISSING_TAG_ENTRY, ERROR) \ FUNC(MISSING_TAG_OBJECT, ERROR) \ FUNC(MISSING_TREE, ERROR) \ + FUNC(MISSING_TREE_OBJECT, ERROR) \ FUNC(MISSING_TYPE, ERROR) \ FUNC(MISSING_TYPE_ENTRY, ERROR) \ FUNC(MULTIPLE_AUTHORS, ERROR) \ @@ -50,6 +87,8 @@ FUNC(TREE_NOT_SORTED, ERROR) \ FUNC(UNKNOWN_TYPE, ERROR) \ FUNC(ZERO_PADDED_DATE, ERROR) \ + FUNC(GITMODULES_MISSING, ERROR) \ + FUNC(GITMODULES_BLOB, ERROR) \ /* warnings */ \ FUNC(BAD_FILEMODE, WARN) \ FUNC(EMPTY_NAME, WARN) \ @@ -462,6 +501,10 @@ static int fsck_tree(struct tree *item, struct fsck_options *options) has_dotdot |= !strcmp(name, ".."); has_dotgit |= is_hfs_dotgit(name) || is_ntfs_dotgit(name); has_zero_pad |= *(char *)desc.buffer == '0'; + + if (is_hfs_dotgitmodules(name) || is_ntfs_dotgitmodules(name)) + oidhash_insert(&gitmodules_found, oid); + update_tree_entry(&desc); switch (mode) { @@ -831,3 +874,51 @@ int fsck_error_function(struct object *obj, int msg_type, const char *message) error("object %s: %s", oid_to_hex(&obj->oid), message); return 1; } + +int fsck_finish(struct fsck_options *options) +{ + int ret = 0; + struct hashmap_iter iter; + const struct oidhash_entry *e; + + hashmap_iter_init(&gitmodules_found, &iter); + while ((e = hashmap_iter_next(&iter))) { + const struct object_id *oid = &e->oid; + struct blob *blob; + enum object_type type; + unsigned long size; + char *buf; + + if (oidhash_contains(&gitmodules_done, oid)) + continue; + + blob = lookup_blob(oid->hash); + if (!blob) { + ret |= report(options, &blob->object, + FSCK_MSG_GITMODULES_BLOB, + "non-blob found at .gitmodules"); + continue; + } + + buf = read_sha1_file(oid->hash, &type, &size); + if (!buf) { + ret |= report(options, &blob->object, + FSCK_MSG_GITMODULES_MISSING, + "unable to read .gitmodules blob"); + continue; + } + + if (type == OBJ_BLOB) + ret |= fsck_blob(blob, buf, size, options); + else + ret |= report(options, &blob->object, + FSCK_MSG_GITMODULES_BLOB, + "non-blob found at .gitmodules"); + free(buf); + } + + + hashmap_free(&gitmodules_found, 1); + hashmap_free(&gitmodules_done, 1); + return ret; +} diff --git a/fsck.h b/fsck.h index dded84b..3eebe56 100644 --- a/fsck.h +++ b/fsck.h @@ -50,4 +50,11 @@ int fsck_walk(struct object *obj, void *data, struct fsck_options *options); int fsck_object(struct object *obj, void *data, unsigned long size, struct fsck_options *options); +/* + * Some fsck checks are context-dependent, and may end up queued; run this + * after completing all fsck_object() calls in order to resolve any remaining + * checks. + */ +int fsck_finish(struct fsck_options *options); + #endif -- 2.14.4 From ff398bff4000efa4275058c6b4f8c479831494f0 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 2 May 2018 17:25:27 -0400 Subject: [PATCH 07/11] fsck: check .gitmodules content commit ed8b10f631c9a71df3351d46187bf7f3fa4f9b7e upstream. This patch detects and blocks submodule names which do not match the policy set forth in submodule-config. These should already be caught by the submodule code itself, but putting the check here means that newer versions of Git can protect older ones from malicious entries (e.g., a server with receive.fsckObjects will block the objects, protecting clients which fetch from it). As a side effect, this means fsck will also complain about .gitmodules files that cannot be parsed (or were larger than core.bigFileThreshold). Signed-off-by: Jeff King Signed-off-by: Jonathan Nieder --- fsck.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/fsck.c b/fsck.c index f0a566a..d366ad8 100644 --- a/fsck.c +++ b/fsck.c @@ -10,6 +10,9 @@ #include "utf8.h" #include "sha1-array.h" #include "hashmap.h" +#include "submodule-config.h" + +#define CONFIG_ORIGIN_BLOB 0 struct oidhash_entry { struct hashmap_entry ent; @@ -89,6 +92,8 @@ static int oidhash_contains(struct hashmap *h, const struct object_id *oid) FUNC(ZERO_PADDED_DATE, ERROR) \ FUNC(GITMODULES_MISSING, ERROR) \ FUNC(GITMODULES_BLOB, ERROR) \ + FUNC(GITMODULES_PARSE, ERROR) \ + FUNC(GITMODULES_NAME, ERROR) \ /* warnings */ \ FUNC(BAD_FILEMODE, WARN) \ FUNC(EMPTY_NAME, WARN) \ @@ -838,10 +843,64 @@ static int fsck_tag(struct tag *tag, const char *data, return fsck_tag_buffer(tag, data, size, options); } +struct fsck_gitmodules_data { + struct object *obj; + struct fsck_options *options; + int ret; +}; + +static int fsck_gitmodules_fn(const char *var, const char *value, void *vdata) +{ + struct fsck_gitmodules_data *data = vdata; + const char *subsection, *key; + int subsection_len; + char *name; + + if (parse_config_key(var, "submodule", &subsection, &subsection_len, &key) < 0 || + !subsection) + return 0; + + name = xmemdupz(subsection, subsection_len); + if (check_submodule_name(name) < 0) + data->ret |= report(data->options, data->obj, + FSCK_MSG_GITMODULES_NAME, + "disallowed submodule name: %s", + name); + free(name); + + return 0; +} + static int fsck_blob(struct blob *blob, const char *buf, unsigned long size, struct fsck_options *options) { - return 0; + struct fsck_gitmodules_data data; + + if (!oidhash_contains(&gitmodules_found, &blob->object.oid)) + return 0; + oidhash_insert(&gitmodules_done, &blob->object.oid); + + if (!buf) { + /* + * A missing buffer here is a sign that the caller found the + * blob too gigantic to load into memory. Let's just consider + * that an error. + */ + return report(options, &blob->object, + FSCK_MSG_GITMODULES_PARSE, + ".gitmodules too large to parse"); + } + + data.obj = &blob->object; + data.options = options; + data.ret = 0; + if (git_config_from_mem(fsck_gitmodules_fn, CONFIG_ORIGIN_BLOB, + ".gitmodules", buf, size, &data)) + data.ret |= report(options, &blob->object, + FSCK_MSG_GITMODULES_PARSE, + "could not parse gitmodules blob"); + + return data.ret; } int fsck_object(struct object *obj, void *data, unsigned long size, -- 2.14.4 From 3f7875be4a5b68058e243a0525d75165620738cb Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 2 May 2018 17:20:35 -0400 Subject: [PATCH 08/11] fsck: call fsck_finish() after fscking objects commit 1995b5e03e1cc97116be58cdc0502d4a23547856 upstream. Now that the internal fsck code is capable of checking .gitmodules files, we just need to teach its callers to use the "finish" function to check any queued objects. With this, we can now catch the malicious case in t7415 with git-fsck. Signed-off-by: Jeff King Signed-off-by: Jonathan Nieder --- builtin/fsck.c | 3 +++ t/t7415-submodule-names.sh | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/builtin/fsck.c b/builtin/fsck.c index 61ab1cd..70f620b 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -660,6 +660,9 @@ int cmd_fsck(int argc, const char **argv, const char *prefix) count += p->num_objects; } stop_progress(&progress); + + if (fsck_finish(&fsck_obj_options)) + errors_found |= ERROR_OBJECT; } heads = 0; diff --git a/t/t7415-submodule-names.sh b/t/t7415-submodule-names.sh index b30fa33..5d61627 100755 --- a/t/t7415-submodule-names.sh +++ b/t/t7415-submodule-names.sh @@ -73,4 +73,8 @@ test_expect_success 'clone evil superproject' ' ! grep "RUNNING POST CHECKOUT" output ' +test_expect_success 'fsck detects evil superproject' ' + test_must_fail git fsck +' + test_done -- 2.14.4 From 563989871514b892c2c9a4f4fd2ef515dd6adaee Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 4 May 2018 19:40:08 -0400 Subject: [PATCH 09/11] unpack-objects: call fsck_finish() after fscking objects commit 6e328d6caef218db320978e3e251009135d87d0e upstream. As with the previous commit, we must call fsck's "finish" function in order to catch any queued objects for .gitmodules checks. This second pass will be able to access any incoming objects, because we will have exploded them to loose objects by now. This isn't quite ideal, because it means that bad objects may have been written to the object database (and a subsequent operation could then reference them, even if the other side doesn't send the objects again). However, this is sufficient when used with receive.fsckObjects, since those loose objects will all be placed in a temporary quarantine area that will get wiped if we find any problems. Signed-off-by: Jeff King Signed-off-by: Jonathan Nieder --- builtin/unpack-objects.c | 5 ++++- t/t7415-submodule-names.sh | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c index 7018829..27ba86e 100644 --- a/builtin/unpack-objects.c +++ b/builtin/unpack-objects.c @@ -560,8 +560,11 @@ int cmd_unpack_objects(int argc, const char **argv, const char *prefix) unpack_all(); git_SHA1_Update(&ctx, buffer, offset); git_SHA1_Final(sha1, &ctx); - if (strict) + if (strict) { write_rest(); + if (fsck_finish(&fsck_options)) + die(_("fsck error in pack objects")); + } if (hashcmp(fill(20), sha1)) die("final sha1 did not match"); use(20); diff --git a/t/t7415-submodule-names.sh b/t/t7415-submodule-names.sh index 5d61627..a09087c 100755 --- a/t/t7415-submodule-names.sh +++ b/t/t7415-submodule-names.sh @@ -77,4 +77,11 @@ test_expect_success 'fsck detects evil superproject' ' test_must_fail git fsck ' +test_expect_success 'transfer.fsckObjects detects evil superproject (unpack)' ' + rm -rf dst.git && + git init --bare dst.git && + git -C dst.git config transfer.fsckObjects true && + test_must_fail git push dst.git HEAD +' + test_done -- 2.14.4 From 0e3a3c30f0ba16f3e3fd4581cea36b2d2d4d204a Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 4 May 2018 19:45:01 -0400 Subject: [PATCH 10/11] index-pack: check .gitmodules files with --strict commit 73c3f0f704a91b6792e0199a3f3ab6e3a1971675 upstream. Now that the internal fsck code has all of the plumbing we need, we can start checking incoming .gitmodules files. Naively, it seems like we would just need to add a call to fsck_finish() after we've processed all of the objects. And that would be enough to cover the initial test included here. But there are two extra bits: 1. We currently don't bother calling fsck_object() at all for blobs, since it has traditionally been a noop. We'd actually catch these blobs in fsck_finish() at the end, but it's more efficient to check them when we already have the object loaded in memory. 2. The second pass done by fsck_finish() needs to access the objects, but we're actually indexing the pack in this process. In theory we could give the fsck code a special callback for accessing the in-pack data, but it's actually quite tricky: a. We don't have an internal efficient index mapping oids to packfile offsets. We only generate it on the fly as part of writing out the .idx file. b. We'd still have to reconstruct deltas, which means we'd basically have to replicate all of the reading logic in packfile.c. Instead, let's avoid running fsck_finish() until after we've written out the .idx file, and then just add it to our internal packed_git list. This does mean that the objects are "in the repository" before we finish our fsck checks. But unpack-objects already exhibits this same behavior, and it's an acceptable tradeoff here for the same reason: the quarantine mechanism means that pushes will be fully protected. In addition to a basic push test in t7415, we add a sneaky pack that reverses the usual object order in the pack, requiring that index-pack access the tree and blob during the "finish" step. This already works for unpack-objects (since it will have written out loose objects), but we'll check it with this sneaky pack for good measure. Signed-off-by: Jeff King Signed-off-by: Jonathan Nieder --- builtin/index-pack.c | 10 ++++++++++ t/lib-pack.sh | 12 ++++++++++++ t/t7415-submodule-names.sh | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index 81f3bc7..f53bf22 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -825,6 +825,9 @@ static void sha1_object(const void *data, struct object_entry *obj_entry, blob->object.flags |= FLAG_CHECKED; else die(_("invalid blob object %s"), sha1_to_hex(sha1)); + if (do_fsck_object && + fsck_object(&blob->object, (void *)data, size, &fsck_options)) + die(_("fsck error in packed object")); } else { struct object *obj; int eaten; @@ -1440,6 +1443,9 @@ static void final(const char *final_pack_name, const char *curr_pack_name, } else chmod(final_index_name, 0444); + if (do_fsck_object) + add_packed_git(final_index_name, strlen(final_index_name), 0); + if (!from_stdin) { printf("%s\n", sha1_to_hex(sha1)); } else { @@ -1775,6 +1781,10 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix) pack_sha1); else close(input_fd); + + if (do_fsck_object && fsck_finish(&fsck_options)) + die(_("fsck error in pack objects")); + free(objects); strbuf_release(&index_name_buf); strbuf_release(&keep_name_buf); diff --git a/t/lib-pack.sh b/t/lib-pack.sh index 7509846..4674899 100644 --- a/t/lib-pack.sh +++ b/t/lib-pack.sh @@ -79,6 +79,18 @@ pack_obj () { ;; esac + # If it's not a delta, we can convince pack-objects to generate a pack + # with just our entry, and then strip off the header (12 bytes) and + # trailer (20 bytes). + if test -z "$2" + then + echo "$1" | git pack-objects --stdout >pack_obj.tmp && + size=$(wc -c &2 "BUG: don't know how to print $1${2:+ (from $2)}" return 1 } diff --git a/t/t7415-submodule-names.sh b/t/t7415-submodule-names.sh index a09087c..fca7091 100755 --- a/t/t7415-submodule-names.sh +++ b/t/t7415-submodule-names.sh @@ -6,6 +6,7 @@ Exercise the name-checking function on a variety of names, and then give a real-world setup that confirms we catch this in practice. ' . ./test-lib.sh +. "$TEST_DIRECTORY"/lib-pack.sh test_expect_success 'check names' ' cat >expect <<-\EOF && @@ -84,4 +85,41 @@ test_expect_success 'transfer.fsckObjects detects evil superproject (unpack)' ' test_must_fail git push dst.git HEAD ' +test_expect_success 'transfer.fsckObjects detects evil superproject (index)' ' + rm -rf dst.git && + git init --bare dst.git && + git -C dst.git config transfer.fsckObjects true && + git -C dst.git config transfer.unpackLimit 1 && + test_must_fail git push dst.git HEAD +' + +# Normally our packs contain commits followed by trees followed by blobs. This +# reverses the order, which requires backtracking to find the context of a +# blob. We'll start with a fresh gitmodules-only tree to make it simpler. +test_expect_success 'create oddly ordered pack' ' + git checkout --orphan odd && + git rm -rf --cached . && + git add .gitmodules && + git commit -m odd && + { + pack_header 3 && + pack_obj $(git rev-parse HEAD:.gitmodules) && + pack_obj $(git rev-parse HEAD^{tree}) && + pack_obj $(git rev-parse HEAD) + } >odd.pack && + pack_trailer odd.pack +' + +test_expect_success 'transfer.fsckObjects handles odd pack (unpack)' ' + rm -rf dst.git && + git init --bare dst.git && + test_must_fail git -C dst.git unpack-objects --strict Date: Fri, 4 May 2018 20:03:35 -0400 Subject: [PATCH 11/11] fsck: complain when .gitmodules is a symlink commit b7b1fca175f1ed7933f361028c631b9ac86d868d upstream. We've recently forbidden .gitmodules to be a symlink in verify_path(). And it's an easy way to circumvent our fsck checks for .gitmodules content. So let's complain when we see it. Signed-off-by: Jeff King Signed-off-by: Jonathan Nieder --- fsck.c | 14 ++++++++++---- t/t7415-submodule-names.sh | 29 +++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/fsck.c b/fsck.c index d366ad8..79694b0 100644 --- a/fsck.c +++ b/fsck.c @@ -94,6 +94,7 @@ static int oidhash_contains(struct hashmap *h, const struct object_id *oid) FUNC(GITMODULES_BLOB, ERROR) \ FUNC(GITMODULES_PARSE, ERROR) \ FUNC(GITMODULES_NAME, ERROR) \ + FUNC(GITMODULES_SYMLINK, ERROR) \ /* warnings */ \ FUNC(BAD_FILEMODE, WARN) \ FUNC(EMPTY_NAME, WARN) \ @@ -472,7 +473,7 @@ static int verify_ordered(unsigned mode1, const char *name1, unsigned mode2, con static int fsck_tree(struct tree *item, struct fsck_options *options) { - int retval; + int retval = 0; int has_null_sha1 = 0; int has_full_path = 0; int has_empty_name = 0; @@ -507,8 +508,14 @@ static int fsck_tree(struct tree *item, struct fsck_options *options) has_dotgit |= is_hfs_dotgit(name) || is_ntfs_dotgit(name); has_zero_pad |= *(char *)desc.buffer == '0'; - if (is_hfs_dotgitmodules(name) || is_ntfs_dotgitmodules(name)) - oidhash_insert(&gitmodules_found, oid); + if (is_hfs_dotgitmodules(name) || is_ntfs_dotgitmodules(name)) { + if (!S_ISLNK(mode)) + oidhash_insert(&gitmodules_found, oid); + else + retval += report(options, &item->object, + FSCK_MSG_GITMODULES_SYMLINK, + ".gitmodules is a symbolic link"); + } update_tree_entry(&desc); @@ -551,7 +558,6 @@ static int fsck_tree(struct tree *item, struct fsck_options *options) o_name = name; } - retval = 0; if (has_null_sha1) retval += report(options, &item->object, FSCK_MSG_NULL_SHA1, "contains entries pointing to null sha1"); if (has_full_path) diff --git a/t/t7415-submodule-names.sh b/t/t7415-submodule-names.sh index fca7091..6ee7115 100755 --- a/t/t7415-submodule-names.sh +++ b/t/t7415-submodule-names.sh @@ -122,4 +122,33 @@ test_expect_success 'transfer.fsckObjects handles odd pack (index)' ' test_must_fail git -C dst.git index-pack --strict --stdin output && + grep gitmodulesSymlink output + ) +' + test_done -- 2.14.4