From 5a670b1b0b1878e1981306b7e4eb6bd07c34fda7 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 2 Apr 2026 22:23:16 +0200 Subject: [PATCH 1/2] gitindex: use --unordered for git cat-file --- gitindex/catfile.go | 69 ++++++++++++++++------ gitindex/catfile_bench_test.go | 67 ++++++++++++---------- gitindex/catfile_hardening_test.go | 64 ++++++++++----------- gitindex/catfile_test.go | 92 +++++++++++++++++++++++++----- gitindex/index.go | 28 +++++++-- 5 files changed, 222 insertions(+), 98 deletions(-) diff --git a/gitindex/catfile.go b/gitindex/catfile.go index 1933c337c..308e80f75 100644 --- a/gitindex/catfile.go +++ b/gitindex/catfile.go @@ -30,12 +30,15 @@ import ( type catfileReaderOptions struct { filterSpec string + unordered bool } // catfileReader provides streaming access to git blob objects via a pipelined // "git cat-file --batch --buffer" process. A writer goroutine feeds all blob // SHAs to stdin while the caller reads responses one at a time, similar to -// archive/tar.Reader. +// archive/tar.Reader. When the reader is configured with --unordered, git may +// return objects in pack order instead of request order, so callers must use +// the object ID returned by Next to correlate responses. // // The --buffer flag switches git's output from per-object flush (write_or_die) // to libc stdio buffering (fwrite), reducing syscalls. After stdin EOF, git @@ -48,13 +51,14 @@ type catfileReaderOptions struct { // defer cr.Close() // // for { -// size, missing, excluded, err := cr.Next() +// id, size, missing, excluded, err := cr.Next() // if err == io.EOF { break } // if missing { continue } // if excluded { continue } // if size > maxSize { continue } // unread bytes auto-skipped // content := make([]byte, size) // io.ReadFull(cr, content) +// _ = id // match back to the requested blob if using --unordered // } type catfileReader struct { cmd *exec.Cmd @@ -74,6 +78,9 @@ type catfileReader struct { // when done. Pass a zero-value catfileReaderOptions when no options are needed. func newCatfileReader(repoDir string, ids []plumbing.Hash, opts catfileReaderOptions) (*catfileReader, error) { args := []string{"cat-file", "--batch", "--buffer"} + if opts.unordered { + args = append(args, "--unordered") + } if opts.filterSpec != "" { args = append(args, "--filter="+opts.filterSpec) } @@ -123,18 +130,18 @@ func newCatfileReader(repoDir string, ids []plumbing.Hash, opts catfileReaderOpt }, nil } -// Next advances to the next blob entry. It returns the blob's size and whether -// it is missing or excluded by the configured filter. Any unread content from -// the previous entry is automatically discarded. Returns io.EOF when all -// entries have been consumed. +// Next advances to the next blob entry. It returns the blob ID, size, and +// whether it is missing or excluded by the configured filter. Any unread +// content from the previous entry is automatically discarded. Returns io.EOF +// when all entries have been consumed. // // After Next returns successfully with missing=false and excluded=false, call // Read to consume the blob content, or call Next again to skip it. -func (cr *catfileReader) Next() (size int, missing bool, excluded bool, err error) { +func (cr *catfileReader) Next() (id plumbing.Hash, size int, missing bool, excluded bool, err error) { // Discard unread content from the previous entry. if cr.pending > 0 { if _, err := cr.reader.Discard(cr.pending); err != nil { - return 0, false, false, fmt.Errorf("discard pending bytes: %w", err) + return plumbing.ZeroHash, 0, false, false, fmt.Errorf("discard pending bytes: %w", err) } cr.pending = 0 } @@ -142,33 +149,57 @@ func (cr *catfileReader) Next() (size int, missing bool, excluded bool, err erro headerBytes, err := cr.reader.ReadBytes('\n') if err != nil { if err == io.EOF { - return 0, false, false, io.EOF + return plumbing.ZeroHash, 0, false, false, io.EOF } - return 0, false, false, fmt.Errorf("read header: %w", err) + return plumbing.ZeroHash, 0, false, false, fmt.Errorf("read header: %w", err) } header := headerBytes[:len(headerBytes)-1] // trim \n + firstSpace := bytes.IndexByte(header, ' ') + if firstSpace == -1 { + return plumbing.ZeroHash, 0, false, false, fmt.Errorf("unexpected header: %q", header) + } - if bytes.HasSuffix(header, []byte(" missing")) { - return 0, true, false, nil + id, err = parseCatfileObjectID(header[:firstSpace]) + if err != nil { + return plumbing.ZeroHash, 0, false, false, fmt.Errorf("parse object id from %q: %w", header, err) } - if bytes.HasSuffix(header, []byte(" excluded")) { - return 0, false, true, nil + rest := header[firstSpace+1:] + if bytes.Equal(rest, []byte("missing")) { + return id, 0, true, false, nil + } + + if bytes.Equal(rest, []byte("excluded")) { + return id, 0, false, true, nil } - // Parse size from " ". lastSpace := bytes.LastIndexByte(header, ' ') - if lastSpace == -1 { - return 0, false, false, fmt.Errorf("unexpected header: %q", header) + if lastSpace <= firstSpace { + return plumbing.ZeroHash, 0, false, false, fmt.Errorf("unexpected header: %q", header) } + + // Parse size from " ". size, err = strconv.Atoi(string(header[lastSpace+1:])) if err != nil { - return 0, false, false, fmt.Errorf("parse size from %q: %w", header, err) + return plumbing.ZeroHash, 0, false, false, fmt.Errorf("parse size from %q: %w", header, err) } // Track pending bytes: content + trailing LF. cr.pending = size + 1 - return size, false, false, nil + return id, size, false, false, nil +} + +func parseCatfileObjectID(raw []byte) (plumbing.Hash, error) { + if len(raw) != 40 { + return plumbing.ZeroHash, fmt.Errorf("invalid object id length %d", len(raw)) + } + + var id plumbing.Hash + if _, err := hex.Decode(id[:], raw); err != nil { + return plumbing.ZeroHash, err + } + + return id, nil } // Read reads from the current blob's content. Implements io.Reader. Returns diff --git a/gitindex/catfile_bench_test.go b/gitindex/catfile_bench_test.go index 5b8897380..e383892f6 100644 --- a/gitindex/catfile_bench_test.go +++ b/gitindex/catfile_bench_test.go @@ -107,9 +107,10 @@ func BenchmarkBlobRead_GoGit(b *testing.B) { } } -// BenchmarkBlobRead_CatfileReader measures the streaming catfileReader approach: -// all SHAs written to stdin at once via --buffer, responses read one at a time. -// This is the production path used by indexGitRepo. +// BenchmarkBlobRead_CatfileReader measures the streaming catfileReader +// approach: all SHAs written to stdin at once via --buffer, responses read one +// at a time. It compares the legacy ordered stream with the production +// unordered mode used by indexGitRepo. func BenchmarkBlobRead_CatfileReader(b *testing.B) { repoDir := requireBenchGitRepo(b) files, gitDir := collectBlobKeys(b, repoDir) @@ -125,35 +126,43 @@ func BenchmarkBlobRead_CatfileReader(b *testing.B) { n = min(n, len(keys)) subset := ids[:n] - b.Run(fmt.Sprintf("files=%d", n), func(b *testing.B) { - b.ReportAllocs() - var totalBytes int64 - for b.Loop() { - totalBytes = 0 - cr, err := newCatfileReader(gitDir, subset, catfileReaderOptions{}) - if err != nil { - b.Fatalf("newCatfileReader: %v", err) - } - for range subset { - size, missing, excluded, err := cr.Next() + for _, benchMode := range []struct { + name string + unordered bool + }{ + {name: "ordered"}, + {name: "unordered", unordered: true}, + } { + b.Run(fmt.Sprintf("files=%d/mode=%s", n, benchMode.name), func(b *testing.B) { + b.ReportAllocs() + var totalBytes int64 + for b.Loop() { + totalBytes = 0 + cr, err := newCatfileReader(gitDir, subset, catfileReaderOptions{unordered: benchMode.unordered}) if err != nil { - cr.Close() - b.Fatalf("Next: %v", err) + b.Fatalf("newCatfileReader: %v", err) } - if missing || excluded { - continue + for range subset { + _, size, missing, excluded, err := cr.Next() + if err != nil { + cr.Close() + b.Fatalf("Next: %v", err) + } + if missing || excluded { + continue + } + content := make([]byte, size) + if _, err := io.ReadFull(cr, content); err != nil { + cr.Close() + b.Fatalf("ReadFull: %v", err) + } + totalBytes += int64(len(content)) } - content := make([]byte, size) - if _, err := io.ReadFull(cr, content); err != nil { - cr.Close() - b.Fatalf("ReadFull: %v", err) - } - totalBytes += int64(len(content)) + cr.Close() } - cr.Close() - } - b.ReportMetric(float64(totalBytes), "content-bytes/op") - b.ReportMetric(float64(len(subset)), "files/op") - }) + b.ReportMetric(float64(totalBytes), "content-bytes/op") + b.ReportMetric(float64(len(subset)), "files/op") + }) + } } } diff --git a/gitindex/catfile_hardening_test.go b/gitindex/catfile_hardening_test.go index e1da3faf7..967170c4e 100644 --- a/gitindex/catfile_hardening_test.go +++ b/gitindex/catfile_hardening_test.go @@ -30,7 +30,7 @@ func TestCatfileReader_DoubleClose(t *testing.T) { } // Consume the entry so the process can exit cleanly. - if _, _, _, err := cr.Next(); err != nil { + if _, _, _, _, err := cr.Next(); err != nil { t.Fatal(err) } @@ -71,7 +71,7 @@ func TestCatfileReader_ConcurrentClose(t *testing.T) { } // Read one entry, leave two unconsumed. - if _, _, _, err := cr.Next(); err != nil { + if _, _, _, _, err := cr.Next(); err != nil { t.Fatal(err) } @@ -153,7 +153,7 @@ func TestCatfileReader_CloseBeforeExhausted_ManyBlobs(t *testing.T) { } // Read only 1 of 200 entries. - if _, _, _, err := cr.Next(); err != nil { + if _, _, _, _, err := cr.Next(); err != nil { t.Fatal(err) } @@ -261,7 +261,7 @@ func TestCatfileReader_ReadAfterFullConsumption(t *testing.T) { } defer cr.Close() - size, _, _, _ := cr.Next() + _, size, _, _, _ := cr.Next() content := make([]byte, size) if _, err := io.ReadFull(cr, content); err != nil { t.Fatal(err) @@ -292,7 +292,7 @@ func TestCatfileReader_SmallBufferReads(t *testing.T) { } defer cr.Close() - size, _, _, _ := cr.Next() + _, size, _, _, _ := cr.Next() var result []byte buf := make([]byte, 1) @@ -336,7 +336,7 @@ func TestCatfileReader_PartialReadThenNext(t *testing.T) { defer cr.Close() // Read only 5 of 12 bytes from hello.txt. - size, _, _, _ := cr.Next() + _, size, _, _, _ := cr.Next() if size != 12 { t.Fatalf("hello.txt size = %d, want 12", size) } @@ -349,7 +349,7 @@ func TestCatfileReader_PartialReadThenNext(t *testing.T) { } // Advance — must discard remaining 7 content bytes + trailing LF. - size, _, _, err = cr.Next() + _, size, _, _, err = cr.Next() if err != nil { t.Fatalf("Next binary.bin after partial read: %v", err) } @@ -383,7 +383,7 @@ func TestCatfileReader_PartialReadExactlyOneByteShort(t *testing.T) { } defer cr.Close() - size, _, _, _ := cr.Next() + _, size, _, _, _ := cr.Next() // Read exactly size-1 bytes — leaves 1 content byte + trailing LF. buf := make([]byte, size-1) if _, err := io.ReadFull(cr, buf); err != nil { @@ -395,7 +395,7 @@ func TestCatfileReader_PartialReadExactlyOneByteShort(t *testing.T) { // Advance — pending should be 2 (1 content byte + 1 LF). The // Discard call must handle this exact boundary correctly. - size, missing, excluded, err := cr.Next() + _, size, missing, excluded, err := cr.Next() if err != nil { t.Fatalf("Next after size-1 partial read: %v", err) } @@ -428,7 +428,7 @@ func TestCatfileReader_EmptyIds(t *testing.T) { } defer cr.Close() - _, _, _, err = cr.Next() + _, _, _, _, err = cr.Next() if err != io.EOF { t.Fatalf("expected io.EOF for empty ids, got %v", err) } @@ -453,7 +453,7 @@ func TestCatfileReader_MultipleEmptyBlobs(t *testing.T) { defer cr.Close() for i := range ids { - size, missing, excluded, err := cr.Next() + _, size, missing, excluded, err := cr.Next() if err != nil { t.Fatalf("Next #%d: %v", i, err) } @@ -466,7 +466,7 @@ func TestCatfileReader_MultipleEmptyBlobs(t *testing.T) { // Don't read — Next should discard the trailing LF for us. } - _, _, _, err = cr.Next() + _, _, _, _, err = cr.Next() if err != io.EOF { t.Fatalf("expected EOF after %d empty blobs, got %v", len(ids), err) } @@ -490,7 +490,7 @@ func TestCatfileReader_EmptyBlobRead(t *testing.T) { } defer cr.Close() - size, _, _, _ := cr.Next() + _, size, _, _, _ := cr.Next() if size != 0 { t.Fatalf("empty.txt size = %d", size) } @@ -504,7 +504,7 @@ func TestCatfileReader_EmptyBlobRead(t *testing.T) { // The trailing LF must have been consumed. Verify by reading the // next entry — if the LF leaked, the header parse would fail. - size, _, _, err = cr.Next() + _, size, _, _, err = cr.Next() if err != nil { t.Fatalf("Next hello.txt after empty blob Read: %v", err) } @@ -543,7 +543,7 @@ func TestCatfileReader_AllMissing(t *testing.T) { defer cr.Close() for i, id := range ids { - _, missing, excluded, err := cr.Next() + _, _, missing, excluded, err := cr.Next() if err != nil { t.Fatalf("Next #%d (%s): %v", i, id, err) } @@ -555,7 +555,7 @@ func TestCatfileReader_AllMissing(t *testing.T) { } } - _, _, _, err = cr.Next() + _, _, _, _, err = cr.Next() if err != io.EOF { t.Fatalf("expected EOF after all missing, got %v", err) } @@ -586,13 +586,13 @@ func TestCatfileReader_AlternatingMissingPresent(t *testing.T) { defer cr.Close() // fake1 — missing - _, missing, excluded, err := cr.Next() + _, _, missing, excluded, err := cr.Next() if err != nil || !missing || excluded { t.Fatalf("fake1: err=%v missing=%v excluded=%v", err, missing, excluded) } // hello.txt — present, read it - size, missing, excluded, err := cr.Next() + _, size, missing, excluded, err := cr.Next() if err != nil || missing || excluded { t.Fatalf("hello.txt: err=%v missing=%v excluded=%v", err, missing, excluded) } @@ -605,13 +605,13 @@ func TestCatfileReader_AlternatingMissingPresent(t *testing.T) { } // fake2 — missing - _, missing, excluded, err = cr.Next() + _, _, missing, excluded, err = cr.Next() if err != nil || !missing || excluded { t.Fatalf("fake2: err=%v missing=%v excluded=%v", err, missing, excluded) } // empty.txt — present, skip it - size, missing, excluded, err = cr.Next() + _, size, missing, excluded, err = cr.Next() if err != nil || missing || excluded { t.Fatalf("empty.txt: err=%v missing=%v excluded=%v", err, missing, excluded) } @@ -620,7 +620,7 @@ func TestCatfileReader_AlternatingMissingPresent(t *testing.T) { } // binary.bin — present, read it - size, missing, excluded, err = cr.Next() + _, size, missing, excluded, err = cr.Next() if err != nil || missing || excluded { t.Fatalf("binary.bin: err=%v missing=%v excluded=%v", err, missing, excluded) } @@ -632,7 +632,7 @@ func TestCatfileReader_AlternatingMissingPresent(t *testing.T) { t.Errorf("binary.bin[0] = 0x%02x, want 0x00", binContent[0]) } - _, _, _, err = cr.Next() + _, _, _, _, err = cr.Next() if err != io.EOF { t.Fatalf("expected EOF, got %v", err) } @@ -661,13 +661,13 @@ func TestCatfileReader_MissingThenSkip(t *testing.T) { defer cr.Close() // missing - _, missing, excluded, _ := cr.Next() + _, _, missing, excluded, _ := cr.Next() if !missing || excluded { t.Fatal("expected missing") } // large.bin — skip - size, missing, excluded, err := cr.Next() + _, size, missing, excluded, err := cr.Next() if err != nil || missing || excluded { t.Fatalf("large.bin: err=%v missing=%v excluded=%v", err, missing, excluded) } @@ -677,7 +677,7 @@ func TestCatfileReader_MissingThenSkip(t *testing.T) { // deliberately don't read // hello.txt — read after missing+skip - size, missing, excluded, err = cr.Next() + _, size, missing, excluded, err = cr.Next() if err != nil || missing || excluded { t.Fatalf("hello.txt: err=%v missing=%v excluded=%v", err, missing, excluded) } @@ -707,19 +707,19 @@ func TestCatfileReader_RepeatedNextAfterEOF(t *testing.T) { defer cr.Close() // Consume and skip the only entry. - if _, _, _, err := cr.Next(); err != nil { + if _, _, _, _, err := cr.Next(); err != nil { t.Fatal(err) } // First EOF. - _, _, _, err = cr.Next() + _, _, _, _, err = cr.Next() if err != io.EOF { t.Fatalf("first post-exhaust Next: %v, want io.EOF", err) } // Second and third EOF — must be stable. for i := 0; i < 2; i++ { - _, _, _, err = cr.Next() + _, _, _, _, err = cr.Next() if err != io.EOF { t.Fatalf("Next #%d after EOF: %v, want io.EOF", i+2, err) } @@ -743,7 +743,7 @@ func TestCatfileReader_LargeBlobBytePrecision(t *testing.T) { } defer cr.Close() - size, _, _, err := cr.Next() + _, size, _, _, err := cr.Next() if err != nil { t.Fatal(err) } @@ -793,7 +793,7 @@ func TestCatfileReader_LargeBlobChunkedRead(t *testing.T) { } defer cr.Close() - size, _, _, _ := cr.Next() + _, size, _, _, _ := cr.Next() if size != 64*1024 { t.Fatalf("size = %d", size) } @@ -844,7 +844,7 @@ func TestCatfileReader_DuplicateSHAs(t *testing.T) { defer cr.Close() for i := 0; i < 3; i++ { - size, missing, excluded, err := cr.Next() + _, size, missing, excluded, err := cr.Next() if err != nil { t.Fatalf("Next #%d: %v", i, err) } @@ -863,7 +863,7 @@ func TestCatfileReader_DuplicateSHAs(t *testing.T) { } } - _, _, _, err = cr.Next() + _, _, _, _, err = cr.Next() if err != io.EOF { t.Fatalf("expected EOF, got %v", err) } diff --git a/gitindex/catfile_test.go b/gitindex/catfile_test.go index 161141273..fbecc9570 100644 --- a/gitindex/catfile_test.go +++ b/gitindex/catfile_test.go @@ -74,7 +74,7 @@ func TestCatfileReader(t *testing.T) { defer cr.Close() // hello.txt - size, missing, excluded, err := cr.Next() + _, size, missing, excluded, err := cr.Next() if err != nil { t.Fatalf("Next hello.txt: %v", err) } @@ -93,7 +93,7 @@ func TestCatfileReader(t *testing.T) { } // empty.txt - size, missing, excluded, err = cr.Next() + _, size, missing, excluded, err = cr.Next() if err != nil { t.Fatalf("Next empty.txt: %v", err) } @@ -105,7 +105,7 @@ func TestCatfileReader(t *testing.T) { } // binary.bin — read content and verify binary data survives. - size, missing, excluded, err = cr.Next() + _, size, missing, excluded, err = cr.Next() if err != nil { t.Fatalf("Next binary.bin: %v", err) } @@ -121,7 +121,7 @@ func TestCatfileReader(t *testing.T) { } // large.bin - size, missing, excluded, err = cr.Next() + _, size, missing, excluded, err = cr.Next() if err != nil { t.Fatalf("Next large.bin: %v", err) } @@ -137,7 +137,7 @@ func TestCatfileReader(t *testing.T) { } // EOF after all entries. - _, _, _, err = cr.Next() + _, _, _, _, err = cr.Next() if err != io.EOF { t.Errorf("expected io.EOF after last entry, got %v", err) } @@ -161,13 +161,13 @@ func TestCatfileReader_Skip(t *testing.T) { defer cr.Close() // Skip hello.txt by calling Next again without reading. - _, _, _, err = cr.Next() + _, _, _, _, err = cr.Next() if err != nil { t.Fatalf("Next hello.txt: %v", err) } // Skip large.bin too. - size, _, _, err := cr.Next() + _, size, _, _, err := cr.Next() if err != nil { t.Fatalf("Next large.bin: %v", err) } @@ -176,7 +176,7 @@ func TestCatfileReader_Skip(t *testing.T) { } // Read binary.bin after skipping two entries. - size, _, _, err = cr.Next() + _, size, _, _, err = cr.Next() if err != nil { t.Fatalf("Next binary.bin: %v", err) } @@ -208,7 +208,7 @@ func TestCatfileReader_Missing(t *testing.T) { defer cr.Close() // hello.txt — read normally. - size, missing, excluded, err := cr.Next() + _, size, missing, excluded, err := cr.Next() if err != nil || missing || excluded { t.Fatalf("Next hello.txt: err=%v missing=%v excluded=%v", err, missing, excluded) } @@ -221,7 +221,7 @@ func TestCatfileReader_Missing(t *testing.T) { } // fakeHash — missing. - _, missing, excluded, err = cr.Next() + _, _, missing, excluded, err = cr.Next() if err != nil { t.Fatalf("Next fakeHash: %v", err) } @@ -233,7 +233,7 @@ func TestCatfileReader_Missing(t *testing.T) { } // empty.txt — still works after missing entry. - size, missing, excluded, err = cr.Next() + _, size, missing, excluded, err = cr.Next() if err != nil || missing || excluded { t.Fatalf("Next empty.txt: err=%v missing=%v excluded=%v", err, missing, excluded) } @@ -258,7 +258,7 @@ func TestCatfileReader_Excluded(t *testing.T) { } defer cr.Close() - _, missing, excluded, err := cr.Next() + _, _, missing, excluded, err := cr.Next() if err != nil { t.Fatalf("Next large.bin: %v", err) } @@ -269,7 +269,7 @@ func TestCatfileReader_Excluded(t *testing.T) { t.Fatal("large.bin unexpectedly included") } - size, missing, excluded, err := cr.Next() + _, size, missing, excluded, err := cr.Next() if err != nil { t.Fatalf("Next hello.txt: %v", err) } @@ -284,3 +284,69 @@ func TestCatfileReader_Excluded(t *testing.T) { t.Errorf("hello.txt = %q", content) } } + +func TestCatfileReader_Unordered(t *testing.T) { + t.Parallel() + + repoDir, blobs := createTestRepo(t) + fakeHash := plumbing.NewHash("deadbeefdeadbeefdeadbeefdeadbeefdeadbeef") + + ids := []plumbing.Hash{ + blobs["large.bin"], + blobs["hello.txt"], + fakeHash, + blobs["hello.txt"], + } + + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{ + filterSpec: "blob:limit=1k", + unordered: true, + }) + if err != nil { + t.Fatalf("newCatfileReader: %v", err) + } + defer cr.Close() + + counts := map[plumbing.Hash]int{} + for range ids { + id, size, missing, excluded, err := cr.Next() + if err != nil { + t.Fatalf("Next: %v", err) + } + counts[id]++ + + switch id { + case blobs["large.bin"]: + if missing || !excluded { + t.Fatalf("large.bin: missing=%v excluded=%v", missing, excluded) + } + case fakeHash: + if !missing || excluded { + t.Fatalf("fakeHash: missing=%v excluded=%v", missing, excluded) + } + case blobs["hello.txt"]: + if missing || excluded { + t.Fatalf("hello.txt: missing=%v excluded=%v", missing, excluded) + } + content := make([]byte, size) + if _, err := io.ReadFull(cr, content); err != nil { + t.Fatalf("ReadFull hello.txt: %v", err) + } + if !bytes.Equal(content, []byte("hello world\n")) { + t.Fatalf("hello.txt content = %q", content) + } + default: + t.Fatalf("unexpected blob id %s", id) + } + } + + if counts[blobs["hello.txt"]] != 2 { + t.Fatalf("hello.txt seen %d times, want 2", counts[blobs["hello.txt"]]) + } + if counts[blobs["large.bin"]] != 1 { + t.Fatalf("large.bin seen %d times, want 1", counts[blobs["large.bin"]]) + } + if counts[fakeHash] != 1 { + t.Fatalf("fakeHash seen %d times, want 1", counts[fakeHash]) + } +} diff --git a/gitindex/index.go b/gitindex/index.go index 5b9d5abf1..2b81fb9c4 100644 --- a/gitindex/index.go +++ b/gitindex/index.go @@ -648,6 +648,7 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { if len(mainRepoIDs) > 0 { crOpts := catfileReaderOptions{ filterSpec: catfileFilterSpec(opts), + unordered: true, } cr, err := newCatfileReader(opts.RepoDir, mainRepoIDs, crOpts) if err != nil { @@ -680,15 +681,32 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { // indexCatfileBlobs streams main-repo blobs from the catfileReader into the // builder. Large blobs are skipped without reading content into memory. -// keys must correspond 1:1 (in order) with the ids passed to newCatfileReader. -// The reader is always closed when this function returns. +// The reader may return blobs out of request order, so responses are matched +// back to the queued file keys by object ID. The reader is always closed when +// this function returns. func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]BlobLocation, opts Options, builder *index.Builder) error { defer cr.Close() - for idx, key := range keys { - size, missing, excluded, err := cr.Next() + keysByID := make(map[plumbing.Hash][]fileKey, len(keys)) + for _, key := range keys { + keysByID[key.ID] = append(keysByID[key.ID], key) + } + + for idx := 0; idx < len(keys); idx++ { + id, size, missing, excluded, err := cr.Next() if err != nil { - return fmt.Errorf("cat-file next for %s: %w", key.FullPath(), err) + return fmt.Errorf("cat-file next for blob %s: %w", id, err) + } + + queued := keysByID[id] + if len(queued) == 0 { + return fmt.Errorf("cat-file returned unexpected blob %s", id) + } + key := queued[0] + if len(queued) == 1 { + delete(keysByID, id) + } else { + keysByID[id] = queued[1:] } branches := repos[key].Branches From aff0f93146973b06542787c69fd82544a47f3473 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 9 Apr 2026 16:20:18 +0200 Subject: [PATCH 2/2] measure max rss --- gitindex/catfile.go | 27 +++++++++++++++++++++++++++ gitindex/catfile_bench_test.go | 19 ++++++++++++++++++- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/gitindex/catfile.go b/gitindex/catfile.go index 308e80f75..115d5fc2c 100644 --- a/gitindex/catfile.go +++ b/gitindex/catfile.go @@ -21,6 +21,7 @@ import ( "fmt" "io" "os/exec" + "runtime" "strconv" "sync" "syscall" @@ -69,6 +70,9 @@ type catfileReader struct { // entry. Next() discards any pending bytes before reading the next header. pending int + peakRSSBytes uint64 + hasPeakRSS bool + closeOnce sync.Once closeErr error } @@ -255,6 +259,9 @@ func (cr *catfileReader) Close() error { // Wait for writer goroutine (unblocks via broken pipe from Kill). <-cr.writeErr err := cr.cmd.Wait() + if ps := cr.cmd.ProcessState; ps != nil { + cr.peakRSSBytes, cr.hasPeakRSS = maxRSSBytes(ps.SysUsage()) + } // Suppress the expected "signal: killed" error from our own Kill(). if isKilledErr(err) { err = nil @@ -264,6 +271,26 @@ func (cr *catfileReader) Close() error { return cr.closeErr } +func (cr *catfileReader) maxRSSBytes() (uint64, bool) { + return cr.peakRSSBytes, cr.hasPeakRSS +} + +func maxRSSBytes(sysUsage any) (uint64, bool) { + rusage, ok := sysUsage.(*syscall.Rusage) + if !ok || rusage == nil || rusage.Maxrss < 0 { + return 0, false + } + + // Darwin reports ru_maxrss in bytes, while Linux reports it in KiB. + maxRSS := uint64(rusage.Maxrss) + switch runtime.GOOS { + case "linux", "android": + maxRSS *= 1024 + } + + return maxRSS, true +} + // isKilledErr reports whether err is an exec.ExitError caused by SIGKILL. func isKilledErr(err error) bool { exitErr, ok := err.(*exec.ExitError) diff --git a/gitindex/catfile_bench_test.go b/gitindex/catfile_bench_test.go index e383892f6..2a5a9088f 100644 --- a/gitindex/catfile_bench_test.go +++ b/gitindex/catfile_bench_test.go @@ -136,6 +136,8 @@ func BenchmarkBlobRead_CatfileReader(b *testing.B) { b.Run(fmt.Sprintf("files=%d/mode=%s", n, benchMode.name), func(b *testing.B) { b.ReportAllocs() var totalBytes int64 + var totalPeakRSS uint64 + var peakRSSSamples int for b.Loop() { totalBytes = 0 cr, err := newCatfileReader(gitDir, subset, catfileReaderOptions{unordered: benchMode.unordered}) @@ -158,10 +160,25 @@ func BenchmarkBlobRead_CatfileReader(b *testing.B) { } totalBytes += int64(len(content)) } - cr.Close() + // Force the child to close stdout before Close() so the recorded + // rusage reflects the fully-drained cat-file process. + if _, _, _, _, err := cr.Next(); err != io.EOF { + cr.Close() + b.Fatalf("final Next: got %v, want io.EOF", err) + } + if err := cr.Close(); err != nil { + b.Fatalf("Close: %v", err) + } + if peakRSS, ok := cr.maxRSSBytes(); ok { + totalPeakRSS += peakRSS + peakRSSSamples++ + } } b.ReportMetric(float64(totalBytes), "content-bytes/op") b.ReportMetric(float64(len(subset)), "files/op") + if peakRSSSamples > 0 { + b.ReportMetric(float64(totalPeakRSS)/float64(peakRSSSamples), "git-maxrss-bytes/op") + } }) } }