sourcegraph · keegancsmith · Apr 10, 2026 · Apr 1, 2026 · Apr 9, 2026 · keegancsmith
diff --git a/gitindex/index.go b/gitindex/index.go
@@ -685,6 +685,8 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
 func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]BlobLocation, opts Options, builder *index.Builder) error {
 	defer cr.Close()
 
+	slab := newContentSlab(16 << 20) // 16 MB per slab
+
 	for idx, key := range keys {
 		size, missing, excluded, err := cr.Next()
 		if err != nil {
@@ -707,10 +709,7 @@ func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]Blob
 				// Skip without reading content into memory.
 				doc = skippedDoc(key, branches, index.SkipReasonTooLarge)
 			} else {
-				// Pre-allocate and read the full blob content in one call.
-				// io.ReadFull is preferred over io.LimitedReader here as it
-				// avoids the intermediate allocation and the size is known.
-				content := make([]byte, size)
+				content := slab.alloc(size)
 				if _, err := io.ReadFull(cr, content); err != nil {
 					return fmt.Errorf("read blob %s: %w", keyFullPath, err)
 				}

diff --git a/gitindex/slab.go b/gitindex/slab.go
@@ -0,0 +1,32 @@
+package gitindex
+
+// contentSlab reduces per-file heap allocations by sub-slicing from a
+// shared buffer. Each returned slice has its capacity capped (3-index
+// slice) so appending to one file's content cannot overwrite adjacent
+// data. Files larger than the slab get their own allocation.
+type contentSlab struct {
+	buf []byte
+	cap int
+}
+
+func newContentSlab(slabCap int) contentSlab {
+	return contentSlab{
+		buf: make([]byte, 0, slabCap),
+		cap: slabCap,
+	}
+}
+
+// alloc returns a byte slice of length n. The caller must write into it
+// immediately (the bytes are uninitialized when sourced from the slab).
+func (s *contentSlab) alloc(n int) []byte {
+	if n > s.cap {
+		return make([]byte, n)
+	}
+	if len(s.buf)+n > cap(s.buf) {
+		s.buf = make([]byte, n, s.cap)
+		return s.buf[:n:n]
+	}
+	off := len(s.buf)
+	s.buf = s.buf[:off+n]
+	return s.buf[off : off+n : off+n]
+}
diff --git a/gitindex/slab_test.go b/gitindex/slab_test.go
@@ -0,0 +1,72 @@
+package gitindex
+
+import "testing"
+
+func TestContentSlab(t *testing.T) {
+	t.Run("fits in slab", func(t *testing.T) {
+		s := newContentSlab(1024)
+		b := s.alloc(100)
+		if len(b) != 100 {
+			t.Fatalf("len = %d, want 100", len(b))
+		}
+		if cap(b) != 100 {
+			t.Fatalf("cap = %d, want 100 (3-index slice)", cap(b))
+		}
+	})
+
+	t.Run("cap is capped so append cannot corrupt adjacent data", func(t *testing.T) {
+		s := newContentSlab(1024)
+		a := s.alloc(10)
+		copy(a, []byte("aaaaaaaaaa"))
+
+		b := s.alloc(10)
+		copy(b, []byte("bbbbbbbbbb"))
+
+		// Appending to a must not overwrite b.
+		a = append(a, 'X') // triggers new backing array since cap==len
+		if string(b) != "bbbbbbbbbb" {
+			t.Fatalf("adjacent data corrupted: got %q", b)
+		}
+		_ = a
+	})
+
+	t.Run("slab rollover", func(t *testing.T) {
+		s := newContentSlab(64)
+		a := s.alloc(60)
+		if len(a) != 60 || cap(a) != 60 {
+			t.Fatalf("a: len=%d cap=%d", len(a), cap(a))
+		}
+		// Next alloc doesn't fit in remaining 4 bytes → new slab.
+		b := s.alloc(10)
+		if len(b) != 10 || cap(b) != 10 {
+			t.Fatalf("b: len=%d cap=%d", len(b), cap(b))
+		}
+		// a and b should not share backing arrays.
+		copy(a, make([]byte, 60))
+		copy(b, []byte("0123456789"))
+		if string(b) != "0123456789" {
+			t.Fatal("rollover corrupted data")
+		}
+	})
+
+	t.Run("oversized allocation", func(t *testing.T) {
+		s := newContentSlab(64)
+		b := s.alloc(128)
+		if len(b) != 128 {
+			t.Fatalf("len = %d, want 128", len(b))
+		}
+		// Oversized alloc should not consume slab space.
+		c := s.alloc(32)
+		if len(c) != 32 || cap(c) != 32 {
+			t.Fatalf("c: len=%d cap=%d", len(c), cap(c))
+		}
+	})
+
+	t.Run("zero size", func(t *testing.T) {
+		s := newContentSlab(64)
+		b := s.alloc(0)
+		if len(b) != 0 {
+			t.Fatalf("len = %d, want 0", len(b))
+		}
+	})
+}
diff --git a/index/builder.go b/index/builder.go
@@ -37,7 +37,6 @@ import (
 
 	"github.com/bmatcuk/doublestar"
 	"github.com/dustin/go-humanize"
-	"github.com/go-enry/go-enry/v2"
 	"github.com/rs/xid"
 	"golang.org/x/sys/unix"
 
@@ -625,6 +624,11 @@ func (b *Builder) Add(doc Document) error {
 		doc.SkipReason = skip
 	}
 
+	// Pre-compute file category and language while content is still
+	// available, before content is dropped for skipped documents.
+	DetermineFileCategory(&doc)
+	DetermineLanguageIfUnknown(&doc)
+
 	b.todo = append(b.todo, &doc)
 
 	if doc.SkipReason == SkipReasonNone {
@@ -888,18 +892,19 @@ func rank(d *Document, origIdx int) []float64 {
 		skipped = 1.0
 	}
 
+	// Use pre-computed Category from DetermineFileCategory.
 	generated := 0.0
-	if enry.IsGenerated(d.Name, d.Content) {
+	if d.Category == FileCategoryGenerated {
 		generated = 1.0
 	}
 
 	vendor := 0.0
-	if enry.IsVendor(d.Name) {
+	if d.Category == FileCategoryVendored {
 		vendor = 1.0
 	}
 
 	test := 0.0
-	if enry.IsTest(d.Name) {
+	if d.Category == FileCategoryTest {
 		test = 1.0
 	}
 

diff --git a/index/builder_test.go b/index/builder_test.go
@@ -1084,6 +1084,9 @@ func testFileRankAspect(t *testing.T, c filerankCase) {
 
 	got := make([]*Document, len(c.docs))
 	copy(got, c.docs)
+	for _, d := range got {
+		DetermineFileCategory(d)
+	}
 	sortDocuments(got)
 
 	print := func(ds []*Document) string {

diff --git a/index/ctags.go b/index/ctags.go
@@ -55,8 +55,6 @@ func parseSymbols(todo []*Document, languageMap ctags.LanguageMap, parserBins ct
 			continue
 		}
 
-		DetermineLanguageIfUnknown(doc)
-
 		parserType := languageMap[normalizeLanguage(doc.Language)]
 		if parserType == ctags.NoCTags {
 			continue

diff --git a/index/shard_builder.go b/index/shard_builder.go
@@ -235,8 +235,15 @@ func (s *postingsBuilder) newSearchableString(data []byte, byteSections []Docume
 				s.postings[ng] = pl
 			}
 		}
-		m := binary.PutUvarint(buf[:], uint64(newOff-pl.lastOff))
-		pl.data = append(pl.data, buf[:m]...)
+		delta := uint64(newOff - pl.lastOff)
+		if delta < 0x80 {
+			// Single-byte varint fast path: ~80% of deltas are < 128.
+			// append(slice, byte) is cheaper than append(slice, slice...).
+			pl.data = append(pl.data, byte(delta))
+		} else {
+			m := binary.PutUvarint(buf[:], delta)
+			pl.data = append(pl.data, buf[:m]...)
+		}
 		pl.lastOff = newOff
 	}
 	s.runeCount += runeIndex
@@ -536,8 +543,12 @@ func DetermineLanguageIfUnknown(doc *Document) {
 
 // Add a file which only occurs in certain branches.
 func (b *ShardBuilder) Add(doc Document) error {
-	if index := bytes.IndexByte(doc.Content, 0); index > 0 {
-		doc.SkipReason = SkipReasonBinary
+	// Skip binary check if already computed (e.g., by Builder.Add
+	// which calls DocChecker.Check before docs reach buildShard).
+	if doc.Category == FileCategoryMissing {
+		if index := bytes.IndexByte(doc.Content, 0); index > 0 {
+			doc.SkipReason = SkipReasonBinary
+		}
 	}
 
 	if doc.SkipReason != SkipReasonNone {
@@ -547,7 +558,9 @@ func (b *ShardBuilder) Add(doc Document) error {
 	}
 
 	DetermineLanguageIfUnknown(&doc)
-	DetermineFileCategory(&doc)
+	if doc.Category == FileCategoryMissing {
+		DetermineFileCategory(&doc)
+	}
 
 	sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
 	var last DocumentSection