Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions gitindex/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,8 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]BlobLocation, opts Options, builder *index.Builder) error {
defer cr.Close()

slab := newContentSlab(16 << 20) // 16 MB per slab

for idx, key := range keys {
size, missing, excluded, err := cr.Next()
if err != nil {
Expand All @@ -707,10 +709,7 @@ func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]Blob
// Skip without reading content into memory.
doc = skippedDoc(key, branches, index.SkipReasonTooLarge)
} else {
// Pre-allocate and read the full blob content in one call.
// io.ReadFull is preferred over io.LimitedReader here as it
// avoids the intermediate allocation and the size is known.
content := make([]byte, size)
content := slab.alloc(size)
if _, err := io.ReadFull(cr, content); err != nil {
return fmt.Errorf("read blob %s: %w", keyFullPath, err)
}
Expand Down
32 changes: 32 additions & 0 deletions gitindex/slab.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package gitindex

// contentSlab reduces per-file heap allocations by sub-slicing from a
// shared buffer. Each returned slice has its capacity capped (3-index
// slice) so appending to one file's content cannot overwrite adjacent
// data. Files larger than the slab get their own allocation.
type contentSlab struct {
buf []byte
cap int
}

func newContentSlab(slabCap int) contentSlab {
return contentSlab{
buf: make([]byte, 0, slabCap),
cap: slabCap,
}
}

// alloc returns a byte slice of length n. The caller must write into it
// immediately (the bytes are uninitialized when sourced from the slab).
func (s *contentSlab) alloc(n int) []byte {
if n > s.cap {
return make([]byte, n)
}
if len(s.buf)+n > cap(s.buf) {
s.buf = make([]byte, n, s.cap)
return s.buf[:n:n]
}
off := len(s.buf)
s.buf = s.buf[:off+n]
return s.buf[off : off+n : off+n]
}
72 changes: 72 additions & 0 deletions gitindex/slab_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package gitindex

import "testing"

func TestContentSlab(t *testing.T) {
t.Run("fits in slab", func(t *testing.T) {
s := newContentSlab(1024)
b := s.alloc(100)
if len(b) != 100 {
t.Fatalf("len = %d, want 100", len(b))
}
if cap(b) != 100 {
t.Fatalf("cap = %d, want 100 (3-index slice)", cap(b))
}
})

t.Run("cap is capped so append cannot corrupt adjacent data", func(t *testing.T) {
s := newContentSlab(1024)
a := s.alloc(10)
copy(a, []byte("aaaaaaaaaa"))

b := s.alloc(10)
copy(b, []byte("bbbbbbbbbb"))

// Appending to a must not overwrite b.
a = append(a, 'X') // triggers new backing array since cap==len
if string(b) != "bbbbbbbbbb" {
t.Fatalf("adjacent data corrupted: got %q", b)
}
_ = a
})

t.Run("slab rollover", func(t *testing.T) {
s := newContentSlab(64)
a := s.alloc(60)
if len(a) != 60 || cap(a) != 60 {
t.Fatalf("a: len=%d cap=%d", len(a), cap(a))
}
// Next alloc doesn't fit in remaining 4 bytes → new slab.
b := s.alloc(10)
if len(b) != 10 || cap(b) != 10 {
t.Fatalf("b: len=%d cap=%d", len(b), cap(b))
}
// a and b should not share backing arrays.
copy(a, make([]byte, 60))
copy(b, []byte("0123456789"))
if string(b) != "0123456789" {
t.Fatal("rollover corrupted data")
}
})

t.Run("oversized allocation", func(t *testing.T) {
s := newContentSlab(64)
b := s.alloc(128)
if len(b) != 128 {
t.Fatalf("len = %d, want 128", len(b))
}
// Oversized alloc should not consume slab space.
c := s.alloc(32)
if len(c) != 32 || cap(c) != 32 {
t.Fatalf("c: len=%d cap=%d", len(c), cap(c))
}
})

t.Run("zero size", func(t *testing.T) {
s := newContentSlab(64)
b := s.alloc(0)
if len(b) != 0 {
t.Fatalf("len = %d, want 0", len(b))
}
})
}
13 changes: 9 additions & 4 deletions index/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ import (

"github.com/bmatcuk/doublestar"
"github.com/dustin/go-humanize"
"github.com/go-enry/go-enry/v2"
"github.com/rs/xid"
"golang.org/x/sys/unix"

Expand Down Expand Up @@ -625,6 +624,11 @@ func (b *Builder) Add(doc Document) error {
doc.SkipReason = skip
}

// Pre-compute file category and language while content is still
// available, before content is dropped for skipped documents.
DetermineFileCategory(&doc)
Comment thread
clemlesne marked this conversation as resolved.
DetermineLanguageIfUnknown(&doc)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a bit subtle, but previously in ShardBuilder.Add we would call this before determining category. However, I believe this is the better order to do it in since determine language has a fast path for docs with reason skip. Nice little fix


b.todo = append(b.todo, &doc)

if doc.SkipReason == SkipReasonNone {
Expand Down Expand Up @@ -888,18 +892,19 @@ func rank(d *Document, origIdx int) []float64 {
skipped = 1.0
}

// Use pre-computed Category from DetermineFileCategory.
generated := 0.0
if enry.IsGenerated(d.Name, d.Content) {
if d.Category == FileCategoryGenerated {
generated = 1.0
}

vendor := 0.0
if enry.IsVendor(d.Name) {
if d.Category == FileCategoryVendored {
vendor = 1.0
}

test := 0.0
if enry.IsTest(d.Name) {
if d.Category == FileCategoryTest {
test = 1.0
}

Expand Down
3 changes: 3 additions & 0 deletions index/builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1084,6 +1084,9 @@ func testFileRankAspect(t *testing.T, c filerankCase) {

got := make([]*Document, len(c.docs))
copy(got, c.docs)
for _, d := range got {
DetermineFileCategory(d)
}
sortDocuments(got)

print := func(ds []*Document) string {
Expand Down
2 changes: 0 additions & 2 deletions index/ctags.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,6 @@ func parseSymbols(todo []*Document, languageMap ctags.LanguageMap, parserBins ct
continue
}

DetermineLanguageIfUnknown(doc)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm guessing this was a pointless check?


parserType := languageMap[normalizeLanguage(doc.Language)]
if parserType == ctags.NoCTags {
continue
Expand Down
23 changes: 18 additions & 5 deletions index/shard_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,8 +235,15 @@ func (s *postingsBuilder) newSearchableString(data []byte, byteSections []Docume
s.postings[ng] = pl
}
}
m := binary.PutUvarint(buf[:], uint64(newOff-pl.lastOff))
pl.data = append(pl.data, buf[:m]...)
delta := uint64(newOff - pl.lastOff)
if delta < 0x80 {
// Single-byte varint fast path: ~80% of deltas are < 128.
// append(slice, byte) is cheaper than append(slice, slice...).
pl.data = append(pl.data, byte(delta))
} else {
m := binary.PutUvarint(buf[:], delta)
pl.data = append(pl.data, buf[:m]...)
}
pl.lastOff = newOff
}
s.runeCount += runeIndex
Expand Down Expand Up @@ -536,8 +543,12 @@ func DetermineLanguageIfUnknown(doc *Document) {

// Add a file which only occurs in certain branches.
func (b *ShardBuilder) Add(doc Document) error {
if index := bytes.IndexByte(doc.Content, 0); index > 0 {
doc.SkipReason = SkipReasonBinary
// Skip binary check if already computed (e.g., by Builder.Add
// which calls DocChecker.Check before docs reach buildShard).
if doc.Category == FileCategoryMissing {
if index := bytes.IndexByte(doc.Content, 0); index > 0 {
doc.SkipReason = SkipReasonBinary
}
}

if doc.SkipReason != SkipReasonNone {
Expand All @@ -547,7 +558,9 @@ func (b *ShardBuilder) Add(doc Document) error {
}

DetermineLanguageIfUnknown(&doc)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should do this after determine file category like I commented above. I will send a PR for that to not block this PR any more :)

DetermineFileCategory(&doc)
if doc.Category == FileCategoryMissing {
DetermineFileCategory(&doc)
}

sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
var last DocumentSection
Expand Down
Loading