# index: 3.7x faster posting list construction via direct-indexed ASCII array (#1020)
Changes from all commits: 8029cb3, c02c841, 0b70a64, ad39bc9, 011193a
```diff
@@ -275,6 +275,11 @@ type Builder struct {
 	id string

 	finishCalled bool
+
+	// postingsPool reuses postingsBuilder instances across shard builds,
+	// retaining their map and slice allocations to avoid repeated
+	// memclr/madvise overhead.
+	postingsPool sync.Pool
 }

 type finishedShard struct {
```
```diff
@@ -984,7 +989,9 @@ func (b *Builder) buildShard(todo []*Document, nextShardNum int) (*finishedShard, error) {
 		}
 	}

-	return b.writeShard(name, shardBuilder)
+	result, err := b.writeShard(name, shardBuilder)
+	b.returnPostingsBuilders(shardBuilder)
+	return result, err
 }

 // CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
```
```diff
@@ -1018,14 +1025,37 @@ func (b *Builder) CheckMemoryUsage() {
 	}
 }

+func (b *Builder) getPostingsBuilder() *postingsBuilder {
+	if pb, ok := b.postingsPool.Get().(*postingsBuilder); ok {
+		pb.reset()
+		return pb
+	}
+	return newPostingsBuilder(b.opts.ShardMax)
+}
+
+// returnPostingsBuilders returns both postings builders from sb to the
+// pool and nils the fields so any subsequent misuse crashes obviously.
+func (b *Builder) returnPostingsBuilders(sb *ShardBuilder) {
+	if sb.contentPostings != nil {
+		b.postingsPool.Put(sb.contentPostings)
+		sb.contentPostings = nil
+	}
+	if sb.namePostings != nil {
+		b.postingsPool.Put(sb.namePostings)
+		sb.namePostings = nil
+	}
+}
+
 func (b *Builder) newShardBuilder() (*ShardBuilder, error) {
 	desc := b.opts.RepositoryDescription
 	desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
 	desc.SubRepoMap = b.opts.SubRepositories
 	desc.IndexOptions = b.opts.GetHash()

-	shardBuilder, err := NewShardBuilder(&desc)
-	if err != nil {
+	content := b.getPostingsBuilder()
+	name := b.getPostingsBuilder()
+	shardBuilder := newShardBuilderWithPostings(content, name)
+	if err := shardBuilder.setRepository(&desc); err != nil {
 		return nil, err
 	}
 	shardBuilder.IndexTime = b.indexTime
```

**Member** (on `shardBuilder.setRepository`): an error here is super rare, is it worth complicating matters and doing a put after this?

**Contributor (Author):** Agreed, removed in ad39bc9.
The second file in the diff is new (`@@ -0,0 +1,150 @@`), a benchmark for the postings construction paths:

```go
package index

import (
	"bytes"
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"testing"
)

// Set ZOEKT_BENCH_REPO to a source tree (e.g. a kubernetes checkout) to enable:
//
//	git clone --depth=1 https://github.com/kubernetes/kubernetes /tmp/k8s
//	ZOEKT_BENCH_REPO=/tmp/k8s go test ./index/ -bench=BenchmarkPostings -benchmem -count=5 -timeout=600s
```
**Member** (on `requireBenchRepo`): FYI we have some infra here to download copies of code for these sorts of tests.

**Contributor (Author):** Good pointer, thanks. Will look into aligning with that.

```go
func requireBenchRepo(b *testing.B) string {
	b.Helper()
	dir := os.Getenv("ZOEKT_BENCH_REPO")
	if dir == "" {
		b.Skip("ZOEKT_BENCH_REPO not set")
	}
	return dir
}

// loadRepoFiles walks dir and returns file contents, skipping binary files,
// empty files, and anything over 1 MB. Returns at most maxFiles entries.
func loadRepoFiles(b *testing.B, dir string, maxFiles int) [][]byte {
	b.Helper()
	var files [][]byte
	err := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return nil
		}
		if d.IsDir() {
			switch d.Name() {
			case ".git", "vendor", "node_modules":
				return filepath.SkipDir
			}
			return nil
		}
		if len(files) >= maxFiles {
			return filepath.SkipAll
		}
		info, err := d.Info()
		if err != nil || info.Size() == 0 || info.Size() > 1<<20 {
			return nil
		}
		data, err := os.ReadFile(path)
		if err != nil {
			return nil
		}
		if bytes.IndexByte(data, 0) >= 0 {
			return nil // binary
		}
		files = append(files, data)
		return nil
	})
	if err != nil {
		b.Fatalf("walking repo: %v", err)
	}
	if len(files) == 0 {
		b.Fatal("no files found in repo")
	}
	return files
}

func totalSize(files [][]byte) int64 {
	var n int64
	for _, f := range files {
		n += int64(len(f))
	}
	return n
}

// BenchmarkPostings_NewSearchableString measures the core hot path: trigram
// extraction, map lookups, delta encoding, and per-trigram slice growth.
// Sub-benchmarks vary corpus size to show scaling with map size.
func BenchmarkPostings_NewSearchableString(b *testing.B) {
	dir := requireBenchRepo(b)
	allFiles := loadRepoFiles(b, dir, 50_000)
	b.Logf("loaded %d files, %.1f MB", len(allFiles), float64(totalSize(allFiles))/(1<<20))

	for _, n := range []int{1_000, 5_000, len(allFiles)} {
		n = min(n, len(allFiles))
		files := allFiles[:n]
		size := totalSize(files)

		b.Run(fmt.Sprintf("files=%d", n), func(b *testing.B) {
			b.ReportAllocs()
			for b.Loop() {
				pb := newPostingsBuilder(defaultShardMax)
				for _, data := range files {
					_, _, _ = pb.newSearchableString(data, nil)
				}
			}
			b.ReportMetric(float64(size), "input-bytes/op")
		})
	}
}
```
**Member** (on `BenchmarkPostings_Reuse`): I don't think this accurately measures the actual impact of reuse, since we won't be calling newPostingsBuilder in a hot loop. Instead it will be called once per shard.

**Contributor (Author):** You're right: the Reuse benchmark measures allocation-reduction potential, not realistic shard-building throughput. The real end-to-end impact was measured via hyperfine on kubernetes.

```go
// BenchmarkPostings_Reuse measures the warm path: building postings with a
// reset (pooled) postingsBuilder that retains its map and slice allocations
// from a previous shard build.
func BenchmarkPostings_Reuse(b *testing.B) {
	dir := requireBenchRepo(b)
	allFiles := loadRepoFiles(b, dir, 50_000)
	size := totalSize(allFiles)
	b.Logf("loaded %d files, %.1f MB", len(allFiles), float64(size)/(1<<20))

	// Warm up the builder so it has allocated map entries and slices.
	pb := newPostingsBuilder(defaultShardMax)
	for _, data := range allFiles {
		_, _, _ = pb.newSearchableString(data, nil)
	}

	b.ResetTimer()
	b.ReportAllocs()
	for b.Loop() {
		pb.reset()
		for _, data := range allFiles {
			_, _, _ = pb.newSearchableString(data, nil)
		}
	}
	b.ReportMetric(float64(size), "input-bytes/op")
}

// BenchmarkPostings_WritePostings measures the marshaling path: sorting ngram
// keys and writing varint-encoded posting lists.
func BenchmarkPostings_WritePostings(b *testing.B) {
	dir := requireBenchRepo(b)
	allFiles := loadRepoFiles(b, dir, 50_000)

	pb := newPostingsBuilder(defaultShardMax)
	for _, data := range allFiles {
		_, _, _ = pb.newSearchableString(data, nil)
	}
	b.Logf("built %d unique ngrams from %d files, %.1f MB", len(pb.postings), len(allFiles), float64(totalSize(allFiles))/(1<<20))

	buf := &bytes.Buffer{}
	b.ResetTimer()
	b.ReportAllocs()
	for b.Loop() {
		buf.Reset()
		w := &writer{w: buf}
		var ngramText, charOffsets, endRunes simpleSection
		var postings compoundSection
		writePostings(w, pb, &ngramText, &charOffsets, &postings, &endRunes)
	}
}
```
**Member:**

I'm kinda surprised this pool has a hit rate. Under what conditions did you test this? Did you have a smaller shard size?

The kubernetes repo is a handful of shards, right? With concurrency, the opportunity to reuse an older postingsBuilder before garbage collection has had a chance to free stuff from sync.Pool seems low. Maybe the GC just doesn't free as often as I would expect from sync.Pool.

If you are just testing this with all the normal defaults (including GOGC being on, etc.) and it's working, cool, let's use this optimization. If you are tweaking the defaults somehow, then we could probably do better maintaining our own free list instead of relying on sync.Pool.
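For reference, here is a minimal sketch of the hand-rolled free list the reviewer alludes to, written as a bounded buffered channel. This is hypothetical code, not part of the PR; the `builder` type stands in for `postingsBuilder`. Unlike `sync.Pool`, entries held by the channel are never cleared by the GC:

```go
package main

import "fmt"

// builder stands in for postingsBuilder: an object whose internal
// allocations are expensive to rebuild from scratch.
type builder struct{ postings map[uint64][]byte }

// freeList is a bounded free list. Entries are pinned by the channel
// buffer, so they survive GC cycles until explicitly dropped.
type freeList chan *builder

func (f freeList) get() *builder {
	select {
	case b := <-f:
		return b // hit: reuse a retained builder
	default:
		return &builder{postings: make(map[uint64][]byte)} // miss: allocate fresh
	}
}

func (f freeList) put(b *builder) {
	select {
	case f <- b:
	default: // list full: drop it and let the GC reclaim the memory
	}
}

func main() {
	fl := make(freeList, 4) // capacity roughly matching build parallelism
	b1 := fl.get()          // miss: allocates
	fl.put(b1)
	b2 := fl.get() // hit: the same builder comes back
	fmt.Println("reused:", b1 == b2)
}
```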
**Contributor (Author):**

Default `Parallelism` is 4, so up to 4 shards build concurrently. sync.Pool has per-P caching: each goroutine gets/puts on its own P's cache without contention. After a goroutine finishes and Puts, the next goroutine scheduled on that P Gets the cached builder. Worst case (GC clears the pool between shards) is a fresh allocation, identical to having no pool.

Tested with normal defaults (GOGC=100, no tuning).
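A tiny runnable illustration (not from the PR) of the two properties claimed above: with no GC in between, a Put is served back by the next Get on the same P, and an empty pool simply yields nil so the caller falls back to a fresh allocation:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	var pool sync.Pool

	x := &struct{ n int }{42}
	pool.Put(x)

	// Served from the P-local cache: no locking, same object back.
	fmt.Println("hit:", pool.Get() == x) // true

	// Worst case: the pool is empty, Get returns nil, and the caller
	// allocates fresh, which is exactly the no-pool status quo.
	fmt.Println("miss:", pool.Get()) // <nil>
}
```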
**Member:**

Do you get hits outside of benchmarks though? Mind testing end-to-end with something like zoekt-git-index (or some other zoekt binary) and adding a log line on whether we successfully get a value out of this sync.Pool?

I agree this is a useful optimization, especially for much larger repos or when the shard size is tuned down so we have more shards. However, I suspect that in practice sync.Pool has a bad hit rate here, since GC would have run by the time sync.Pool.Get is called, so we would need to implement our own free list rather than just plugging in sync.Pool to get the benefit in practice.

But this is all a suspicion of mine; it would be great to get confirmation.
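A self-contained sketch of the kind of instrumentation being asked for (names are illustrative, not the PR's code): wrap Get with hit/miss counters and report them, then run the binary as usual:

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type postingsBuilder struct{ buf []byte }

var (
	pool   sync.Pool
	hits   atomic.Int64
	misses atomic.Int64
)

// getPostingsBuilder mirrors the shape of the PR's accessor, with
// counters added so the hit rate can be logged end-to-end.
func getPostingsBuilder() *postingsBuilder {
	if pb, ok := pool.Get().(*postingsBuilder); ok {
		hits.Add(1)
		return pb
	}
	misses.Add(1)
	return &postingsBuilder{buf: make([]byte, 0, 1<<20)}
}

func main() {
	for i := 0; i < 8; i++ {
		pb := getPostingsBuilder()
		pool.Put(pb)
	}
	total := hits.Load() + misses.Load()
	fmt.Printf("postingsPool: hits=%d misses=%d (%.0f%% hit rate)\n",
		hits.Load(), misses.Load(), 100*float64(hits.Load())/float64(total))
}
```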
**Contributor (Author):**

Instrumented `getPostingsBuilder` with a log line and tested end-to-end with `zoekt-git-index` on kubernetes (23K files, 162 MB). Your suspicion is partially confirmed: at the default shard size (a few large shards) the hit rate is poor, but with smaller shards it reaches 59%, and the pool never hurts (a miss is a fresh allocation, i.e. the status quo).

Since Go 1.13, `sync.Pool` uses a victim cache that survives one extra GC cycle, so objects aren't cleared immediately; they get roughly two cycles. The 29% hit rate at 100 MB shards reflects that shard builds are still long enough for two or more GC cycles to pass. I think keeping the pool is the right call.
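The victim-cache behavior is easy to observe directly; a minimal runnable demonstration (not from the PR, and relying on documented-as-unspecified runtime behavior that has held since Go 1.13):

```go
package main

import (
	"fmt"
	"runtime"
	"sync"
)

func main() {
	var pool sync.Pool

	pool.Put(make([]byte, 1<<20))
	runtime.GC() // first cycle: the value moves to the victim cache
	fmt.Println("after 1 GC:", pool.Get() != nil) // usually true: still retrievable

	pool.Put(make([]byte, 1<<20))
	runtime.GC()
	runtime.GC() // second cycle: the victim cache is cleared too
	fmt.Println("after 2 GCs:", pool.Get() != nil) // usually false: fresh allocation needed
}
```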
Separately, I added a sparse index (`asciiPopulated []uint32`) that tracks the populated ASCII array slots. This makes `reset()` and `writePostings` iterate only the ~275K populated entries instead of scanning all 2M slots, while still retaining all postingList allocations for pool reuse. The warm-path benchmark confirms identical B/op (0.55 MB) and allocs/op (22,978) to the original.
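For readers without the full diff in front of them, here is a simplified sketch of the direct-indexed ASCII array plus sparse population index described above. It is illustrative only; the field names and layout follow the comment, not necessarily the PR's exact code. Three ASCII bytes pack into a 21-bit key, giving 128^3 (about 2M) slots, and `populated` records which slots are in use so reset and iteration skip the rest:

```go
package main

import "fmt"

const asciiSlots = 128 * 128 * 128 // one slot per ASCII trigram, ~2M total

// asciiPostings replaces a map[ngram][]byte for pure-ASCII trigrams with a
// direct-indexed table; non-ASCII trigrams would still go through a map.
type asciiPostings struct {
	lists     [][]byte // posting list per packed trigram; the index is the key
	populated []uint32 // packed keys of slots holding data, in first-use order
}

func newASCIIPostings() *asciiPostings {
	return &asciiPostings{lists: make([][]byte, asciiSlots)}
}

// key packs three ASCII bytes into a single array index, 7 bits each.
func key(a, b, c byte) uint32 { return uint32(a)<<14 | uint32(b)<<7 | uint32(c) }

// add appends delta-encoded bytes (varints, so always non-empty) for one trigram.
func (p *asciiPostings) add(a, b, c byte, delta []byte) {
	k := key(a, b, c)
	if len(p.lists[k]) == 0 { // first posting for this trigram since the last reset
		p.populated = append(p.populated, k)
	}
	p.lists[k] = append(p.lists[k], delta...)
}

// reset truncates only the populated slots, keeping their backing arrays so a
// pooled builder reuses prior allocations instead of scanning all 2M slots.
func (p *asciiPostings) reset() {
	for _, k := range p.populated {
		p.lists[k] = p.lists[k][:0]
	}
	p.populated = p.populated[:0]
}

func main() {
	p := newASCIIPostings()
	p.add('f', 'o', 'o', []byte{1})
	p.add('b', 'a', 'r', []byte{2})
	fmt.Println("populated slots:", len(p.populated)) // 2, not 2M
	p.reset()
	fmt.Println("after reset:", len(p.populated)) // 0; slot capacity retained
}
```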