# index: 3.7x faster posting list construction via direct-indexed ASCII array (#1020)
Changes from all commits: 8029cb3, c02c841, 0b70a64, ad39bc9, 011193a
```diff
@@ -275,6 +275,11 @@ type Builder struct {
 	id string

 	finishCalled bool
+
+	// postingsPool reuses postingsBuilder instances across shard builds,
+	// retaining their map and slice allocations to avoid repeated
+	// memclr/madvise overhead.
+	postingsPool sync.Pool
 }

 type finishedShard struct {
```
```diff
@@ -984,7 +989,9 @@ func (b *Builder) buildShard(todo []*Document, nextShardNum int) (*finishedShard, error) {
 		}
 	}

-	return b.writeShard(name, shardBuilder)
+	result, err := b.writeShard(name, shardBuilder)
+	b.returnPostingsBuilders(shardBuilder)
+	return result, err
 }

 // CheckMemoryUsage checks the memory usage of the process and writes a memory profile if the heap usage exceeds the
```
```diff
@@ -1018,14 +1025,37 @@ func (b *Builder) CheckMemoryUsage() {
 	}
 }

+func (b *Builder) getPostingsBuilder() *postingsBuilder {
+	if pb, ok := b.postingsPool.Get().(*postingsBuilder); ok {
+		pb.reset()
+		return pb
+	}
+	return newPostingsBuilder(b.opts.ShardMax)
+}
+
+// returnPostingsBuilders returns both postings builders from sb to the
+// pool and nils the fields so any subsequent misuse crashes obviously.
+func (b *Builder) returnPostingsBuilders(sb *ShardBuilder) {
+	if sb.contentPostings != nil {
+		b.postingsPool.Put(sb.contentPostings)
+		sb.contentPostings = nil
+	}
+	if sb.namePostings != nil {
+		b.postingsPool.Put(sb.namePostings)
+		sb.namePostings = nil
+	}
+}
+
 func (b *Builder) newShardBuilder() (*ShardBuilder, error) {
 	desc := b.opts.RepositoryDescription
 	desc.HasSymbols = !b.opts.DisableCTags && b.opts.CTagsPath != ""
 	desc.SubRepoMap = b.opts.SubRepositories
 	desc.IndexOptions = b.opts.GetHash()

-	shardBuilder, err := NewShardBuilder(&desc)
-	if err != nil {
+	content := b.getPostingsBuilder()
+	name := b.getPostingsBuilder()
+	shardBuilder := newShardBuilderWithPostings(content, name)
+	if err := shardBuilder.setRepository(&desc); err != nil {
 		return nil, err
 	}
 	shardBuilder.IndexTime = b.indexTime
```

**Member** (on `shardBuilder.setRepository`): an error here is super rare, is it worth complicating matters and doing a put after this?

**Contributor (Author):** Agreed, removed in ad39bc9.
The second file in the diff is new (`@@ -0,0 +1,150 @@`), a benchmark for the postings construction paths:

```go
package index

import (
	"bytes"
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"testing"
)

// Set ZOEKT_BENCH_REPO to a source tree (e.g. a kubernetes checkout) to enable:
//
//	git clone --depth=1 https://github.com/kubernetes/kubernetes /tmp/k8s
//	ZOEKT_BENCH_REPO=/tmp/k8s go test ./index/ -bench=BenchmarkPostings -benchmem -count=5 -timeout=600s
```
**Member** (on `requireBenchRepo`): FYI we have some infra here to download copies of code for these sorts of tests.

**Contributor (Author):** Good pointer, thanks. Will look into aligning with that.

```go
func requireBenchRepo(b *testing.B) string {
	b.Helper()
	dir := os.Getenv("ZOEKT_BENCH_REPO")
	if dir == "" {
		b.Skip("ZOEKT_BENCH_REPO not set")
	}
	return dir
}

// loadRepoFiles walks dir and returns file contents, skipping binary files,
// empty files, and anything over 1 MB. Returns at most maxFiles entries.
func loadRepoFiles(b *testing.B, dir string, maxFiles int) [][]byte {
	b.Helper()
	var files [][]byte
	err := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return nil
		}
		if d.IsDir() {
			switch d.Name() {
			case ".git", "vendor", "node_modules":
				return filepath.SkipDir
			}
			return nil
		}
		if len(files) >= maxFiles {
			return filepath.SkipAll
		}
		info, err := d.Info()
		if err != nil || info.Size() == 0 || info.Size() > 1<<20 {
			return nil
		}
		data, err := os.ReadFile(path)
		if err != nil {
			return nil
		}
		if bytes.IndexByte(data, 0) >= 0 {
			return nil // binary
		}
		files = append(files, data)
		return nil
	})
	if err != nil {
		b.Fatalf("walking repo: %v", err)
	}
	if len(files) == 0 {
		b.Fatal("no files found in repo")
	}
	return files
}

func totalSize(files [][]byte) int64 {
	var n int64
	for _, f := range files {
		n += int64(len(f))
	}
	return n
}

// BenchmarkPostings_NewSearchableString measures the core hot path: trigram
// extraction, map lookups, delta encoding, and per-trigram slice growth.
// Sub-benchmarks vary corpus size to show scaling with map size.
func BenchmarkPostings_NewSearchableString(b *testing.B) {
	dir := requireBenchRepo(b)
	allFiles := loadRepoFiles(b, dir, 50_000)
	b.Logf("loaded %d files, %.1f MB", len(allFiles), float64(totalSize(allFiles))/(1<<20))

	for _, n := range []int{1_000, 5_000, len(allFiles)} {
		n = min(n, len(allFiles))
		files := allFiles[:n]
		size := totalSize(files)

		b.Run(fmt.Sprintf("files=%d", n), func(b *testing.B) {
			b.ReportAllocs()
			for b.Loop() {
				pb := newPostingsBuilder(defaultShardMax)
				for _, data := range files {
					_, _, _ = pb.newSearchableString(data, nil)
				}
			}
			b.ReportMetric(float64(size), "input-bytes/op")
		})
	}
}
```
**Member** (on `BenchmarkPostings_Reuse`): I don't think this accurately measures the actual impact of reuse, since we won't be calling newPostingsBuilder in a hot loop. Instead it will be called once per shard.

**Contributor (Author):** You're right: the Reuse benchmark measures allocation-reduction potential, not realistic shard-building throughput. The real end-to-end impact was measured via hyperfine on kubernetes.

```go
// BenchmarkPostings_Reuse measures the warm path: building postings with a
// reset (pooled) postingsBuilder that retains its map and slice allocations
// from a previous shard build.
func BenchmarkPostings_Reuse(b *testing.B) {
	dir := requireBenchRepo(b)
	allFiles := loadRepoFiles(b, dir, 50_000)
	size := totalSize(allFiles)
	b.Logf("loaded %d files, %.1f MB", len(allFiles), float64(size)/(1<<20))

	// Warm up the builder so it has allocated map entries and slices.
	pb := newPostingsBuilder(defaultShardMax)
	for _, data := range allFiles {
		_, _, _ = pb.newSearchableString(data, nil)
	}

	b.ResetTimer()
	b.ReportAllocs()
	for b.Loop() {
		pb.reset()
		for _, data := range allFiles {
			_, _, _ = pb.newSearchableString(data, nil)
		}
	}
	b.ReportMetric(float64(size), "input-bytes/op")
}

// BenchmarkPostings_WritePostings measures the marshaling path: sorting ngram
// keys and writing varint-encoded posting lists.
func BenchmarkPostings_WritePostings(b *testing.B) {
	dir := requireBenchRepo(b)
	allFiles := loadRepoFiles(b, dir, 50_000)

	pb := newPostingsBuilder(defaultShardMax)
	for _, data := range allFiles {
		_, _, _ = pb.newSearchableString(data, nil)
	}
	b.Logf("built %d unique ngrams from %d files, %.1f MB", len(pb.postings), len(allFiles), float64(totalSize(allFiles))/(1<<20))

	buf := &bytes.Buffer{}
	b.ResetTimer()
	b.ReportAllocs()
	for b.Loop() {
		buf.Reset()
		w := &writer{w: buf}
		var ngramText, charOffsets, endRunes simpleSection
		var postings compoundSection
		writePostings(w, pb, &ngramText, &charOffsets, &postings, &endRunes)
	}
}
```
**Member:**

I'm kinda surprised this pool has a hit rate. Under what conditions did you test this? Did you have a smaller shard size?

The kubernetes repo is a handful of shards, right? With concurrency, the opportunity to reuse an older postingsBuilder before garbage collection has had a chance to free stuff from sync.Pool seems low. Maybe the GC just doesn't free as often as I would expect from sync.Pool.

If you are just testing this with all the normal defaults (including GOGC being on, etc.) and it's working, cool, let's use this optimization. If you are tweaking the defaults somehow, then we could probably do better maintaining our own free list instead of relying on sync.Pool.
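For reference, here is a minimal sketch of the hand-rolled free list the reviewer alludes to, written as a bounded buffered channel. This is hypothetical code, not part of the PR; the `builder` type stands in for `postingsBuilder`. Unlike `sync.Pool`, entries held by the channel are never cleared by the GC:

```go
package main

import "fmt"

// builder stands in for postingsBuilder: an object whose internal
// allocations are expensive to rebuild from scratch.
type builder struct{ postings map[uint64][]byte }

// freeList is a bounded free list. Entries are pinned by the channel
// buffer, so they survive GC cycles until explicitly dropped.
type freeList chan *builder

func (f freeList) get() *builder {
	select {
	case b := <-f:
		return b // hit: reuse a retained builder
	default:
		return &builder{postings: make(map[uint64][]byte)} // miss: allocate fresh
	}
}

func (f freeList) put(b *builder) {
	select {
	case f <- b:
	default: // list full: drop it and let the GC reclaim the memory
	}
}

func main() {
	fl := make(freeList, 4) // capacity roughly matching build parallelism
	b1 := fl.get()          // miss: allocates
	fl.put(b1)
	b2 := fl.get() // hit: the same builder comes back
	fmt.Println("reused:", b1 == b2)
}
```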
**Contributor (Author):**

Default `Parallelism` is 4, so up to 4 shards build concurrently. sync.Pool has per-P caching: each goroutine gets/puts on its own P's cache without contention. After a goroutine finishes and Puts, the next goroutine scheduled on that P Gets the cached builder. Worst case (GC clears the pool between shards) is a fresh allocation, identical to having no pool.

Tested with normal defaults (GOGC=100, no tuning).
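A tiny runnable illustration (not from the PR) of the two properties claimed above: with no GC in between, a Put is served back by the next Get on the same P, and an empty pool simply yields nil so the caller falls back to a fresh allocation:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	var pool sync.Pool

	x := &struct{ n int }{42}
	pool.Put(x)

	// Served from the P-local cache: no locking, same object back.
	fmt.Println("hit:", pool.Get() == x) // true

	// Worst case: the pool is empty, Get returns nil, and the caller
	// allocates fresh, which is exactly the no-pool status quo.
	fmt.Println("miss:", pool.Get()) // <nil>
}
```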
**Member:**

Do you get hits outside of benchmarks though? Mind testing end-to-end with something like zoekt-git-index (or some other zoekt binary) and adding a log line on whether we successfully get a value out of this sync.Pool?

I agree this is a useful optimization, especially for much larger repos or when the shard size is tuned down so we have more shards. However, I suspect that in practice sync.Pool has a bad hit rate here, since GC would have run by the time sync.Pool.Get is called, so we would need to implement our own free list rather than just plugging in sync.Pool to get the benefit in practice.

But this is all a suspicion of mine; it would be great to get confirmation.
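A self-contained sketch of the kind of instrumentation being asked for (names are illustrative, not the PR's code): wrap Get with hit/miss counters and report them, then run the binary as usual:

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type postingsBuilder struct{ buf []byte }

var (
	pool   sync.Pool
	hits   atomic.Int64
	misses atomic.Int64
)

// getPostingsBuilder mirrors the shape of the PR's accessor, with
// counters added so the hit rate can be logged end-to-end.
func getPostingsBuilder() *postingsBuilder {
	if pb, ok := pool.Get().(*postingsBuilder); ok {
		hits.Add(1)
		return pb
	}
	misses.Add(1)
	return &postingsBuilder{buf: make([]byte, 0, 1<<20)}
}

func main() {
	for i := 0; i < 8; i++ {
		pb := getPostingsBuilder()
		pool.Put(pb)
	}
	total := hits.Load() + misses.Load()
	fmt.Printf("postingsPool: hits=%d misses=%d (%.0f%% hit rate)\n",
		hits.Load(), misses.Load(), 100*float64(hits.Load())/float64(total))
}
```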
**Contributor (Author):**

Instrumented `getPostingsBuilder` with a log line and tested end-to-end with `zoekt-git-index` on kubernetes (23K files, 162 MB). Your suspicion is partially confirmed: at the default shard size (a few large shards) the hit rate is poor, but with smaller shards it reaches 59%, and the pool never hurts (a miss is a fresh allocation, i.e. the status quo).

Since Go 1.13, `sync.Pool` uses a victim cache that survives one extra GC cycle, so objects aren't cleared immediately; they get roughly two cycles. The 29% hit rate at 100 MB shards reflects that shard builds are still long enough for two or more GC cycles to pass. I think keeping the pool is the right call.
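The victim-cache behavior is easy to observe directly; a minimal runnable demonstration (not from the PR, and relying on documented-as-unspecified runtime behavior that has held since Go 1.13):

```go
package main

import (
	"fmt"
	"runtime"
	"sync"
)

func main() {
	var pool sync.Pool

	pool.Put(make([]byte, 1<<20))
	runtime.GC() // first cycle: the value moves to the victim cache
	fmt.Println("after 1 GC:", pool.Get() != nil) // usually true: still retrievable

	pool.Put(make([]byte, 1<<20))
	runtime.GC()
	runtime.GC() // second cycle: the victim cache is cleared too
	fmt.Println("after 2 GCs:", pool.Get() != nil) // usually false: fresh allocation needed
}
```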
Separately, I added a sparse index (`asciiPopulated []uint32`) that tracks the populated ASCII array slots. This makes `reset()` and `writePostings` iterate only the ~275K populated entries instead of scanning all 2M slots, while still retaining all postingList allocations for pool reuse. The warm-path benchmark confirms identical B/op (0.55 MB) and allocs/op (22,978) to the original.
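For readers without the full diff in front of them, here is a simplified sketch of the direct-indexed ASCII array plus sparse population index described above. It is illustrative only; the field names and layout follow the comment, not necessarily the PR's exact code. Three ASCII bytes pack into a 21-bit key, giving 128^3 (about 2M) slots, and `populated` records which slots are in use so reset and iteration skip the rest:

```go
package main

import "fmt"

const asciiSlots = 128 * 128 * 128 // one slot per ASCII trigram, ~2M total

// asciiPostings replaces a map[ngram][]byte for pure-ASCII trigrams with a
// direct-indexed table; non-ASCII trigrams would still go through a map.
type asciiPostings struct {
	lists     [][]byte // posting list per packed trigram; the index is the key
	populated []uint32 // packed keys of slots holding data, in first-use order
}

func newASCIIPostings() *asciiPostings {
	return &asciiPostings{lists: make([][]byte, asciiSlots)}
}

// key packs three ASCII bytes into a single array index, 7 bits each.
func key(a, b, c byte) uint32 { return uint32(a)<<14 | uint32(b)<<7 | uint32(c) }

// add appends delta-encoded bytes (varints, so always non-empty) for one trigram.
func (p *asciiPostings) add(a, b, c byte, delta []byte) {
	k := key(a, b, c)
	if len(p.lists[k]) == 0 { // first posting for this trigram since the last reset
		p.populated = append(p.populated, k)
	}
	p.lists[k] = append(p.lists[k], delta...)
}

// reset truncates only the populated slots, keeping their backing arrays so a
// pooled builder reuses prior allocations instead of scanning all 2M slots.
func (p *asciiPostings) reset() {
	for _, k := range p.populated {
		p.lists[k] = p.lists[k][:0]
	}
	p.populated = p.populated[:0]
}

func main() {
	p := newASCIIPostings()
	p.add('f', 'o', 'o', []byte{1})
	p.add('b', 'a', 'r', []byte{2})
	fmt.Println("populated slots:", len(p.populated)) // 2, not 2M
	p.reset()
	fmt.Println("after reset:", len(p.populated)) // 0; slot capacity retained
}
```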