From 2c5222ce403158742b621f0a1521f53b92c5af9f Mon Sep 17 00:00:00 2001 From: David Ndungu Date: Thu, 26 Mar 2026 11:20:12 -0700 Subject: [PATCH] fix: enable addLeadingSpace for SentencePiece unigram models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SetSentencePiece(true) now sets addLeadingSpace=true as a persistent field on BPETokenizer, matching llama.cpp / SentencePiece default behavior. Previously addLeadingSpace was only a parameter passed through the call chain — making it a field ensures the first word always gets the ▁ prefix prepended, so tokens like ▁What are found by the Viterbi DP instead of falling back to character-level tokens. Also adds SetAddLeadingSpace() for GGUF models that override the default via tokenizer.ggml.add_space_prefix metadata. --- bpe.go | 20 ++++++++++++--- bpe_test.go | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 3 deletions(-) diff --git a/bpe.go b/bpe.go index e89f4ba..063b185 100644 --- a/bpe.go +++ b/bpe.go @@ -37,6 +37,10 @@ type BPETokenizer struct { // sentencePiece enables SentencePiece-style pre-tokenization where spaces // are replaced with ▁ (U+2581) and words are split at ▁ boundaries. sentencePiece bool + // addLeadingSpace prepends ▁ to the first word during SentencePiece + // pre-tokenization. This is true by default for SentencePiece models, + // matching llama.cpp / SentencePiece behavior. + addLeadingSpace bool // specialTokens maps special token strings to their IDs for exact matching // during encoding (e.g., "" -> 105). specialTokens map[string]int @@ -92,8 +96,9 @@ func (t *BPETokenizer) Encode(text string) ([]int, error) { } // encodeSegment tokenizes a text segment that contains no special tokens. -// addLeadingSpace controls whether SentencePiece mode prepends ▁ to the text. 
-func (t *BPETokenizer) encodeSegment(text string, addLeadingSpace bool) ([]int, error) { +// isFirstSegment indicates this is the first text segment (before any special +// tokens), which determines whether the addLeadingSpace field applies. +func (t *BPETokenizer) encodeSegment(text string, isFirstSegment bool) ([]int, error) { if text == "" { return nil, nil } @@ -101,7 +106,7 @@ func (t *BPETokenizer) encodeSegment(text string, addLeadingSpace bool) ([]int, if t.byteLevelBPE { words = t.byteLevelPreTokenize(text) } else if t.sentencePiece { - words = t.sentencePiecePreTokenize(text, addLeadingSpace) + words = t.sentencePiecePreTokenize(text, isFirstSegment && t.addLeadingSpace) } else { words = strings.Fields(text) } @@ -276,6 +281,15 @@ func (t *BPETokenizer) SpecialTokens() SpecialTokens { // are replaced with ▁ (U+2581) and the text is split at ▁ boundaries. func (t *BPETokenizer) SetSentencePiece(enabled bool) { t.sentencePiece = enabled + t.addLeadingSpace = enabled +} + +// SetAddLeadingSpace controls whether SentencePiece mode prepends ▁ to the +// first word. By default this is set to true when SetSentencePiece is called, +// matching llama.cpp / SentencePiece behavior. GGUF models may override this +// via the tokenizer.ggml.add_space_prefix metadata key. +func (t *BPETokenizer) SetAddLeadingSpace(enabled bool) { + t.addLeadingSpace = enabled } // SetSpecialTokenStrings registers token strings that should be matched diff --git a/bpe_test.go b/bpe_test.go index aa61f0e..bc1654b 100644 --- a/bpe_test.go +++ b/bpe_test.go @@ -938,6 +938,79 @@ func TestSentencePieceUnigram_ByteFallbackStillWorksForUnknownChars(t *testing.T } } +func TestSentencePieceUnigram_AddLeadingSpaceDefault(t *testing.T) { + // Regression test: SetSentencePiece(true) must enable addLeadingSpace so + // the Viterbi receives "▁What" (7 bytes) as input rather than "What" (4 bytes). 
+	// Without addLeadingSpace, the ▁ prefix is missing and the Viterbi produces
+	// byte-level or character-level fallback tokens instead of matching "▁What".
+	vocab := map[string]int{
+		"<unk>": 0,
+		"<s>": 1,
+		"</s>": 2,
+		"\u2581What": 3,
+		"\u2581is": 4,
+		"\u2581the": 5,
+		"\u2581capital": 6,
+		"\u2581of": 7,
+		"\u2581France": 8,
+		"?": 9,
+		"W": 10,
+		"h": 11,
+		"a": 12,
+		"t": 13,
+	}
+	// Add byte fallback tokens.
+	nextID := 14
+	for b := 0; b < 256; b++ {
+		tok := fmt.Sprintf("<0x%02X>", b)
+		vocab[tok] = nextID
+		nextID++
+	}
+
+	scores := make([]float32, nextID)
+	scores[0] = -100
+	scores[1] = -100
+	scores[2] = -100
+	scores[3] = -2.0 // ▁What
+	scores[4] = -2.0 // ▁is
+	scores[5] = -2.0 // ▁the
+	scores[6] = -2.0 // ▁capital
+	scores[7] = -2.0 // ▁of
+	scores[8] = -2.0 // ▁France
+	scores[9] = -3.0 // ?
+	scores[10] = -5.0 // W
+	scores[11] = -5.0 // h
+	scores[12] = -5.0 // a
+	scores[13] = -5.0 // t
+	for i := 14; i < nextID; i++ {
+		scores[i] = -10.0
+	}
+
+	special := SpecialTokens{BOS: 1, EOS: 2, PAD: 0, UNK: 0}
+	tok := NewBPETokenizer(vocab, nil, special, false)
+	tok.SetSentencePiece(true) // Must also set addLeadingSpace = true
+	tok.SetScores(scores)
+
+	ids, err := tok.Encode("What is the capital of France?")
+	if err != nil {
+		t.Fatalf("Encode error: %v", err)
+	}
+	// With addLeadingSpace=true, pre-tokenizer produces:
+	// ["▁What", "▁is", "▁the", "▁capital", "▁of", "▁France?"]
+	// The Viterbi should match ▁What (ID 3) as a single token.
+	// Without addLeadingSpace, "What" has no ▁ prefix and falls back to
+	// character tokens [W, h, a, t] — this was the bug.
+	want := []int{3, 4, 5, 6, 7, 8, 9}
+	if len(ids) != len(want) {
+		t.Fatalf("Encode produced %d tokens %v, want %d tokens %v", len(ids), ids, len(want), want)
+	}
+	for i, id := range ids {
+		if id != want[i] {
+			t.Errorf("[%d] = %d, want %d", i, id, want[i])
+		}
+	}
+}
+
 func TestDecodeSentencePieceBytes(t *testing.T) {
 	tests := []struct {
 		name string