From cd50f638214bfd4bcc44c9d12bdf7dafcc173fc9 Mon Sep 17 00:00:00 2001 From: David Ndungu Date: Thu, 26 Mar 2026 11:31:25 -0700 Subject: [PATCH] fix: use greedy longest-match for SentencePiece (matches llama.cpp) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace Viterbi DP with greedy leftmost-longest match in sentencePieceEncode. This matches llama.cpp's llm_tokenizer_spm behavior and fixes token splitting where sub-token score sums beat whole-token scores (e.g., "▁What" splitting into "Wh"+"at"). --- bpe.go | 116 ++++++++++++++++------------------------------------ bpe_test.go | 40 +++++++++--------- 2 files changed, 54 insertions(+), 102 deletions(-) diff --git a/bpe.go b/bpe.go index 063b185..b17b094 100644 --- a/bpe.go +++ b/bpe.go @@ -18,7 +18,8 @@ type MergePair struct { // BPETokenizer implements the Tokenizer interface using byte-pair encoding. // It loads vocabulary and merge rules from HuggingFace tokenizer.json format. // When scores are set and merges are empty, it falls back to SentencePiece -// unigram encoding using greedy longest-match with score-based selection. +// encoding using greedy leftmost-longest match with score-based tie-breaking, +// matching llama.cpp behavior. // // Stable. type BPETokenizer struct { @@ -313,109 +314,62 @@ func (t *BPETokenizer) SetScores(scores []float32) { } } -// sentencePieceEncode tokenizes text using Viterbi dynamic programming to find -// the segmentation that maximizes the sum of log-probability scores. +// sentencePieceEncode tokenizes text using greedy leftmost-longest match. // -// This is used for SentencePiece unigram models that provide vocabulary -// scores but no BPE merge table (e.g., Mistral 7B GGUF). The Viterbi approach -// finds the globally optimal segmentation, unlike greedy longest-match which -// can produce suboptimal splits. +// At each position, the longest vocabulary token is selected. Ties in length +// are broken by score (higher wins). This matches the llama.cpp SentencePiece +// tokenizer (llm_tokenizer_spm::tokenize) and produces the same output as +// HuggingFace for GGUF models. When no vocabulary token matches a byte, +// <0xNN> byte fallback tokens are used. func (t *BPETokenizer) sentencePieceEncode(text string) []int { if text == "" { return nil } - n := len(text) // byte length + var ids []int + pos := 0 + n := len(text) - // Viterbi forward pass: find best segmentation. - // bestScore[i] = best total score for text[:i] - // bestLen[i] = byte length of the last token in the best path ending at i - bestScore := make([]float64, n+1) - bestLen := make([]int, n+1) - for i := range bestScore { - bestScore[i] = math.Inf(-1) - } - bestScore[0] = 0 + for pos < n { + // Find the longest matching token at the current position. + bestID := -1 + bestLen := 0 + bestScore := float32(math.Inf(-1)) - for i := 0; i < n; i++ { - if math.IsInf(bestScore[i], -1) { - continue - } - // Try all possible tokens starting at position i. maxLen := t.maxTokenLen - if maxLen > n-i { - maxLen = n - i + if maxLen > n-pos { + maxLen = n - pos } + for tokenLen := 1; tokenLen <= maxLen; tokenLen++ { - candidate := text[i : i+tokenLen] + candidate := text[pos : pos+tokenLen] if id, ok := t.vocab[candidate]; ok { - score := bestScore[i] + float64(t.tokenScore(id)) - if score > bestScore[i+tokenLen] { - bestScore[i+tokenLen] = score - bestLen[i+tokenLen] = tokenLen + score := t.tokenScore(id) + // Prefer longest match; break ties by score. + if tokenLen > bestLen || (tokenLen == bestLen && score > bestScore) { + bestID = id + bestLen = tokenLen + bestScore = score } } } - // Byte fallback: use <0xNN> as last resort when no vocab token covers - // position i. Byte tokens get a fixed penalty of -1e6 so they never - // beat real vocabulary tokens in the Viterbi DP. This matches - // llama.cpp / SentencePiece behavior where byte fallback is only - // used for characters that have no vocabulary coverage. - byteToken := fmt.Sprintf("<0x%02X>", text[i]) - if id, ok := t.vocab[byteToken]; ok { - _ = id // byte token exists but we ignore its vocab score - score := bestScore[i] + (-1e6) - if score > bestScore[i+1] { - bestScore[i+1] = score - bestLen[i+1] = 1 - } - } else { - // Byte token not in vocab; use unknown score as last resort. - score := bestScore[i] + float64(t.unknownScore()) - if score > bestScore[i+1] { - bestScore[i+1] = score - bestLen[i+1] = 1 - } - } - } - - // If we can't reach the end, return nil. - if math.IsInf(bestScore[n], -1) { - return nil - } - // Backtrack to find token sequence. - var tokens []int - pos := n - for pos > 0 { - tokLen := bestLen[pos] - candidate := text[pos-tokLen : pos] - if id, ok := t.vocab[candidate]; ok { - tokens = append(tokens, id) + if bestID >= 0 { + ids = append(ids, bestID) + pos += bestLen } else { - // Byte fallback for single-byte token. - byteToken := fmt.Sprintf("<0x%02X>", text[pos-tokLen]) + // Byte fallback: use <0xNN> for the current byte. + byteToken := fmt.Sprintf("<0x%02X>", text[pos]) if id, ok := t.vocab[byteToken]; ok { - tokens = append(tokens, id) + ids = append(ids, id) } else { - tokens = append(tokens, t.special.UNK) + ids = append(ids, t.special.UNK) } + pos++ } - pos -= tokLen } - // Reverse (we built it backwards). - for i, j := 0, len(tokens)-1; i < j; i, j = i+1, j-1 { - tokens[i], tokens[j] = tokens[j], tokens[i] - } - - return tokens -} - -// unknownScore returns a very negative score used for byte fallback tokens -// when the <0xNN> token is not in the vocabulary. -func (t *BPETokenizer) unknownScore() float32 { - return -100.0 + return ids } // tokenScore returns the score for a token ID, or 0 if scores are not set diff --git a/bpe_test.go b/bpe_test.go index bc1654b..ca03590 100644 --- a/bpe_test.go +++ b/bpe_test.go @@ -559,9 +559,9 @@ func TestSentencePieceUnigram_RoundTrip(t *testing.T) { func TestSentencePieceUnigram_UnknownChars(t *testing.T) { tok := makeTestSentencePieceUnigram() - // Characters not in vocab should produce UNK or ▁ tokens via Viterbi. + // Characters not in vocab should produce UNK or ▁ tokens via byte fallback. // "xyz" -> pre-tokenized as "▁xyz". Since ▁x, ▁y, ▁z are not in vocab, - // Viterbi will match ▁ first, then x, y, z via byte fallback or UNK. + // greedy will match ▁ first, then x, y, z via byte fallback or UNK. ids, err := tok.Encode("xyz") if err != nil { t.Fatalf("Encode error: %v", err) @@ -576,11 +576,11 @@ func TestSentencePieceUnigram_UnknownChars(t *testing.T) { } } -func TestSentencePieceUnigram_ViterbiOptimal(t *testing.T) { +func TestSentencePieceUnigram_LongestMatch(t *testing.T) { tok := makeTestSentencePieceUnigram() - // "Hello" should encode as one token ▁Hello (id=8) via Viterbi, - // since it has the best score (-1.0) vs splitting into subwords. + // "Hello" should encode as one token ▁Hello (id=8) via greedy longest-match, + // since it is the longest matching token at position 0. ids, err := tok.Encode("Hello") if err != nil { t.Fatalf("Encode error: %v", err) @@ -608,11 +608,10 @@ func TestSentencePieceUnigram_WithBPEFallback(t *testing.T) { } } -func TestSentencePieceUnigram_ViterbiBeatsGreedy(t *testing.T) { - // This test demonstrates that Viterbi finds a better segmentation than greedy. - // Vocab has "▁hel" and "lo" with good scores, and "▁h" and "ello" with worse scores. - // Greedy longest-match would pick "▁hello" if available, or "▁hell" + "o". - // But here we set up scores so "▁hel" + "lo" is strictly better than "▁hell" + "o". +func TestSentencePieceUnigram_GreedyLongestMatch(t *testing.T) { + // Greedy leftmost-longest match picks the longest token at each position. + // For "▁hello": longest match at pos 0 is "▁hell" (4 chars), then "o". + // This matches llama.cpp / HuggingFace SentencePiece behavior. vocab := map[string]int{ "": 0, "": 1, @@ -633,11 +632,11 @@ func TestSentencePieceUnigram_ViterbiBeatsGreedy(t *testing.T) { scores[3] = -5.0 // ▁ scores[4] = -4.0 // ▁h scores[5] = -3.0 // ▁he - scores[6] = -1.5 // ▁hel (good) - scores[7] = -3.0 // ▁hell (worse than ▁hel) - scores[8] = -3.0 // o (bad) - scores[9] = -4.0 // l (bad) - scores[10] = -1.0 // lo (good) + scores[6] = -1.5 // ▁hel + scores[7] = -3.0 // ▁hell + scores[8] = -3.0 // o + scores[9] = -4.0 // l + scores[10] = -1.0 // lo special := SpecialTokens{BOS: 1, EOS: 2, PAD: 0, UNK: 0} tok := NewBPETokenizer(vocab, nil, special, false) @@ -648,9 +647,8 @@ func TestSentencePieceUnigram_ViterbiBeatsGreedy(t *testing.T) { if err != nil { t.Fatalf("Encode error: %v", err) } - // Viterbi should choose "▁hel" + "lo" (score -1.5 + -1.0 = -2.5) - // over "▁hell" + "o" (score -3.0 + -3.0 = -6.0). - want := []int{6, 10} // ▁hel, lo + // Greedy picks longest match: "▁hell" (id=7) then "o" (id=8). + want := []int{7, 8} // ▁hell, o if len(ids) != len(want) { t.Fatalf("Encode(\"hello\") = %v, want %v", ids, want) } @@ -940,8 +938,8 @@ func TestSentencePieceUnigram_ByteFallbackStillWorksForUnknownChars(t *testing.T func TestSentencePieceUnigram_AddLeadingSpaceDefault(t *testing.T) { // Regression test: SetSentencePiece(true) must enable addLeadingSpace so - // the Viterbi receives "▁What" (7 bytes) as input rather than "What" (4 bytes). - // Without addLeadingSpace, the ▁ prefix is missing and the Viterbi produces + // the encoder receives "▁What" (7 bytes) as input rather than "What" (4 bytes). + // Without addLeadingSpace, the ▁ prefix is missing and the encoder produces // byte-level or character-level fallback tokens instead of matching "▁What". vocab := map[string]int{ "": 0, @@ -997,7 +995,7 @@ func TestSentencePieceUnigram_AddLeadingSpaceDefault(t *testing.T) { } // With addLeadingSpace=true, pre-tokenizer produces: // ["▁What", "▁is", "▁the", "▁capital", "▁of", "▁France?"] - // The Viterbi should match ▁What (ID 3) as a single token. + // The encoder should match ▁What (ID 3) as a single token. // Without addLeadingSpace, "What" has no ▁ prefix and falls back to // character tokens [W, h, a, t] — this was the bug. want := []int{3, 4, 5, 6, 7, 8, 9}