Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions bpe.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ type BPETokenizer struct {
// sentencePiece enables SentencePiece-style pre-tokenization where spaces
// are replaced with ▁ (U+2581) and words are split at ▁ boundaries.
sentencePiece bool
// addLeadingSpace prepends ▁ to the first word during SentencePiece
// pre-tokenization. This is true by default for SentencePiece models,
// matching llama.cpp / SentencePiece behavior.
addLeadingSpace bool
// specialTokens maps special token strings to their IDs for exact matching
// during encoding (e.g., "<start_of_turn>" -> 105).
specialTokens map[string]int
Expand Down Expand Up @@ -92,16 +96,17 @@ func (t *BPETokenizer) Encode(text string) ([]int, error) {
}

// encodeSegment tokenizes a text segment that contains no special tokens.
// addLeadingSpace controls whether SentencePiece mode prepends ▁ to the text.
func (t *BPETokenizer) encodeSegment(text string, addLeadingSpace bool) ([]int, error) {
// isFirstSegment indicates this is the first text segment (before any special
// tokens), which determines whether the addLeadingSpace field applies.
func (t *BPETokenizer) encodeSegment(text string, isFirstSegment bool) ([]int, error) {
if text == "" {
return nil, nil
}
var words []string
if t.byteLevelBPE {
words = t.byteLevelPreTokenize(text)
} else if t.sentencePiece {
words = t.sentencePiecePreTokenize(text, addLeadingSpace)
words = t.sentencePiecePreTokenize(text, isFirstSegment && t.addLeadingSpace)
} else {
words = strings.Fields(text)
}
Expand Down Expand Up @@ -276,6 +281,15 @@ func (t *BPETokenizer) SpecialTokens() SpecialTokens {
// SetSentencePiece toggles SentencePiece-style pre-tokenization, in which
// spaces become ▁ (U+2581) and the text is split at ▁ boundaries. Turning it
// on also enables the leading-space prefix, matching llama.cpp / SentencePiece
// defaults; callers may override that afterwards with SetAddLeadingSpace.
func (t *BPETokenizer) SetSentencePiece(enabled bool) {
	t.sentencePiece, t.addLeadingSpace = enabled, enabled
}

// SetAddLeadingSpace sets whether SentencePiece pre-tokenization prepends ▁
// to the first word. SetSentencePiece turns this on automatically (the
// llama.cpp / SentencePiece default); GGUF models can override it through the
// tokenizer.ggml.add_space_prefix metadata key.
func (t *BPETokenizer) SetAddLeadingSpace(enabled bool) {
	t.addLeadingSpace = enabled
}

// SetSpecialTokenStrings registers token strings that should be matched
Expand Down
73 changes: 73 additions & 0 deletions bpe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -938,6 +938,79 @@ func TestSentencePieceUnigram_ByteFallbackStillWorksForUnknownChars(t *testing.T
}
}

func TestSentencePieceUnigram_AddLeadingSpaceDefault(t *testing.T) {
	// Regression test: SetSentencePiece(true) must enable addLeadingSpace so
	// the Viterbi receives "▁What" (7 bytes) rather than "What" (4 bytes).
	// Without the prefix, "▁What" never matches and the encoder degrades to
	// byte-level or character-level fallback tokens.
	vocab := map[string]int{
		"<unk>":         0,
		"<s>":           1,
		"</s>":          2,
		"\u2581What":    3,
		"\u2581is":      4,
		"\u2581the":     5,
		"\u2581capital": 6,
		"\u2581of":      7,
		"\u2581France":  8,
		"?":             9,
		"W":             10,
		"h":             11,
		"a":             12,
		"t":             13,
	}
	// Register byte-fallback tokens <0x00>..<0xFF> after the word vocab.
	nextID := 14
	for b := 0; b < 256; b++ {
		tok := fmt.Sprintf("<0x%02X>", b)
		vocab[tok] = nextID
		nextID++
	}

	// Default every score to the byte-fallback penalty, then overwrite the
	// named entries. Word pieces score best, single characters worse, and
	// control tokens worst, so the Viterbi prefers whole-word matches.
	scores := make([]float32, nextID)
	for i := range scores {
		scores[i] = -10.0
	}
	for id, s := range map[int]float32{
		0: -100, 1: -100, 2: -100, // <unk>, <s>, </s>
		3: -2.0, 4: -2.0, 5: -2.0, // ▁What, ▁is, ▁the
		6: -2.0, 7: -2.0, 8: -2.0, // ▁capital, ▁of, ▁France
		9:  -3.0,                               // ?
		10: -5.0, 11: -5.0, 12: -5.0, 13: -5.0, // W, h, a, t
	} {
		scores[id] = s
	}

	special := SpecialTokens{BOS: 1, EOS: 2, PAD: 0, UNK: 0}
	tok := NewBPETokenizer(vocab, nil, special, false)
	tok.SetSentencePiece(true) // Must also set addLeadingSpace = true
	tok.SetScores(scores)

	ids, err := tok.Encode("What is the capital of France?")
	if err != nil {
		t.Fatalf("Encode error: %v", err)
	}
	// With addLeadingSpace=true the pre-tokenizer yields
	// ["▁What", "▁is", "▁the", "▁capital", "▁of", "▁France?"], and the
	// Viterbi matches ▁What (ID 3) as one token. Without it, "What" lacks
	// the ▁ prefix and splits into [W, h, a, t] — the bug under test.
	want := []int{3, 4, 5, 6, 7, 8, 9}
	if len(ids) != len(want) {
		t.Fatalf("Encode produced %d tokens %v, want %d tokens %v", len(ids), ids, len(want), want)
	}
	for i, id := range ids {
		if id != want[i] {
			t.Errorf("[%d] = %d, want %d", i, id, want[i])
		}
	}
}

func TestDecodeSentencePieceBytes(t *testing.T) {
tests := []struct {
name string
Expand Down
Loading