-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenizer.go
More file actions
142 lines (124 loc) · 3.78 KB
/
tokenizer.go
File metadata and controls
142 lines (124 loc) · 3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// Package tokenizer provides text tokenization for ML model inference.
//
// The Tokenizer interface abstracts over different tokenization algorithms
// (whitespace, BPE, SentencePiece). Implementations include WhitespaceTokenizer
// for testing and BPETokenizer for production use with HuggingFace models.
package ztoken
import (
"strings"
)
// SpecialTokens holds IDs for commonly used special tokens.
// The zero value maps every field to ID 0; meaningful values are
// assigned by the tokenizer implementation (see NewWhitespaceTokenizer).
//
// Stable.
type SpecialTokens struct {
	BOS int // Beginning of sequence
	EOS int // End of sequence
	PAD int // Padding
	UNK int // Unknown token (fallback for out-of-vocabulary input)
}
// Tokenizer is the interface for all tokenizer implementations.
//
// Stable.
type Tokenizer interface {
	// Encode converts text into a sequence of token IDs.
	Encode(text string) ([]int, error)
	// Decode converts a sequence of token IDs back into text.
	// It is not guaranteed to be a perfect inverse of Encode
	// (e.g. unknown tokens may round-trip lossily).
	Decode(ids []int) (string, error)
	// VocabSize returns the total number of tokens in the vocabulary.
	VocabSize() int
	// GetToken returns the string token for a given ID and whether it exists.
	GetToken(id int) (string, bool)
	// GetID returns the token ID for a given string and whether it exists.
	GetID(token string) (int, bool)
	// SpecialTokens returns the special token IDs for this tokenizer.
	SpecialTokens() SpecialTokens
}
// WhitespaceTokenizer provides simple whitespace-based tokenization.
// It splits text on whitespace boundaries and maps words to integer IDs.
// Useful for testing and non-production scenarios.
//
// The zero value is not usable; construct with NewWhitespaceTokenizer.
// Not safe for concurrent use: AddToken mutates the maps without locking.
//
// Stable.
type WhitespaceTokenizer struct {
	vocab        map[string]int // token string -> ID
	reverseVocab map[int]string // ID -> token string (inverse of vocab)
	nextID       int            // next ID to hand out in AddToken
	special      SpecialTokens  // IDs seeded by NewWhitespaceTokenizer
}
// NewWhitespaceTokenizer creates a WhitespaceTokenizer pre-loaded with
// standard special tokens: <unk> (0), <s> (1), </s> (2), <pad> (3).
func NewWhitespaceTokenizer() *WhitespaceTokenizer {
	tok := &WhitespaceTokenizer{
		vocab:        make(map[string]int),
		reverseVocab: make(map[int]string),
	}
	// Function calls in a composite literal are evaluated in lexical
	// left-to-right order, so the IDs come out 0, 1, 2, 3 as documented.
	tok.special = SpecialTokens{
		UNK: tok.AddToken("<unk>"),
		BOS: tok.AddToken("<s>"),
		EOS: tok.AddToken("</s>"),
		PAD: tok.AddToken("<pad>"),
	}
	return tok
}
// AddToken adds a token to the vocabulary if it does not already exist.
// Returns the token's ID (the existing one for known tokens, otherwise
// the next unused ID).
func (t *WhitespaceTokenizer) AddToken(token string) int {
	if existing, ok := t.vocab[token]; ok {
		return existing
	}
	assigned := t.nextID
	t.nextID++
	t.vocab[token] = assigned
	t.reverseVocab[assigned] = token
	return assigned
}
// Encode splits text on whitespace and returns token IDs.
// Unknown words map to the UNK token ID. The error is always nil;
// it exists to satisfy the Tokenizer interface.
func (t *WhitespaceTokenizer) Encode(text string) ([]int, error) {
	fields := strings.Fields(text)
	ids := make([]int, 0, len(fields))
	for _, word := range fields {
		id, ok := t.vocab[word]
		if !ok {
			id = t.special.UNK
		}
		ids = append(ids, id)
	}
	return ids, nil
}
// Decode converts token IDs back to a space-separated string.
// IDs not present in the vocabulary render as "<unk>". The error is
// always nil; it exists to satisfy the Tokenizer interface.
func (t *WhitespaceTokenizer) Decode(ids []int) (string, error) {
	var b strings.Builder
	for i, id := range ids {
		if i > 0 {
			b.WriteByte(' ')
		}
		if word, ok := t.reverseVocab[id]; ok {
			b.WriteString(word)
		} else {
			b.WriteString("<unk>")
		}
	}
	return b.String(), nil
}
// VocabSize reports how many distinct tokens the vocabulary holds,
// including the special tokens seeded by the constructor.
func (t *WhitespaceTokenizer) VocabSize() int {
	return len(t.vocab)
}
// GetToken looks up the string token for id, reporting whether the
// vocabulary contains it.
func (t *WhitespaceTokenizer) GetToken(id int) (string, bool) {
	token, exists := t.reverseVocab[id]
	return token, exists
}
// GetID looks up the ID for token, reporting whether the vocabulary
// contains it.
func (t *WhitespaceTokenizer) GetID(token string) (int, bool) {
	tokenID, exists := t.vocab[token]
	return tokenID, exists
}
// SpecialTokens returns a copy of the special token IDs configured for
// this tokenizer.
func (t *WhitespaceTokenizer) SpecialTokens() SpecialTokens {
	return t.special
}
// Compile-time assertion that *WhitespaceTokenizer satisfies Tokenizer.
var _ Tokenizer = (*WhitespaceTokenizer)(nil)