-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathtokenizer.go
More file actions
121 lines (103 loc) · 2.97 KB
/
tokenizer.go
File metadata and controls
121 lines (103 loc) · 2.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
package llama
import (
"encoding/binary"
"fmt"
"io"
"math"
"os"
"strings"
)
// LoadTokenizer reads vocab and scores from a binary file.
func LoadTokenizer(path string, vocabSize int) ([]string, []float32, uint32, error) {
f, err := os.Open(path)
if err != nil {
return nil, nil, 0, err
}
defer f.Close()
var maxLen uint32
if err := binary.Read(f, binary.LittleEndian, &maxLen); err != nil {
return nil, nil, 0, err
}
vocab := make([]string, vocabSize)
scores := make([]float32, vocabSize)
for i := range vocabSize {
var score float32
if err := binary.Read(f, binary.LittleEndian, &score); err != nil {
return nil, nil, 0, fmt.Errorf("failed to read score for index %d: %w", i, err)
}
scores[i] = score
var len int32
if err := binary.Read(f, binary.LittleEndian, &len); err != nil {
return nil, nil, 0, fmt.Errorf("failed to read token length for index %d: %w", i, err)
}
buf := make([]byte, len)
if _, err := io.ReadFull(f, buf); err != nil {
return nil, nil, 0, fmt.Errorf("failed to read token for index %d: %w", i, err)
}
vocab[i] = string(buf)
}
return vocab, scores, maxLen, nil
}
// BPEEncode tokenizes text using byte-pair encoding, aligning with the C implementation.
func BPEEncode(text string, vocab []string, scores []float32, vocabMap map[string]int32, bos bool, eos bool) ([]int32, error) {
// 1. Initial tokenization from string to IDs
tokens := make([]int32, 0, len(text)+3)
// Add BOS token if requested
if bos {
tokens = append(tokens, 1)
}
// llama tokenizer behavior: Add a dummy prefix space if the string is not empty.
if text != "" {
dummyPrefix, ok := vocabMap[" "]
if !ok {
return nil, fmt.Errorf("dummy prefix ' ' not found in vocabulary")
}
tokens = append(tokens, dummyPrefix)
}
// Process the string rune by rune (handles UTF-8 correctly)
for _, r := range text {
charStr := string(r)
id, ok := vocabMap[charStr]
if ok {
// Character is in the vocabulary
tokens = append(tokens, id)
} else {
// Byte-level fallback for unknown characters
// This matches the C code's `(unsigned char)str_buffer[i] + 3`
for _, b := range []byte(charStr) {
tokens = append(tokens, int32(b)+3)
}
}
}
// 2. Iteratively merge the best pair
builder := strings.Builder{}
for {
bestScore := float32(math.Inf(-1))
bestID := int32(-1)
bestIdx := -1
for i := 0; i < len(tokens)-1; i++ {
// Form the merged token string
builder.Reset()
builder.WriteString(vocab[tokens[i]])
builder.WriteString(vocab[tokens[i+1]])
merged := builder.String()
if id, ok := vocabMap[merged]; ok && scores[id] > bestScore {
bestScore = scores[id]
bestID = id
bestIdx = i
}
}
if bestIdx == -1 {
break // No more merges possible
}
// Merge the best pair
tokens[bestIdx] = bestID
// and delete the second token of the pair
tokens = append(tokens[:bestIdx+1], tokens[bestIdx+2:]...)
}
// Add EOS token if requested
if eos {
tokens = append(tokens, 2)
}
return tokens, nil
}