Skip to content

Commit 7307334

Browse files
committed
Rename Tokenizer into TokenCountEstimator
langchain4j/langchain4j#2874
1 parent 9322a13 commit 7307334

File tree

2 files changed: 24 additions and 22 deletions.
Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
package dev.langchain4j.model.embedding.onnx;
22

33
import ai.djl.huggingface.tokenizers.Encoding;
4+
import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer;
45
import dev.langchain4j.data.message.*;
5-
import dev.langchain4j.model.Tokenizer;
6+
import dev.langchain4j.model.TokenCountEstimator;
67

78
import java.io.InputStream;
89
import java.nio.file.Path;
@@ -13,21 +14,21 @@
1314
import static java.nio.file.Files.newInputStream;
1415

1516
/**
16-
* A <a href="https://huggingface.co/">HuggingFace</a> tokenizer.
17+
* A token count estimator for models that can be found on <a href="https://huggingface.co/">HuggingFace</a>.
1718
* <br>
18-
* Uses DJL's {@link ai.djl.huggingface.tokenizers.HuggingFaceTokenizer} under the hood.
19+
* Uses DJL's {@link HuggingFaceTokenizer} under the hood.
1920
* <br>
2021
* Requires {@code tokenizer.json} to instantiate.
2122
* See this <a href="https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/blob/main/tokenizer.json">example</a> of a {@code tokenizer.json} file.
2223
*/
23-
public class HuggingFaceTokenizer implements Tokenizer {
24+
public class HuggingFaceTokenCountEstimator implements TokenCountEstimator {
2425

25-
private final ai.djl.huggingface.tokenizers.HuggingFaceTokenizer tokenizer;
26+
private final HuggingFaceTokenizer tokenizer;
2627

2728
/**
28-
* Creates an instance of a {@code HuggingFaceTokenizer} using a built-in {@code tokenizer.json} file.
29+
* Creates an instance of a {@code HuggingFaceTokenCountEstimator} using a built-in {@code tokenizer.json} file.
2930
*/
30-
public HuggingFaceTokenizer() {
31+
public HuggingFaceTokenCountEstimator() {
3132

3233
Map<String, String> options = new HashMap<>();
3334
options.put("padding", "false");
@@ -37,22 +38,22 @@ public HuggingFaceTokenizer() {
3738
}
3839

3940
/**
40-
* Creates an instance of a {@code HuggingFaceTokenizer} using a provided {@code tokenizer.json} file.
41+
* Creates an instance of a {@code HuggingFaceTokenCountEstimator} using a provided {@code tokenizer.json} file.
4142
*
4243
* @param pathToTokenizer The path to the tokenizer file (e.g., "/path/to/tokenizer.json")
4344
*/
44-
public HuggingFaceTokenizer(Path pathToTokenizer) {
45+
public HuggingFaceTokenCountEstimator(Path pathToTokenizer) {
4546
this(pathToTokenizer, null);
4647
}
4748

4849
/**
49-
* Creates an instance of a {@code HuggingFaceTokenizer} using a provided {@code tokenizer.json} file
50+
* Creates an instance of a {@code HuggingFaceTokenCountEstimator} using a provided {@code tokenizer.json} file
5051
* and a map of DJL's tokenizer options.
5152
*
5253
* @param pathToTokenizer The path to the tokenizer file (e.g., "/path/to/tokenizer.json")
5354
* @param options The options for DJL's tokenizer
5455
*/
55-
public HuggingFaceTokenizer(Path pathToTokenizer, Map<String, String> options) {
56+
public HuggingFaceTokenCountEstimator(Path pathToTokenizer, Map<String, String> options) {
5657
try {
5758
this.tokenizer = createFrom(newInputStream(pathToTokenizer), options);
5859
} catch (Exception e) {
@@ -61,33 +62,33 @@ public HuggingFaceTokenizer(Path pathToTokenizer, Map<String, String> options) {
6162
}
6263

6364
/**
64-
* Creates an instance of a {@code HuggingFaceTokenizer} using a provided {@code tokenizer.json} file.
65+
* Creates an instance of a {@code HuggingFaceTokenCountEstimator} using a provided {@code tokenizer.json} file.
6566
*
6667
* @param pathToTokenizer The path to the tokenizer file (e.g., "/path/to/tokenizer.json")
6768
*/
68-
public HuggingFaceTokenizer(String pathToTokenizer) {
69+
public HuggingFaceTokenCountEstimator(String pathToTokenizer) {
6970
this(pathToTokenizer, null);
7071
}
7172

7273
/**
73-
* Creates an instance of a {@code HuggingFaceTokenizer} using a provided {@code tokenizer.json} file
74+
* Creates an instance of a {@code HuggingFaceTokenCountEstimator} using a provided {@code tokenizer.json} file
7475
* and a map of DJL's tokenizer options.
7576
*
7677
* @param pathToTokenizer The path to the tokenizer file (e.g., "/path/to/tokenizer.json")
7778
* @param options The options for DJL's tokenizer
7879
*/
79-
public HuggingFaceTokenizer(String pathToTokenizer, Map<String, String> options) {
80+
public HuggingFaceTokenCountEstimator(String pathToTokenizer, Map<String, String> options) {
8081
try {
8182
this.tokenizer = createFrom(newInputStream(Paths.get(pathToTokenizer)), options);
8283
} catch (Exception e) {
8384
throw new RuntimeException(e);
8485
}
8586
}
8687

87-
private static ai.djl.huggingface.tokenizers.HuggingFaceTokenizer createFrom(InputStream tokenizer,
88-
Map<String, String> options) {
88+
private static HuggingFaceTokenizer createFrom(InputStream tokenizer,
89+
Map<String, String> options) {
8990
try {
90-
return ai.djl.huggingface.tokenizers.HuggingFaceTokenizer.newInstance(tokenizer, options);
91+
return HuggingFaceTokenizer.newInstance(tokenizer, options);
9192
} catch (Exception e) {
9293
throw new RuntimeException(e);
9394
}
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,21 @@
11
package dev.langchain4j.model.embedding.onnx;
22

3+
import dev.langchain4j.model.TokenCountEstimator;
34
import org.junit.jupiter.api.Test;
45

56
import static dev.langchain4j.internal.Utils.repeat;
67
import static org.assertj.core.api.Assertions.assertThat;
78

8-
class HuggingFaceTokenizerTest {
9+
class HuggingFaceTokenCountEstimatorTest {
910

10-
HuggingFaceTokenizer tokenizer = new HuggingFaceTokenizer();
11+
TokenCountEstimator tokenCountEstimator = new HuggingFaceTokenCountEstimator();
1112

1213
@Test
1314
void should_count_tokens_in_text_shorter_than_512_tokens() {
1415

1516
String text = "Hello, how are you doing?";
1617

17-
int tokenCount = tokenizer.estimateTokenCountInText(text);
18+
int tokenCount = tokenCountEstimator.estimateTokenCountInText(text);
1819

1920
assertThat(tokenCount).isEqualTo(7);
2021
}
@@ -24,7 +25,7 @@ void should_count_tokens_in_text_longer_than_512_tokens() {
2425

2526
String text = repeat("Hello, how are you doing?", 100);
2627

27-
int tokenCount = tokenizer.estimateTokenCountInText(text);
28+
int tokenCount = tokenCountEstimator.estimateTokenCountInText(text);
2829

2930
assertThat(tokenCount).isEqualTo(700);
3031
}

0 commit comments

Comments
 (0)