Rename Tokenizer to TokenCountEstimator

dliubarskyi · dliubarskyi · commit 730733493e72 · 2025-04-15T16:38:55.000+02:00
langchain4j/langchain4j#2874
diff --git a/langchain4j-embeddings/src/main/java/dev/langchain4j/model/embedding/onnx/HuggingFaceTokenCountEstimator.java b/langchain4j-embeddings/src/main/java/dev/langchain4j/model/embedding/onnx/HuggingFaceTokenCountEstimator.java
@@ -1,8 +1,9 @@
 package dev.langchain4j.model.embedding.onnx;
 
 import ai.djl.huggingface.tokenizers.Encoding;
+import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer;
 import dev.langchain4j.data.message.*;
-import dev.langchain4j.model.Tokenizer;
+import dev.langchain4j.model.TokenCountEstimator;
 
 import java.io.InputStream;
 import java.nio.file.Path;
@@ -13,21 +14,21 @@
 import static java.nio.file.Files.newInputStream;
 
 /**
- * A <a href="https://huggingface.co/">HuggingFace</a> tokenizer.
+ * A token count estimator for models that can be found on <a href="https://huggingface.co/">HuggingFace</a>.
  * <br>
- * Uses DJL's {@link ai.djl.huggingface.tokenizers.HuggingFaceTokenizer} under the hood.
+ * Uses DJL's {@link HuggingFaceTokenizer} under the hood.
  * <br>
  * Requires {@code tokenizer.json} to instantiate.
  * An <a href="https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/blob/main/tokenizer.json">example</a>.
  */
-public class HuggingFaceTokenizer implements Tokenizer {
+public class HuggingFaceTokenCountEstimator implements TokenCountEstimator {
 
-    private final ai.djl.huggingface.tokenizers.HuggingFaceTokenizer tokenizer;
+    private final HuggingFaceTokenizer tokenizer;
 
     /**
-     * Creates an instance of a {@code HuggingFaceTokenizer} using a built-in {@code tokenizer.json} file.
+     * Creates an instance of a {@code HuggingFaceTokenCountEstimator} using a built-in {@code tokenizer.json} file.
      */
-    public HuggingFaceTokenizer() {
+    public HuggingFaceTokenCountEstimator() {
 
         Map<String, String> options = new HashMap<>();
         options.put("padding", "false");
@@ -37,22 +38,22 @@ public HuggingFaceTokenizer() {
     }
 
     /**
-     * Creates an instance of a {@code HuggingFaceTokenizer} using a provided {@code tokenizer.json} file.
+     * Creates an instance of a {@code HuggingFaceTokenCountEstimator} using a provided {@code tokenizer.json} file.
      *
      * @param pathToTokenizer The path to the tokenizer file (e.g., "/path/to/tokenizer.json")
      */
-    public HuggingFaceTokenizer(Path pathToTokenizer) {
+    public HuggingFaceTokenCountEstimator(Path pathToTokenizer) {
         this(pathToTokenizer, null);
     }
 
     /**
-     * Creates an instance of a {@code HuggingFaceTokenizer} using a provided {@code tokenizer.json} file
+     * Creates an instance of a {@code HuggingFaceTokenCountEstimator} using a provided {@code tokenizer.json} file
      * and a map of DJL's tokenizer options.
      *
      * @param pathToTokenizer The path to the tokenizer file (e.g., "/path/to/tokenizer.json")
      * @param options         The DJL's tokenizer options
      */
-    public HuggingFaceTokenizer(Path pathToTokenizer, Map<String, String> options) {
+    public HuggingFaceTokenCountEstimator(Path pathToTokenizer, Map<String, String> options) {
         try {
             this.tokenizer = createFrom(newInputStream(pathToTokenizer), options);
         } catch (Exception e) {
@@ -61,33 +62,33 @@ public HuggingFaceTokenizer(Path pathToTokenizer, Map<String, String> options) {
     }
 
     /**
-     * Creates an instance of a {@code HuggingFaceTokenizer} using a provided {@code tokenizer.json} file.
+     * Creates an instance of a {@code HuggingFaceTokenCountEstimator} using a provided {@code tokenizer.json} file.
      *
      * @param pathToTokenizer The path to the tokenizer file (e.g., "/path/to/tokenizer.json")
      */
-    public HuggingFaceTokenizer(String pathToTokenizer) {
+    public HuggingFaceTokenCountEstimator(String pathToTokenizer) {
         this(pathToTokenizer, null);
     }
 
     /**
-     * Creates an instance of a {@code HuggingFaceTokenizer} using a provided {@code tokenizer.json} file
+     * Creates an instance of a {@code HuggingFaceTokenCountEstimator} using a provided {@code tokenizer.json} file
      * and a map of DJL's tokenizer options.
      *
      * @param pathToTokenizer The path to the tokenizer file (e.g., "/path/to/tokenizer.json")
      * @param options         The DJL's tokenizer options
      */
-    public HuggingFaceTokenizer(String pathToTokenizer, Map<String, String> options) {
+    public HuggingFaceTokenCountEstimator(String pathToTokenizer, Map<String, String> options) {
         try {
             this.tokenizer = createFrom(newInputStream(Paths.get(pathToTokenizer)), options);
         } catch (Exception e) {
             throw new RuntimeException(e);
         }
     }
 
-    private static ai.djl.huggingface.tokenizers.HuggingFaceTokenizer createFrom(InputStream tokenizer,
-                                                                                 Map<String, String> options) {
+    private static HuggingFaceTokenizer createFrom(InputStream tokenizer,
+                                                   Map<String, String> options) {
         try {
-            return ai.djl.huggingface.tokenizers.HuggingFaceTokenizer.newInstance(tokenizer, options);
+            return HuggingFaceTokenizer.newInstance(tokenizer, options);
         } catch (Exception e) {
             throw new RuntimeException(e);
         }
diff --git a/langchain4j-embeddings/src/test/java/dev/langchain4j/model/embedding/onnx/HuggingFaceTokenCountEstimatorTest.java b/langchain4j-embeddings/src/test/java/dev/langchain4j/model/embedding/onnx/HuggingFaceTokenCountEstimatorTest.java
@@ -1,20 +1,21 @@
 package dev.langchain4j.model.embedding.onnx;
 
+import dev.langchain4j.model.TokenCountEstimator;
 import org.junit.jupiter.api.Test;
 
 import static dev.langchain4j.internal.Utils.repeat;
 import static org.assertj.core.api.Assertions.assertThat;
 
-class HuggingFaceTokenizerTest {
+class HuggingFaceTokenCountEstimatorTest {
 
-    HuggingFaceTokenizer tokenizer = new HuggingFaceTokenizer();
+    TokenCountEstimator tokenCountEstimator = new HuggingFaceTokenCountEstimator();
 
     @Test
     void should_count_tokens_in_text_shorter_than_512_tokens() {
 
         String text = "Hello, how are you doing?";
 
-        int tokenCount = tokenizer.estimateTokenCountInText(text);
+        int tokenCount = tokenCountEstimator.estimateTokenCountInText(text);
 
         assertThat(tokenCount).isEqualTo(7);
     }
@@ -24,7 +25,7 @@ void should_count_tokens_in_text_longer_than_512_tokens() {
 
         String text = repeat("Hello, how are you doing?", 100);
 
-        int tokenCount = tokenizer.estimateTokenCountInText(text);
+        int tokenCount = tokenCountEstimator.estimateTokenCountInText(text);
 
         assertThat(tokenCount).isEqualTo(700);
     }