Merged
4 changes: 4 additions & 0 deletions .pubignore
@@ -8,6 +8,7 @@ docs/
.idea/
.vscode/
.claude/
test_reports/

# Build artifacts
build/
@@ -65,6 +66,9 @@ litertlm-server/hs_err_*.log
# Desktop - TFLite C library (downloaded at build time)
macos/Resources/tflite/

# Desktop - JAR (downloaded at build time via prepare_resources.sh)
macos/Resources/litertlm-server.jar

# Misc
test_exports.dart
trash_*
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,13 @@
## 0.12.8
- **ToolChoice enum**: `auto` / `required` / `none` parameter in `createChat()` to control tool calling behavior
- **Parallel Tool Calls**: `ParallelFunctionCallResponse` for multiple function calls in one response
- **Strategy Pattern Parser**: Per-model `FunctionCallFormat` implementations (Gemma, Qwen, DeepSeek, Llama, Phi, FunctionGemma)
- **`<tool_call>` Format**: Qwen/Mistral-style function call parsing
- **ModelType.phi**: Dedicated model type for Phi-4 with `<|tool_calls|>` format support
- **NPU Fix**: Pass `nativeLibraryDir` to LiteRT-LM `Backend.NPU()`
- **Embeddings**: Models return L2-normalized vectors (dot product = cosine similarity)
- **Windows/Linux Embeddings Fix**: TFLite C library now correctly copied to build output (#200)

## 0.12.7
- **Dual-Prefix Embeddings (TaskType)**: Improved RAG retrieval quality with query/document prefixes
- `TaskType.retrievalQuery` (default) — for search queries
2 changes: 1 addition & 1 deletion CLAUDE.md
@@ -1244,7 +1244,7 @@ flutter_gemma/

- **GitHub**: https://github.com/DenisovAV/flutter_gemma
- **Pub.dev**: https://pub.dev/packages/flutter_gemma
-- **Current Version**: 0.12.6
+- **Current Version**: 0.12.8
- **License**: Check repository for license details
- **Issues**: Report bugs via GitHub Issues
- **Changelog**: See `CHANGELOG.md` for version history
50 changes: 38 additions & 12 deletions README.md
@@ -1357,30 +1357,53 @@ final chat = await inferenceModel.createChat(
topK: 1,
tools: _tools, // Pass your tools
supportsFunctionCalls: true, // Enable function calling (required for tools)
-  // tokenBuffer: 256, // Adjust if needed for function calling
+  toolChoice: ToolChoice.auto, // auto (default) | required | none
);
```

-**Step 3: Handle Different Response Types**
+**ToolChoice modes:**
+
+| Mode | Behavior |
+|------|----------|
+| `ToolChoice.auto` | Model decides whether to call a tool (default) |
+| `ToolChoice.required` | Model must respond with a function call |
+| `ToolChoice.none` | Tools are hidden; the model responds with text only |

-The model can now return two types of responses:
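The three modes can be emulated by any wrapper around a tool-calling model. A minimal Python sketch, assuming the mechanism is "hide the tools for `none`, reject text-only answers for `required`" (an assumption about the mechanism, not the package's actual Dart code):

```python
def apply_tool_choice(mode: str, tools: list) -> list:
    """Return the tool list actually exposed to the model."""
    if mode == "none":
        return []      # tools hidden: model can only answer with text
    return tools       # "auto" and "required" both expose the tools

def accepts(mode: str, response_is_call: bool) -> bool:
    """Whether a given response type is acceptable under the mode."""
    if mode == "required":
        return response_is_call        # text-only answers are rejected
    if mode == "none":
        return not response_is_call    # calls cannot occur without tools
    return True                        # auto: either response type is fine

assert apply_tool_choice("none", ["get_weather"]) == []
assert accepts("required", response_is_call=True)
```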
**Step 3: Handle Response Types**

The model can return text, a single function call, or multiple parallel function calls:

```dart
 // Add user message
 await chat.addQueryChunk(Message.text(text: 'Change the background to blue', isUser: true));
 
-// Handle async responses
-chat.generateChatResponseAsync().listen((response) {
+// Sync mode
+final response = await chat.generateChatResponse();
+
+if (response is TextResponse) {
+  print('Text: ${response.token}');
+} else if (response is FunctionCallResponse) {
+  // Single function call
+  print('Call: ${response.name}(${response.args})');
+  _handleFunctionCall(response);
+} else if (response is ParallelFunctionCallResponse) {
+  // Multiple function calls (e.g. "Change title and background color")
+  for (final call in response.calls) {
+    print('Call: ${call.name}(${call.args})');
+    await _handleFunctionCall(call);
+  }
+}
+
+// Streaming mode — same types arrive via stream
+await for (final response in chat.generateChatResponseAsync()) {
   if (response is TextResponse) {
-    // Regular text token from the model
-    print('Text: ${response.token}');
+    // Update your UI with the text
+    print(response.token);
   } else if (response is FunctionCallResponse) {
-    // Model wants to call a function
-    print('Function Call: ${response.name}(${response.args})');
     _handleFunctionCall(response);
+  } else if (response is ParallelFunctionCallResponse) {
+    for (final call in response.calls) {
+      await _handleFunctionCall(call);
+    }
   }
-});
+}
```
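The `<tool_call>` format added in this release wraps each call in tags with a JSON body. A hedged Python sketch of such a parser (illustrative only; `parse_tool_calls` is a hypothetical helper, not the package's actual Dart parser):

```python
import json
import re

def parse_tool_calls(output: str):
    """Extract JSON bodies from <tool_call>...</tool_call> blocks.

    Returns None for plain text, a single dict for one call,
    or a list of dicts when the model emitted parallel calls.
    """
    calls = [json.loads(m)
             for m in re.findall(r"<tool_call>(.*?)</tool_call>", output, re.DOTALL)]
    if not calls:
        return None       # plain text -> TextResponse case
    if len(calls) == 1:
        return calls[0]   # FunctionCallResponse case
    return calls          # ParallelFunctionCallResponse case

out = ('<tool_call>{"name": "set_title", "args": {"text": "Hi"}}</tool_call>'
       '<tool_call>{"name": "set_background", "args": {"color": "blue"}}</tool_call>')
print(parse_tool_calls(out))  # two parallel calls
```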

**Step 4: Execute Function and Send Response Back**
@@ -1682,6 +1705,9 @@ double cosineSimilarity(List<double> a, List<double> b) {
final similarity = cosineSimilarity(embeddings[0], embeddings[1]);
print('Similarity: $similarity');

// Note: EmbeddingGemma and Gecko return L2-normalized vectors (‖v‖ ≈ 1.0),
// so dot product alone equals cosine similarity — you can skip normalization.

// Close model when done
await embeddingModel.close();
```
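The normalization note above is easy to verify numerically. A small Python sketch (illustrative; it mirrors the Dart `cosineSimilarity` helper rather than calling the package):

```python
import math

def l2_normalize(v):
    """Scale v to unit length (L2 norm = 1)."""
    norm = math.sqrt(sum(x * x for x in v))
    return [x / norm for x in v]

def cosine(a, b):
    """Classic cosine similarity: dot product over the product of norms."""
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))

a = l2_normalize([0.3, -1.2, 0.8])
b = l2_normalize([0.1, -0.9, 1.1])

dot = sum(x * y for x, y in zip(a, b))
# For unit vectors the plain dot product IS the cosine similarity.
assert abs(dot - cosine(a, b)) < 1e-12
```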
@@ -57,7 +57,7 @@ class LiteRtLmEngine(
// Map PreferredBackend to LiteRT-LM Backend
val backend = when (config.preferredBackend) {
PreferredBackend.GPU -> Backend.GPU()
-PreferredBackend.NPU -> Backend.NPU()
+PreferredBackend.NPU -> Backend.NPU(nativeLibraryDir = context.applicationInfo.nativeLibraryDir)
PreferredBackend.CPU,
null -> Backend.CPU()
}
1 change: 1 addition & 0 deletions example/.gitignore
@@ -51,4 +51,5 @@ app.*.map.json
*.task
*.bin
*.tflite
*.litertlm
*.safetensors
17 changes: 8 additions & 9 deletions example/integration_test/embedding_dual_prefix_test.dart
@@ -1,16 +1,16 @@
// Integration test: compare single-prefix vs dual-prefix RAG ranking.
-// Run on macOS: flutter test integration_test/embedding_dual_prefix_test.dart -d macos --dart-define=HF_TOKEN=...
-// Run on Android: flutter test integration_test/embedding_dual_prefix_test.dart -d <device_id> --dart-define=HF_TOKEN=...
+// Uses model from assets (no network download).
+// Run on macOS: flutter test integration_test/embedding_dual_prefix_test.dart -d macos
+// Run on Android: flutter test integration_test/embedding_dual_prefix_test.dart -d <device_id>

import 'dart:math' as math;
import 'package:flutter_test/flutter_test.dart';
import 'package:integration_test/integration_test.dart';
import 'package:flutter_gemma/flutter_gemma.dart';

-const _modelUrl =
-    'https://huggingface.co/litert-community/embeddinggemma-300m/resolve/main/embeddinggemma-300M_seq256_mixed-precision.tflite';
-const _tokenizerUrl =
-    'https://huggingface.co/litert-community/embeddinggemma-300m/resolve/main/sentencepiece.model';
+const _modelPath =
+    'assets/models/embeddinggemma-300M_seq256_mixed-precision.tflite';
+const _tokenizerPath = 'assets/models/sentencepiece.model';

const _documents = {
'flutter_intro': 'Flutter is an open-source UI framework by Google for building natively compiled applications for mobile, web, and desktop from a single codebase.',
@@ -31,10 +31,9 @@ void main() {
testWidgets('Dual prefix vs single prefix RAG ranking', (tester) async {
await FlutterGemma.initialize();

-final hfToken = const String.fromEnvironment('HF_TOKEN');
 await FlutterGemma.installEmbedder()
-    .modelFromNetwork(_modelUrl, token: hfToken.isNotEmpty ? hfToken : null)
-    .tokenizerFromNetwork(_tokenizerUrl, token: hfToken.isNotEmpty ? hfToken : null)
+    .modelFromAsset(_modelPath)
+    .tokenizerFromAsset(_tokenizerPath)
.install();

final model = await FlutterGemma.getActiveEmbedder();
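For background on what "dual prefix" means in this test: EmbeddingGemma-style models distinguish queries from documents by the prompt prefix the text is embedded with. The exact prefix strings below follow the commonly documented format, but treat them as an assumption rather than the package's literal implementation:

```python
def embed_input(text: str, task: str, title: str = "none") -> str:
    """Build the prefixed string fed to the embedding model.

    task="query"    -> TaskType.retrievalQuery style prefix
    task="document" -> TaskType.retrievalDocument style prefix
    """
    if task == "query":
        return f"task: search result | query: {text}"
    return f"title: {title} | text: {text}"

print(embed_input("what is flutter", "query"))
print(embed_input("Flutter is an open-source UI framework...", "document"))
```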
17 changes: 7 additions & 10 deletions example/integration_test/embedding_rag_diagnosis_test.dart
@@ -1,15 +1,15 @@
-// Integration test: diagnose RAG ranking quality with EmbeddingGemma on desktop.
+// Integration test: diagnose RAG ranking quality with EmbeddingGemma.
+// Uses model from assets (no network download).
// Run: flutter test integration_test/embedding_rag_diagnosis_test.dart -d macos

import 'dart:math' as math;
import 'package:flutter_test/flutter_test.dart';
import 'package:integration_test/integration_test.dart';
import 'package:flutter_gemma/flutter_gemma.dart';

-const _modelUrl =
-    'https://huggingface.co/litert-community/embeddinggemma-300m/resolve/main/embeddinggemma-300M_seq256_mixed-precision.tflite';
-const _tokenizerUrl =
-    'https://huggingface.co/litert-community/embeddinggemma-300m/resolve/main/sentencepiece.model';
+const _modelPath =
+    'assets/models/embeddinggemma-300M_seq256_mixed-precision.tflite';
+const _tokenizerPath = 'assets/models/sentencepiece.model';

const _documents = {
'flutter_intro': 'Flutter is an open-source UI framework by Google for building natively compiled applications for mobile, web, and desktop from a single codebase.',
@@ -30,16 +30,14 @@
testWidgets('RAG ranking diagnosis with EmbeddingGemma', (tester) async {
await FlutterGemma.initialize();

-final hfToken = const String.fromEnvironment('HF_TOKEN');
 await FlutterGemma.installEmbedder()
-    .modelFromNetwork(_modelUrl, token: hfToken.isNotEmpty ? hfToken : null)
-    .tokenizerFromNetwork(_tokenizerUrl, token: hfToken.isNotEmpty ? hfToken : null)
+    .modelFromAsset(_modelPath)
+    .tokenizerFromAsset(_tokenizerPath)
.install();

final model = await FlutterGemma.getActiveEmbedder();

try {
// Test: standard (prefix already in DesktopEmbeddingModel)
print('\n\n========== RAW embeddings ==========');
final docEmbRaw = <String, List<double>>{};
for (final entry in _documents.entries) {
@@ -50,7 +48,6 @@
_printRanking(query, queryEmb, docEmbRaw);
}

// Test: L2 normalized embeddings
print('\n\n========== L2 NORMALIZED embeddings ==========');
final docEmbNorm = <String, List<double>>{};
for (final entry in docEmbRaw.entries) {
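The RAW-versus-L2-NORMALIZED comparison this test prints relies on a simple fact: cosine similarity already divides out vector length, so rankings are unchanged by normalization, while dot products on normalized vectors become cosine scores. A quick Python check with made-up vectors (illustrative, not real embeddings):

```python
import math

def l2(v):
    n = math.sqrt(sum(x * x for x in v))
    return [x / n for x in v]

def dot(a, b):
    return sum(x * y for x, y in zip(a, b))

def cosine(a, b):
    return dot(a, b) / (math.sqrt(dot(a, a)) * math.sqrt(dot(b, b)))

query = [0.2, 0.9, -0.4]
docs = {"doc1": [0.1, 1.0, -0.2], "doc2": [-0.8, 0.3, 0.5], "doc3": [0.4, 0.7, -0.6]}

# Rank by cosine on raw vectors vs. dot product on normalized vectors.
rank_cosine = sorted(docs, key=lambda k: cosine(query, docs[k]), reverse=True)
rank_dot_norm = sorted(docs, key=lambda k: dot(l2(query), l2(docs[k])), reverse=True)
assert rank_cosine == rank_dot_norm  # same ranking either way
```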
28 changes: 9 additions & 19 deletions example/integration_test/embedding_score_comparison_test.dart
@@ -1,40 +1,31 @@
// Cross-platform embedding score comparison test.
-// Downloads model from network, runs same texts, prints scores.
-// Run on Android: flutter test integration_test/embedding_score_comparison_test.dart -d emulator-5554 --dart-define=HF_TOKEN=...
-// Run on macOS: flutter test integration_test/embedding_score_comparison_test.dart -d macos --dart-define=HF_TOKEN=...
+// Uses model from assets (no network download).
+// Run on Android: flutter test integration_test/embedding_score_comparison_test.dart -d <device>
+// Run on macOS: flutter test integration_test/embedding_score_comparison_test.dart -d macos

import 'dart:math' as math;
import 'package:flutter_test/flutter_test.dart';
import 'package:integration_test/integration_test.dart';
import 'package:flutter_gemma/flutter_gemma.dart';

-// EmbeddingGemma 300M seq256 — gated model, requires HF token
-const _modelUrl =
-    'https://huggingface.co/litert-community/embeddinggemma-300m/resolve/main/embeddinggemma-300M_seq256_mixed-precision.tflite';
-const _tokenizerUrl =
-    'https://huggingface.co/litert-community/embeddinggemma-300m/resolve/main/sentencepiece.model';
+const _modelPath =
+    'assets/models/embeddinggemma-300M_seq256_mixed-precision.tflite';
+const _tokenizerPath = 'assets/models/sentencepiece.model';

void main() {
IntegrationTestWidgetsFlutterBinding.ensureInitialized();

testWidgets('Embedding score comparison', (WidgetTester tester) async {
-final hfToken = const String.fromEnvironment('HF_TOKEN');
-
 await FlutterGemma.initialize();
 
 await FlutterGemma.installEmbedder()
-    .modelFromNetwork(_modelUrl,
-        token: hfToken.isNotEmpty ? hfToken : null)
-    .tokenizerFromNetwork(_tokenizerUrl,
-        token: hfToken.isNotEmpty ? hfToken : null)
-    .withModelProgress(
-        (progress) => print('[model] $progress%'))
+    .modelFromAsset(_modelPath)
+    .tokenizerFromAsset(_tokenizerPath)
.install();

final model = await FlutterGemma.getActiveEmbedder();

try {
// Same texts as desktop_embedding_test
final queryEmb = await model
.generateEmbedding('Which planet is known as the Red Planet');
final similarEmb = await model
@@ -51,13 +42,12 @@
print('Gap: ${simScore - diffScore}');
print('Dimension: ${queryEmb.length}');

// Basic sanity
expect(queryEmb.length, equals(768));
expect(diffScore, lessThan(simScore));
} finally {
await model.close();
}
-}, timeout: const Timeout(Duration(minutes: 15)));
+}, timeout: const Timeout(Duration(minutes: 10)));
}

double _cosineSimilarity(List<double> a, List<double> b) {
23 changes: 6 additions & 17 deletions example/integration_test/embedding_vector_comparison_test.dart
@@ -1,4 +1,5 @@
// Integration test: compare embedding vectors across platforms.
// Uses model from assets (no network download).
// Run on macOS: flutter test integration_test/embedding_vector_comparison_test.dart -d macos
// Run on Android: flutter test integration_test/embedding_vector_comparison_test.dart -d <device_id>

@@ -7,11 +8,9 @@ import 'package:flutter_test/flutter_test.dart';
import 'package:integration_test/integration_test.dart';
import 'package:flutter_gemma/flutter_gemma.dart';

-// EmbeddingGemma 300M seq256 — same model used in RAG example
-const _modelUrl =
-    'https://huggingface.co/litert-community/embeddinggemma-300m/resolve/main/embeddinggemma-300M_seq256_mixed-precision.tflite';
-const _tokenizerUrl =
-    'https://huggingface.co/litert-community/embeddinggemma-300m/resolve/main/sentencepiece.model';
+const _modelPath =
+    'assets/models/embeddinggemma-300M_seq256_mixed-precision.tflite';
+const _tokenizerPath = 'assets/models/sentencepiece.model';

void main() {
IntegrationTestWidgetsFlutterBinding.ensureInitialized();
@@ -20,21 +19,15 @@ void main() {
final platform = Platform.operatingSystem;
print('=== Platform: $platform ===');

-// 1. Initialize
 await FlutterGemma.initialize();
 
-// 2. Install embedding model
-final hfToken = const String.fromEnvironment('HF_TOKEN');
 await FlutterGemma.installEmbedder()
-    .modelFromNetwork(_modelUrl, token: hfToken.isNotEmpty ? hfToken : null)
-    .tokenizerFromNetwork(_tokenizerUrl, token: hfToken.isNotEmpty ? hfToken : null)
-    .withModelProgress((p) => print('[Download] $p%'))
+    .modelFromAsset(_modelPath)
+    .tokenizerFromAsset(_tokenizerPath)
     .install();
 
-// 3. Create embedding model
 final embedder = await FlutterGemma.getActiveEmbedder();
 
-// 4. Generate embeddings for test phrases
final testPhrases = [
'Hello world',
'The cat sat on the mat',
@@ -45,12 +38,9 @@
final embedding = await embedder.generateEmbedding(phrase);
final dim = embedding.length;

// Log first 10 values
final first10 = embedding.take(10).map((v) => v.toStringAsFixed(6)).join(', ');
// Log last 5 values
final last5 = embedding.skip(dim - 5).map((v) => v.toStringAsFixed(6)).join(', ');

// Compute L2 norm
double norm = 0;
for (final v in embedding) {
norm += v * v;
@@ -64,7 +54,6 @@
print('[$platform] last5: [$last5]');
}

// 5. Cosine similarity between phrases
final emb1 = await embedder.generateEmbedding(testPhrases[0]);
final emb2 = await embedder.generateEmbedding(testPhrases[1]);
final emb3 = await embedder.generateEmbedding(testPhrases[2]);