Commit 154f61d

Michael van den Berg and claude authored and committed

feat: switch default embedding model to BAAI/bge-m3 for multilingual support

bge-large-en-v1.5 is English-only. bge-m3 supports 100+ languages (including German) with the same 1024-dim output — no schema change needed. Also fix prefix logic: bge-m3 is symmetric (no query prefix), unlike the English BGE models, which require a retrieval instruction prefix.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

1 parent 638de2b

5 files changed: 24 additions & 14 deletions

.env.example (2 additions, 2 deletions)

```diff
@@ -13,9 +13,9 @@ POSTGRES_DB=graphrag
 POSTGRES_USER=graphrag
 
 # Embedding model (HuggingFace model ID)
-# BAAI/bge-large-en-v1.5 → 1024-dim, great quality
+# BAAI/bge-m3 → 1024-dim, great quality
 # Change here AND update vector(1024) in 02_schema.sql if using a different model
-EMBEDDING_MODEL=BAAI/bge-large-en-v1.5
+EMBEDDING_MODEL=BAAI/bge-m3
 EMBEDDING_DEVICE=cuda  # set to "cpu" if no GPU is available
 EMBEDDING_BATCH_SIZE=32
```

README.md (2 additions, 2 deletions)

```diff
@@ -74,13 +74,13 @@ Key variables:
 | Variable | Default | Description |
 |---|---|---|
 | `POSTGRES_PASSWORD` | *(required)* | PostgreSQL password |
-| `EMBEDDING_MODEL` | `BAAI/bge-large-en-v1.5` | HuggingFace model ID |
+| `EMBEDDING_MODEL` | `BAAI/bge-m3` | HuggingFace model ID |
 | `EMBEDDING_DEVICE` | `cuda` | `cuda` or `cpu` |
 | `MCP_PORT` | `8000` | MCP server port |
 
 ## Embedding model
 
-Default: **`BAAI/bge-large-en-v1.5`** (1024-dim, asymmetric retrieval).
+Default: **`BAAI/bge-m3`** (1024-dim, multilingual, 100+ languages including German).
 Optimised for H100/H200/RTX 6000 Ada GPU hardware.
 
 If you change `EMBEDDING_MODEL` to a model with different output dimensions, update `vector(1024)` in `docker/postgres/init/02_schema.sql` accordingly and recreate the database volume.
```

docker/postgres/init/02_schema.sql (1 addition, 1 deletion)

```diff
@@ -24,7 +24,7 @@ CREATE TABLE graphrag.chunks (
     position INTEGER NOT NULL,  -- ordinal within document (0-based)
     content TEXT NOT NULL,      -- raw markdown text of the chunk
     token_count INTEGER,        -- approximate token count
-    -- BAAI/bge-large-en-v1.5 produces 1024-dimensional embeddings.
+    -- BAAI/bge-m3 produces 1024-dimensional embeddings.
     -- If you change EMBEDDING_MODEL to one with different dimensions,
     -- update this column type and recreate the index accordingly.
     embedding vector(1024) NOT NULL,
```
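The model default and the `vector(1024)` column must stay in sync, which the schema comment above can only document, not enforce. A minimal sketch of a startup guard (hypothetical — `check_embedding_dim` and `KNOWN_MODEL_DIMS` are not part of this commit) could fail fast on a mismatch; both models named in the diff emit 1024-dim vectors:

```python
# Hypothetical fail-fast guard for the model/schema dimension coupling.
# Both models in this commit emit 1024-dim vectors, matching vector(1024)
# in 02_schema.sql, so the default configuration passes.
SCHEMA_DIM = 1024  # must mirror the vector(1024) column type

KNOWN_MODEL_DIMS = {
    "BAAI/bge-m3": 1024,
    "BAAI/bge-large-en-v1.5": 1024,
}

def check_embedding_dim(model_name: str, schema_dim: int = SCHEMA_DIM) -> None:
    """Raise before any writes if a known model disagrees with the schema."""
    dim = KNOWN_MODEL_DIMS.get(model_name)
    if dim is not None and dim != schema_dim:
        raise ValueError(
            f"{model_name} emits {dim}-dim vectors but chunks.embedding is "
            f"vector({schema_dim}); update 02_schema.sql and recreate the "
            "database volume."
        )

check_embedding_dim("BAAI/bge-m3")  # passes: 1024 == 1024
```

Unknown models fall through silently here; a stricter variant could instead query the loaded model's output dimension at startup.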

src/graphrag/config.py (2 additions, 2 deletions)

```diff
@@ -11,10 +11,10 @@ class Settings(BaseSettings):
     postgres_password: str  # required — no default
 
     # ── Embeddings ────────────────────────────────────────────────────────────
-    # BAAI/bge-large-en-v1.5 → 1024-dim, high quality, asymmetric retrieval
+    # BAAI/bge-m3 → 1024-dim, high quality, symmetric retrieval (no query prefix)
     # If you change this to a model with different output dimensions, you must
     # also update the vector(1024) column in 02_schema.sql and recreate the DB.
-    embedding_model: str = "BAAI/bge-large-en-v1.5"
+    embedding_model: str = "BAAI/bge-m3"
     embedding_device: str = "cuda"  # "cpu" for CPU-only environments
     embedding_batch_size: int = 32
```

src/graphrag/embeddings/embedder.py (17 additions, 7 deletions)

```diff
@@ -2,12 +2,16 @@
 
 BGE asymmetric retrieval
 ────────────────────────
-BAAI/bge-large-en-v1.5 uses *asymmetric* embeddings for retrieval:
+BGE English models (bge-large-en-v1.5, bge-base-en-v1.5, …) use asymmetric
+embeddings for retrieval:
 - Document chunks are embedded as-is (no prefix).
 - Queries must be prefixed with the instruction string below.
 
-Skipping the query prefix significantly degrades recall. The prefix is applied
-automatically in ``embed_query()``. Never use ``embed()`` for query strings.
+BGE-M3 (multilingual, 100+ languages incl. German) does NOT use a query
+prefix — both queries and passages are embedded identically.
+
+The correct prefix behaviour is selected automatically in ``embed_query()``
+based on the model name. Never call ``embed()`` with raw query strings.
 """
 
 from __future__ import annotations
@@ -24,7 +28,7 @@
 class Embedder:
     def __init__(
         self,
-        model_name: str = "BAAI/bge-large-en-v1.5",
+        model_name: str = "BAAI/bge-m3",
         device: str = "cuda",
         batch_size: int = 32,
     ) -> None:
@@ -52,7 +56,7 @@ def embed(self, texts: list[str]) -> list[list[float]]:
         return [v.tolist() for v in vectors]
 
     def embed_query(self, text: str) -> list[float]:
-        """Embed a query string, applying the BGE retrieval prefix."""
+        """Embed a query string, applying a retrieval prefix where required."""
        prefixed = _bge_prefix(self._model_name, text)
        return self.embed([prefixed])[0]
 
@@ -62,7 +66,13 @@ def dimensions(self) -> int:
 
 
 def _bge_prefix(model_name: str, text: str) -> str:
-    """Apply query prefix for BGE-family models; pass through for others."""
-    if "bge" in model_name.lower():
+    """Apply query prefix where the model requires it.
+
+    - BGE English models (bge-large-en, bge-base-en, …): need the prefix.
+    - BGE-M3 (multilingual): symmetric — no prefix for either queries or docs.
+    - All other models: passed through unchanged.
+    """
+    name = model_name.lower()
+    if "bge" in name and "m3" not in name:
         return _BGE_QUERY_PREFIX + text
     return text
```
