Commit 5631b2e

Sigrid Jin authored and committed
fix: XProvence context pruning with bfloat16 and flash_attn compatibility
1 parent fc4f021 commit 5631b2e

File tree

2 files changed: 70 additions & 31 deletions

backends/python/server/text_embeddings_server/models/__init__.py

Lines changed: 15 additions & 6 deletions
@@ -12,11 +12,19 @@
 from text_embeddings_server.models.default_model import DefaultModel
 from text_embeddings_server.models.classification_model import ClassificationModel
 from text_embeddings_server.models.xprovence_model import XProvenceModel
-from text_embeddings_server.models.jinaBert_model import FlashJinaBert
-from text_embeddings_server.models.flash_mistral import FlashMistral
-from text_embeddings_server.models.flash_qwen3 import FlashQwen3
 from text_embeddings_server.utils.device import get_device, use_ipex
 
+# Flash attention models are optional (require flash_attn)
+FlashJinaBert = None
+FlashMistral = None
+FlashQwen3 = None
+try:
+    from text_embeddings_server.models.jinaBert_model import FlashJinaBert
+    from text_embeddings_server.models.flash_mistral import FlashMistral
+    from text_embeddings_server.models.flash_qwen3 import FlashQwen3
+except ImportError as e:
+    logger.warning(f"Flash attention models not available: {e}")
+
 __all__ = ["Model"]
 
 TRUST_REMOTE_CODE = os.getenv("TRUST_REMOTE_CODE", "false").lower() in ["true", "1"]
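
The guard above follows a common optional-dependency pattern: bind module-level sentinels first, then overwrite them inside a try/except ImportError so the server still starts when flash_attn is missing. A minimal standalone sketch of the same idea (the optional_backend module name is hypothetical, not part of TEI):

import logging

logger = logging.getLogger(__name__)

# Sentinel stays None unless the optional dependency imports cleanly.
OptionalBackend = None
try:
    from optional_backend import OptionalBackend  # hypothetical optional package
except ImportError as e:
    logger.warning(f"optional_backend not available: {e}")

def backend_name() -> str:
    # Callers branch on the sentinel instead of re-importing.
    return "optional" if OptionalBackend is not None else "default"

print(backend_name())  # prints "default" unless optional_backend is installed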
@@ -86,7 +94,8 @@ def get_model(model_path: Path, dtype: Optional[str], pool: str):
         return XProvenceModel(model_path, device, datatype, trust_remote=True)
 
     if (
-        hasattr(config, "auto_map")
+        FlashJinaBert is not None
+        and hasattr(config, "auto_map")
         and isinstance(config.auto_map, dict)
         and "AutoModel" in config.auto_map
         and config.auto_map["AutoModel"]
@@ -126,13 +135,13 @@ def get_model(model_path: Path, dtype: Optional[str], pool: str):
         else:
             return create_model(DefaultModel, model_path, device, datatype, pool)
 
-    if config.model_type == "mistral" and device.type == "hpu":
+    if FlashMistral is not None and config.model_type == "mistral" and device.type == "hpu":
         try:
             return create_model(FlashMistral, model_path, device, datatype, pool)
         except FileNotFoundError:
             return create_model(DefaultModel, model_path, device, datatype, pool)
 
-    if config.model_type == "qwen3" and device.type == "hpu":
+    if FlashQwen3 is not None and config.model_type == "qwen3" and device.type == "hpu":
         try:
             return create_model(FlashQwen3, model_path, device, datatype, pool)
         except FileNotFoundError:
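
With the `is not None` guards in place, a missing flash model simply makes the branch short-circuit and later code falls through to the default implementation. A rough illustrative sketch of that dispatch (the classes and function are stand-ins, not the real TEI `get_model`, which has more branches):

FlashMistralStub = None  # would be set by the optional import block above


class DefaultModelStub:
    pass


def pick_model(model_type: str, device_type: str):
    # When the optional backend never imported, the guard fails fast
    # and the request falls through to the default model.
    if FlashMistralStub is not None and model_type == "mistral" and device_type == "hpu":
        return FlashMistralStub
    return DefaultModelStub


print(pick_model("mistral", "hpu"))  # <class '__main__.DefaultModelStub'>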

backends/python/server/text_embeddings_server/models/xprovence_model.py

Lines changed: 55 additions & 25 deletions
@@ -46,6 +46,13 @@ def __init__(
     ):
         # XProvence requires AutoModel with trust_remote_code=True
        model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+
+        # XProvence's process() method doesn't support bfloat16,
+        # so we use float32 for full pruning support
+        if dtype == torch.bfloat16:
+            logger.info("XProvence: using float32 instead of bfloat16 for process() compatibility")
+            dtype = torch.float32
+
         model = model.to(dtype).to(device)
 
         self.hidden_size = model.config.hidden_size
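
The comment above only states that process() does not handle bfloat16. One plausible failure mode (an assumption here, not stated in the commit) is a round trip through NumPy, which has no bfloat16 dtype, so keeping the model in float32 avoids the conversion error:

import torch

t_bf16 = torch.tensor([0.5, 0.25], dtype=torch.bfloat16)

# NumPy has no bfloat16 dtype, so converting raises a TypeError,
# while a float32 copy converts cleanly.
try:
    t_bf16.numpy()
except TypeError as err:
    print(f"bfloat16 -> numpy failed: {err}")

print(t_bf16.to(torch.float32).numpy())  # array([0.5 , 0.25], dtype=float32)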
@@ -105,18 +112,20 @@ def embed(self, batch: PaddedBatch) -> List[Embedding]:
     @tracer.start_as_current_span("predict")
     def predict(self, batch: PaddedBatch) -> List[Score]:
         """
-        XProvence prediction with context pruning.
+        XProvence prediction with context pruning support.
 
-        If raw_query and raw_text are provided, uses XProvence's process()
-        method to perform sentence-level context pruning.
-        Otherwise, falls back to standard reranking without pruning.
+        For single-item batches with raw_query/raw_text available,
+        uses XProvence's process() method for sentence-level pruning.
+        Otherwise falls back to standard forward pass.
         """
-        # Check if raw text is available for XProvence processing
-        if batch.raw_query and batch.raw_text:
+        batch_size = len(batch)
+
+        # Use pruning only for single-item batches with raw text
+        if batch_size == 1 and batch.raw_query and batch.raw_text:
             return self._predict_with_pruning(batch.raw_query, batch.raw_text)
-        else:
-            # Fallback: standard forward pass without pruning
-            return self._predict_standard(batch)
+
+        # Multi-item batches or no raw text: use standard forward pass
+        return self._predict_standard(batch)
 
     def _predict_with_pruning(self, raw_query: str, raw_text: str) -> List[Score]:
         """
@@ -125,12 +134,24 @@ def _predict_with_pruning(self, raw_query: str, raw_text: str) -> List[Score]:
         Returns score with pruned_text containing only relevant sentences.
         """
         try:
-            output = self.model.process(
-                raw_query,
-                raw_text,
-                threshold=self.threshold,
-                always_select_title=self.always_select_title,
-            )
+            # Disable tqdm progress bar to avoid broken pipe errors
+            # when stdout/stderr is captured by the Rust process
+            os.environ["TQDM_DISABLE"] = "1"
+
+            # Force float32 for XProvence's process() method
+            # which creates internal tensors and doesn't support bfloat16
+            original_dtype = torch.get_default_dtype()
+            torch.set_default_dtype(torch.float32)
+
+            try:
+                output = self.model.process(
+                    raw_query,
+                    raw_text,
+                    threshold=self.threshold,
+                    always_select_title=self.always_select_title,
+                )
+            finally:
+                torch.set_default_dtype(original_dtype)
 
             reranking_score = float(output["reranking_score"])
             pruned_context = output["pruned_context"]
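
The save/override/restore around process() could equally be packaged as a context manager. A sketch under that assumption (default_dtype is a hypothetical helper, not part of the commit), shown with a commented-out usage line because self.model.process stands in for the call above:

from contextlib import contextmanager

import torch


@contextmanager
def default_dtype(dtype: torch.dtype):
    # Temporarily override torch's global default dtype and restore it
    # even if the wrapped call raises, mirroring the try/finally above.
    previous = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    try:
        yield
    finally:
        torch.set_default_dtype(previous)


# Usage sketch:
# with default_dtype(torch.float32):
#     output = self.model.process(raw_query, raw_text, threshold=0.1)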
@@ -157,17 +178,26 @@ def _predict_standard(self, batch: PaddedBatch) -> List[Score]:
 
         output = self.model(**kwargs, return_dict=True)
 
-        # XProvence forward returns ranking logits at position 0
-        if hasattr(output, "logits"):
-            logits = output.logits
+        # XProvence returns RankingCompressionOutput with ranking_scores
+        if hasattr(output, "ranking_scores"):
+            scores_tensor = output.ranking_scores
+        elif hasattr(output, "logits"):
+            # Fallback for standard classification models
+            scores_tensor = output.logits[:, 0] if output.logits.dim() == 2 else output.logits
         else:
-            # Assume first element is ranking logits
-            logits = output[0]
+            # Assume first element is scores
+            scores_tensor = output[0]
 
-        # Extract scores (first column if multi-dimensional)
-        if logits.dim() == 2 and logits.size(1) >= 1:
-            scores = logits[:, 0].tolist()
+        # Handle scalar (batch_size=1) vs tensor (batch_size>1)
+        if scores_tensor.dim() == 0:
+            # Scalar - single item batch
+            scores = [float(scores_tensor.item())]
         else:
-            scores = logits.tolist()
+            # 1D tensor - multiple items
+            scores = scores_tensor.view(-1).tolist()
+
+        # Ensure scores is a list
+        if isinstance(scores, float):
+            scores = [scores]
 
-        return [Score(values=[s], pruned_text=None) for s in scores]
+        return [Score(values=[float(s)], pruned_text=None) for s in scores]
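
A standalone version of the scalar-vs-1D normalization above, using a hypothetical helper name, shows the two shapes the reranker output can take:

from typing import List

import torch


def to_score_list(scores_tensor: torch.Tensor) -> List[float]:
    # 0-d tensor: single-item batch; 1-d tensor: one score per item.
    if scores_tensor.dim() == 0:
        return [float(scores_tensor.item())]
    return [float(s) for s in scores_tensor.view(-1).tolist()]


print(to_score_list(torch.tensor(0.83)))           # single score -> one-element list
print(to_score_list(torch.tensor([0.83, -1.20])))  # one score per batch item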
