
Commit fc4f021

Sigrid Jin authored and committed
feat: xprovenance
1 parent 78502d8 commit fc4f021

12 files changed (+274, -11 lines)


backends/core/src/lib.rs

Lines changed: 14 additions & 1 deletion
@@ -14,6 +14,10 @@ pub struct Batch {
     pub max_length: u32,
     pub pooled_indices: Vec<u32>,
     pub raw_indices: Vec<u32>,
+    /// XProvence: raw query texts for context pruning
+    pub raw_queries: Vec<Option<String>>,
+    /// XProvence: raw context texts for context pruning
+    pub raw_texts: Vec<Option<String>>,
 }
 
 impl Batch {
@@ -32,7 +36,16 @@ pub enum Embedding {
 }
 
 pub type Embeddings = IntMap<usize, Embedding>;
-pub type Predictions = IntMap<usize, Vec<f32>>;
+
+/// XProvence: Prediction result containing scores and optional pruned text
+#[derive(Debug, Clone)]
+pub struct Prediction {
+    pub scores: Vec<f32>,
+    /// XProvence: pruned context text after removing irrelevant sentences
+    pub pruned_text: Option<String>,
+}
+
+pub type Predictions = IntMap<usize, Prediction>;
 
 pub trait Backend {
     fn health(&self) -> Result<(), BackendError>;

backends/grpc-client/src/client.rs

Lines changed: 4 additions & 0 deletions
@@ -73,13 +73,17 @@ impl Client {
         position_ids: Vec<u32>,
         cu_seq_lengths: Vec<u32>,
         max_length: u32,
+        raw_query: Option<String>,
+        raw_text: Option<String>,
     ) -> Result<Vec<Score>> {
         let request = tonic::Request::new(EmbedRequest {
             input_ids,
             token_type_ids,
             position_ids,
             max_length,
             cu_seq_lengths,
+            raw_query,
+            raw_text,
         })
         .inject_context();
         let response = self.stub.predict(request).await?.into_inner();

backends/proto/embed.proto

Lines changed: 6 additions & 0 deletions
@@ -21,6 +21,10 @@ message EmbedRequest {
     repeated uint32 cu_seq_lengths = 4;
     /// Length of the longest request
     uint32 max_length = 5;
+    /// XProvence: raw query text for context pruning
+    optional string raw_query = 6;
+    /// XProvence: raw context text for context pruning
+    optional string raw_text = 7;
 }
 
 message Embedding {
@@ -33,6 +37,8 @@ message EmbedResponse {
 
 message Score {
     repeated float values = 1;
+    /// XProvence: pruned context text after removing irrelevant sentences
+    optional string pruned_text = 2;
 }
 
 message PredictResponse {
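
For illustration, a minimal Python sketch of an EmbedRequest carrying the two new optional fields; the generated module name (embed_pb2) and the token values are assumptions for the example, not part of this commit.

# Minimal sketch, assuming Python stubs have been generated from embed.proto;
# the module name and the token ids below are illustrative only.
import embed_pb2

request = embed_pb2.EmbedRequest(
    input_ids=[0, 9374, 83, 2],            # pre-tokenized query + passage
    token_type_ids=[0, 0, 0, 0],
    position_ids=[0, 1, 2, 3],
    cu_seq_lengths=[0, 4],                  # cumulative sequence lengths
    max_length=4,                           # length of the longest request
    raw_query="what does xprovence do?",    # new optional field 6
    raw_text="XProvence prunes context. It also reranks.",  # new optional field 7
)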

backends/python/server/text_embeddings_server/models/__init__.py

Lines changed: 10 additions & 0 deletions
@@ -11,6 +11,7 @@
 from text_embeddings_server.models.masked_model import MaskedLanguageModel
 from text_embeddings_server.models.default_model import DefaultModel
 from text_embeddings_server.models.classification_model import ClassificationModel
+from text_embeddings_server.models.xprovence_model import XProvenceModel
 from text_embeddings_server.models.jinaBert_model import FlashJinaBert
 from text_embeddings_server.models.flash_mistral import FlashMistral
 from text_embeddings_server.models.flash_qwen3 import FlashQwen3
@@ -75,6 +76,15 @@ def get_model(model_path: Path, dtype: Optional[str], pool: str):
 
     config = AutoConfig.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE)
 
+    # XProvence: Check for XProvence architecture (context pruning reranker)
+    if (
+        hasattr(config, "architectures")
+        and config.architectures
+        and "XProvence" in config.architectures[0]
+    ):
+        logger.info("Detected XProvence model for context pruning")
+        return XProvenceModel(model_path, device, datatype, trust_remote=True)
+
     if (
         hasattr(config, "auto_map")
         and isinstance(config.auto_map, dict)

backends/python/server/text_embeddings_server/models/types.py

Lines changed: 9 additions & 0 deletions
@@ -36,6 +36,9 @@ class PaddedBatch(Batch):
     token_type_ids: torch.Tensor
     position_ids: torch.Tensor
     attention_mask: torch.Tensor
+    # XProvence: raw text for context pruning
+    raw_query: str = None
+    raw_text: str = None
 
     @classmethod
     @tracer.start_as_current_span("from_pb")
@@ -77,11 +80,17 @@ def from_pb(
         # Move padded tensors all at once
         all_tensors = all_tensors.to(device)
 
+        # XProvence: Extract raw text if present in proto
+        raw_query = pb.raw_query if hasattr(pb, 'raw_query') and pb.raw_query else None
+        raw_text = pb.raw_text if hasattr(pb, 'raw_text') and pb.raw_text else None
+
         return PaddedBatch(
             input_ids=all_tensors[0],
             token_type_ids=all_tensors[1],
             position_ids=all_tensors[2],
             attention_mask=all_tensors[3],
+            raw_query=raw_query,
+            raw_text=raw_text,
        )
 
     def __len__(self):
backends/python/server/text_embeddings_server/models/xprovence_model.py

Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
+import os
+import torch
+
+from pathlib import Path
+from typing import Type, List
+from transformers import AutoModel
+from opentelemetry import trace
+from loguru import logger
+
+from text_embeddings_server.models.model import Model
+from text_embeddings_server.models.types import PaddedBatch, Embedding, Score
+
+tracer = trace.get_tracer(__name__)
+
+
+def _parse_bool(value: str) -> bool:
+    """Parse boolean from string with common conventions."""
+    return str(value).lower() in ("true", "1", "t", "yes", "on")
+
+
+class XProvenceModel(Model):
+    """
+    XProvence: Zero-cost context pruning model for RAG.
+
+    XProvence removes irrelevant sentences from passages based on relevance
+    to the query, returning both a reranking score and pruned context.
+
+    Based on bge-reranker-v2-m3 (XLM-RoBERTa), supports 16+ languages.
+
+    Environment Variables:
+        XPROVENCE_THRESHOLD (float): Pruning threshold between 0.0-1.0.
+            - 0.3 (default): Conservative pruning, minimal performance drop
+            - 0.7: Aggressive pruning, higher compression
+        XPROVENCE_ALWAYS_SELECT_TITLE (bool): Keep first sentence as title.
+            - true (default): Always include first sentence (useful for Wikipedia)
+            - false: Only include sentences above threshold
+    """
+
+    def __init__(
+        self,
+        model_path: Path,
+        device: torch.device,
+        dtype: torch.dtype,
+        pool: str = "cls",
+        trust_remote: bool = True,
+    ):
+        # XProvence requires AutoModel with trust_remote_code=True
+        model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to(dtype).to(device)
+
+        self.hidden_size = model.config.hidden_size
+
+        # XProvence is based on XLM-RoBERTa
+        position_offset = 0
+        model_type = model.config.model_type
+        if model_type in ["xlm-roberta", "camembert", "roberta"]:
+            position_offset = model.config.pad_token_id + 1
+
+        if hasattr(model.config, "max_seq_length"):
+            self.max_input_length = model.config.max_seq_length
+        else:
+            self.max_input_length = (
+                model.config.max_position_embeddings - position_offset
+            )
+
+        # XProvence pruning options from environment variables
+        # XPROVENCE_THRESHOLD: 0.0-1.0, lower = more conservative (default: 0.3)
+        # XPROVENCE_ALWAYS_SELECT_TITLE: keep first sentence as title (default: true)
+        try:
+            threshold_env = os.getenv("XPROVENCE_THRESHOLD", "0.3")
+            self.threshold = float(threshold_env)
+            if not (0.0 <= self.threshold <= 1.0):
+                logger.warning(
+                    f"XPROVENCE_THRESHOLD={self.threshold} out of bounds [0.0, 1.0], "
+                    f"defaulting to 0.3"
+                )
+                self.threshold = 0.3
+        except ValueError:
+            logger.error(
+                f"Invalid XPROVENCE_THRESHOLD='{threshold_env}', defaulting to 0.3"
+            )
+            self.threshold = 0.3
+
+        self.always_select_title = _parse_bool(
+            os.getenv("XPROVENCE_ALWAYS_SELECT_TITLE", "true")
+        )
+
+        logger.info(
+            f"XProvence model loaded: threshold={self.threshold}, "
+            f"always_select_title={self.always_select_title} "
+            f"(Configure via XPROVENCE_THRESHOLD, XPROVENCE_ALWAYS_SELECT_TITLE env vars)"
+        )
+
+        super(XProvenceModel, self).__init__(model=model, dtype=dtype, device=device)
+
+    @property
+    def batch_type(self) -> Type[PaddedBatch]:
+        return PaddedBatch
+
+    @tracer.start_as_current_span("embed")
+    def embed(self, batch: PaddedBatch) -> List[Embedding]:
+        # XProvence is a reranker, not an embedding model
+        pass
+
+    @tracer.start_as_current_span("predict")
+    def predict(self, batch: PaddedBatch) -> List[Score]:
+        """
+        XProvence prediction with context pruning.
+
+        If raw_query and raw_text are provided, uses XProvence's process()
+        method to perform sentence-level context pruning.
+        Otherwise, falls back to standard reranking without pruning.
+        """
+        # Check if raw text is available for XProvence processing
+        if batch.raw_query and batch.raw_text:
+            return self._predict_with_pruning(batch.raw_query, batch.raw_text)
+        else:
+            # Fallback: standard forward pass without pruning
+            return self._predict_standard(batch)
+
+    def _predict_with_pruning(self, raw_query: str, raw_text: str) -> List[Score]:
+        """
+        Use XProvence's process() method for context pruning.
+
+        Returns score with pruned_text containing only relevant sentences.
+        """
+        try:
+            output = self.model.process(
+                raw_query,
+                raw_text,
+                threshold=self.threshold,
+                always_select_title=self.always_select_title,
+            )
+
+            reranking_score = float(output["reranking_score"])
+            pruned_context = output["pruned_context"]
+
+            logger.debug(
+                f"XProvence pruning: score={reranking_score:.4f}, "
+                f"original_len={len(raw_text)}, pruned_len={len(pruned_context)}"
+            )
+
+            return [Score(values=[reranking_score], pruned_text=pruned_context)]
+
+        except Exception as e:
+            logger.error(f"XProvence process() failed: {e}, falling back to standard")
+            # Return a default score without pruning on error
+            return [Score(values=[0.0], pruned_text=None)]
+
+    def _predict_standard(self, batch: PaddedBatch) -> List[Score]:
+        """
+        Standard forward pass without context pruning.
+
+        Used as fallback when raw text is not available.
+        """
+        kwargs = {"input_ids": batch.input_ids, "attention_mask": batch.attention_mask}
+
+        output = self.model(**kwargs, return_dict=True)
+
+        # XProvence forward returns ranking logits at position 0
+        if hasattr(output, "logits"):
+            logits = output.logits
+        else:
+            # Assume first element is ranking logits
+            logits = output[0]
+
+        # Extract scores (first column if multi-dimensional)
+        if logits.dim() == 2 and logits.size(1) >= 1:
+            scores = logits[:, 0].tolist()
+        else:
+            scores = logits.tolist()
+
+        return [Score(values=[s], pruned_text=None) for s in scores]
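
To make the pruning path concrete, a hedged usage sketch of the process() call the new model class relies on, run outside the server; the checkpoint path, dtype, and example texts are placeholders, while the argument names and output keys mirror _predict_with_pruning above.

# Hedged sketch: exercising XProvence's process() directly, mirroring
# _predict_with_pruning. The model path is a placeholder.
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained("path/to/xprovence", trust_remote_code=True)
model = model.to(torch.float32).eval()

output = model.process(
    "what is the capital of france?",                  # raw query
    "Paris is the capital of France. The Louvre is a museum. "
    "Berlin is the capital of Germany.",               # raw context
    threshold=0.3,                 # same default as XPROVENCE_THRESHOLD
    always_select_title=True,      # same default as XPROVENCE_ALWAYS_SELECT_TITLE
)

print(output["reranking_score"])   # float relevance score used for reranking
print(output["pruned_context"])    # context with irrelevant sentences removed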

backends/python/src/lib.rs

Lines changed: 16 additions & 4 deletions
@@ -5,7 +5,7 @@ use backend_grpc_client::Client;
 use nohash_hasher::BuildNoHashHasher;
 use std::collections::HashMap;
 use text_embeddings_backend_core::{
-    Backend, BackendError, Batch, Embedding, Embeddings, ModelType, Pool, Predictions,
+    Backend, BackendError, Batch, Embedding, Embeddings, ModelType, Pool, Prediction, Predictions,
 };
 use tokio::runtime::Runtime;
 
@@ -108,6 +108,11 @@ impl Backend for PythonBackend {
            ));
        }
        let batch_size = batch.len();
+
+        // XProvence: Get first raw query/text from batch (for single request)
+        let raw_query = batch.raw_queries.first().cloned().flatten();
+        let raw_text = batch.raw_texts.first().cloned().flatten();
+
        let results = self
            .tokio_runtime
            .block_on(self.backend_client.clone().predict(
@@ -116,15 +121,22 @@ impl Backend for PythonBackend {
                batch.position_ids,
                batch.cumulative_seq_lengths,
                batch.max_length,
+                raw_query,
+                raw_text,
            ))
            .map_err(|err| BackendError::Inference(err.to_string()))?;
-        let raw_results: Vec<Vec<f32>> = results.into_iter().map(|r| r.values).collect();
 
        let mut predictions =
            HashMap::with_capacity_and_hasher(batch_size, BuildNoHashHasher::default());
 
-        for (i, r) in raw_results.into_iter().enumerate() {
-            predictions.insert(i, r);
+        for (i, score) in results.into_iter().enumerate() {
+            predictions.insert(
+                i,
+                Prediction {
+                    scores: score.values,
+                    pruned_text: score.pruned_text,
+                },
+            );
        }
 
        Ok(predictions)

core/src/infer.rs

Lines changed: 7 additions & 3 deletions
@@ -561,11 +561,13 @@ async fn backend_task(backend: Backend, mut embed_receiver: mpsc::Receiver<NextB
            inference: inference_duration,
        };
 
+        let prediction = predictions.remove(&i).expect(
+            "prediction not found in results. This is a backend bug.",
+        );
        let _ = m.response_tx.send(Ok(InferResult::Classification(
            ClassificationInferResponse {
-                results: predictions.remove(&i).expect(
-                    "prediction not found in results. This is a backend bug.",
-                ),
+                results: prediction.scores,
+                pruned_text: prediction.pruned_text,
                metadata: infer_metadata,
            },
        )));
@@ -642,6 +644,8 @@ pub(crate) enum InferResult {
 #[derive(Debug)]
 pub struct ClassificationInferResponse {
     pub results: Vec<f32>,
+    /// XProvence: pruned context text after removing irrelevant sentences
+    pub pruned_text: Option<String>,
     pub metadata: InferMetadata,
 }

core/src/queue.rs

Lines changed: 10 additions & 0 deletions
@@ -129,6 +129,10 @@ fn queue_blocking_task(
        let mut cu_seq_lengths = Vec::with_capacity(capacity);
        cu_seq_lengths.push(0);
 
+        // XProvence: raw text vectors for context pruning
+        let mut raw_queries = Vec::with_capacity(capacity);
+        let mut raw_texts = Vec::with_capacity(capacity);
+
        let mut current_tokens = 0;
        let mut max_length = 0;
 
@@ -168,6 +172,10 @@ fn queue_blocking_task(
            token_type_ids.extend(entry.encoding.token_type_ids);
            position_ids.extend(entry.encoding.position_ids);
 
+            // XProvence: collect raw texts for context pruning
+            raw_queries.push(entry.encoding.raw_query);
+            raw_texts.push(entry.encoding.raw_text);
+
            current_tokens += entry_tokens;
            metadata.push(entry.metadata);
            cu_seq_lengths.push(current_tokens as u32);
@@ -193,6 +201,8 @@ fn queue_blocking_task(
                max_length,
                pooled_indices,
                raw_indices,
+                raw_queries,
+                raw_texts,
            },
        ))
    };
