
Commit 4b5ec22

add token experiment
1 parent f3829ad commit 4b5ec22

8 files changed: +1717 -113 lines

bergson/__main__.py

Lines changed: 2 additions & 2 deletions
@@ -36,9 +36,9 @@ class Build:
 
     def execute(self):
         """Build the gradient dataset."""
-        if not self.cfg.save_index and not self.cfg.save_processor:
+        if not self.cfg.save_index and not self.cfg.save_processor and not self.cfg.create_custom_query:
            raise ValueError(
-                "At least one of save_index or save_processor must be True"
+                "At least one of save_index, save_processor, or create_custom_query must be True"
            )
 
        build_gradient_dataset(self.cfg)
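The new guard means a build must now produce at least one artifact: an index, a processor checkpoint, or a custom query. A minimal sketch of the same check in isolation; the `Cfg` stand-in and `validate` helper are hypothetical, but the field names come from the diff above:

from dataclasses import dataclass


@dataclass
class Cfg:
    # Hypothetical stand-in for bergson's IndexConfig, using the
    # field names that appear in the diff above.
    save_index: bool = False
    save_processor: bool = False
    create_custom_query: bool = False


def validate(cfg: Cfg) -> None:
    # Mirrors the guard in Build.execute: at least one output is required.
    if not (cfg.save_index or cfg.save_processor or cfg.create_custom_query):
        raise ValueError(
            "At least one of save_index, save_processor, or create_custom_query must be True"
        )


validate(Cfg(create_custom_query=True))  # passes
# validate(Cfg())  # would raise ValueError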

bergson/attributor.py

Lines changed: 154 additions & 17 deletions
@@ -43,36 +43,52 @@ def __init__(
         dtype: torch.dtype = torch.float32,
         unit_norm: bool = False,
         faiss_cfg: FaissConfig | None = None,
+        processor: GradientProcessor | None = None,
+        unstructured: bool = False,
     ):
         self.device = device
         self.dtype = dtype
         self.unit_norm = unit_norm
         self.faiss_index = None
 
         # Load the gradient processor
-        self.processor = GradientProcessor(projection_dim=16)
-        # self.processor = GradientProcessor.load(index_path, map_location=device)
+        self.processor = processor or GradientProcessor.load(
+            index_path, map_location=device
+        )
 
         # Load the gradient index
         if faiss_cfg:
             self.faiss_index = FaissIndex(index_path, faiss_cfg, device, unit_norm)
             self.N = self.faiss_index.ntotal
         else:
             mmap = load_gradients(index_path)
-
-            # Copy gradients into device memory
-            self.grads = {
-                name: torch.tensor(mmap[name], device=device, dtype=dtype)
-                for name in mmap.dtype.names
-            }
             self.N = mmap[mmap.dtype.names[0]].shape[0]
 
-            if unit_norm:
-                norm = torch.cat([grad for grad in self.grads.values()], dim=1).norm(
-                    dim=1, keepdim=True
-                )
-                for name in self.grads:
-                    self.grads[name] /= norm
+            # Copy gradients into device memory
+            if unstructured:
+                import numpy as np
+                from numpy.lib.recfunctions import structured_to_unstructured
+
+                # Flatten the structured memmap into a single [N, D] array
+                mmap = structured_to_unstructured(mmap).astype(np.float16)
+                print("Number of elements:", mmap.shape[0] * mmap.shape[1])
+                print(mmap.dtype)
+                print(
+                    f"RAM required assuming float32: "
+                    f"{mmap.shape[0] * mmap.shape[1] * 4 / 1024**3:.2f} GB"
+                )
+                self.grads = torch.from_numpy(mmap)
+
+                if unit_norm:
+                    norm = self.grads.norm(dim=1, keepdim=True) + torch.finfo(dtype).eps
+                    self.grads /= norm
+            else:
+                self.grads = {
+                    name: torch.tensor(mmap[name], device=device, dtype=dtype)
+                    for name in mmap.dtype.names
+                }
+
+                if unit_norm:
+                    norm = torch.cat(list(self.grads.values()), dim=1).norm(
+                        dim=1, keepdim=True
+                    )
+                    for name in self.grads:
+                        self.grads[name] /= norm
 
     def search(
         self,
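For context on the new `unstructured` path: `numpy.lib.recfunctions.structured_to_unstructured` flattens a structured record array, one field per module, into a single 2-D array, which is what lets the whole index live in one tensor and be sliced batch-wise. A standalone sketch with toy field names (the on-disk layout here is an assumption, not bergson's actual format):

import numpy as np
import torch
from numpy.lib.recfunctions import structured_to_unstructured

# Toy structured array: one subarray field per module, as in the gradient memmap
mmap = np.zeros(4, dtype=[("mlp", np.float32, 8), ("attn", np.float32, 8)])

# Concatenate the fields along the last axis into a plain [N, D] array
flat = structured_to_unstructured(mmap).astype(np.float16)  # shape [4, 16]
grads = torch.from_numpy(flat)  # zero-copy view over the NumPy buffer
print(grads.shape, grads.dtype)  # torch.Size([4, 16]) torch.float16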
@@ -124,9 +140,113 @@ def search(
 
         return torch.topk(scores, k)
 
+    def score(
+        self,
+        queries: dict[str, Tensor],
+        batch_size: int = 1024,
+        onload_device: str = "cuda",
+    ):
+        """
+        Score every example in the index against the query or queries,
+        onloading shards of the gradient index into VRAM batch by batch.
+
+        Args:
+            queries: One query tensor of shape [..., d] per module.
+            batch_size: Number of index rows to onload per step.
+            onload_device: Device on which each batch is scored.
+
+        Returns:
+            A tensor of inner products of shape [N, num_queries].
+        """
+        assert not self.faiss_index, "FAISS index does not implement onloaded search."
+
+        q = {name: item.to(self.device, self.dtype) for name, item in queries.items()}
+
+        if self.unit_norm:
+            norm = torch.cat(list(q.values()), dim=1).norm(dim=1, keepdim=True)
+            for name in q:
+                q[name] /= norm + 1e-8
+
+        modules = list(q.keys())
+        q_tensor = torch.cat([q[name] for name in modules], dim=1).to(onload_device)
+
+        scores = torch.zeros(
+            self.N, q_tensor.shape[0], device=self.device, dtype=self.dtype
+        )
+
+        for i in range(0, self.N, batch_size):
+            batch = self.grads[i : i + batch_size].to(onload_device, self.dtype)
+            batch_scores = batch @ q_tensor.mT
+            scores[i : i + batch_size] = batch_scores.to(self.device)
+
+        return scores
+
+    @contextmanager
+    def trace_score(
+        self,
+        module: nn.Module,
+        k: int | None,
+        *,
+        precondition: bool = False,
+        target_modules: set[str] | None = None,
+    ):
+        """
+        Like `trace`, but on exit scores the collected gradients against the
+        whole index and stores them under `result["scores"]`.
+        """
+        mod_grads = defaultdict(list)
+        result = {}
+
+        def callback(name: str, g: Tensor, indices: list[int]):
+            # Precondition the gradient using the eigendecomposition of the
+            # preconditioner: P = V diag(1 / sqrt(clamp(eigval, 0))) V^T
+            if precondition:
+                eigval, eigvec = self.processor.preconditioners_eigen[name]
+
+                eigval_clamped = torch.clamp(eigval.to(torch.float64), min=0.0)
+                eigval_inverse_sqrt = 1.0 / (
+                    eigval_clamped.sqrt() + torch.finfo(torch.float64).eps
+                )
+
+                P = (
+                    eigvec.to(eigval_inverse_sqrt.dtype)
+                    * eigval_inverse_sqrt
+                    @ eigvec.mT.to(eigval_inverse_sqrt.dtype)
+                )
+                g = g.flatten(1).type_as(P)
+                assert not P.isnan().any().item(), "P is nan"
+                assert not g.isnan().any().item(), "g is nan"
+                g = g @ P
+            else:
+                g = g.flatten(1)
+
+            # Store the gradient for later use
+            mod_grads[name].append(g.to(self.device, self.dtype, non_blocking=True))
+
+        with GradientCollector(module, callback, self.processor, target_modules):
+            yield result
+
+        if not mod_grads:
+            raise ValueError("No grads collected. Did you forget to call backward?")
+
+        queries = {name: torch.cat(g, dim=1) for name, g in mod_grads.items()}
+
+        if any(q.isnan().any() for q in queries.values()):
+            raise ValueError("NaN found in queries.")
+
+        result["scores"] = self.score(queries)
+
     @contextmanager
     def trace(
-        self, module: nn.Module, k: int | None, *, precondition: bool = False, target_modules: set[str] | None = None
+        self,
+        module: nn.Module,
+        k: int | None,
+        *,
+        precondition: bool = False,
+        target_modules: set[str] | None = None,
+        score: bool = False,
     ) -> Generator[TraceResult, None, None]:
         """
         Context manager to trace the gradients of a module and return the
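A hedged usage sketch of the new `trace_score` context manager, inferred from the code above; `attributor`, `model`, and `batch` are assumed to exist, and the result dict is populated only after the `with` block exits:

# Sketch only: attributor = Attributor(index_path, ...) is assumed.
with attributor.trace_score(model, k=None, precondition=True) as result:
    loss = model(**batch).loss
    loss.backward()

# On exit the collected per-module gradients are concatenated and scored
# against every row of the index via Attributor.score.
scores = result["scores"]    # shape [N, num_queries]
top = scores[:, 0].topk(10)  # e.g. the 10 highest-scoring examples for query 0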
@@ -139,9 +259,26 @@ def callback(name: str, g: Tensor, indices: list[int]):
-            # Precondition the gradient using Cholesky solve
+            # Precondition the gradient using the eigendecomposition of the
+            # preconditioner, clamping eigenvalues to guard against NaNs
             if precondition:
                 eigval, eigvec = self.processor.preconditioners_eigen[name]
-                eigval_inverse_sqrt = 1.0 / (eigval).sqrt()
-                P = eigvec * eigval_inverse_sqrt @ eigvec.mT
+                eigval_clamped = torch.clamp(eigval.to(torch.float64), min=0.0)
+                eigval_inverse_sqrt = 1.0 / (
+                    eigval_clamped.sqrt() + torch.finfo(torch.float64).eps
+                )
+
+                P = (
+                    eigvec.to(eigval_inverse_sqrt.dtype)
+                    * eigval_inverse_sqrt
+                    @ eigvec.mT.to(eigval_inverse_sqrt.dtype)
+                )
                 g = g.flatten(1).type_as(P)
+                assert not P.isnan().any().item(), "P is nan"
+                assert not g.isnan().any().item(), "g is nan"
                 g = g @ P
             else:
                 g = g.flatten(1)
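The preconditioner built in these callbacks is P = V diag(1 / sqrt(max(eigval, 0))) V^T from the eigendecomposition of the second-moment matrix, with the clamp and epsilon guarding against slightly negative or zero eigenvalues from numerical noise. A self-contained check of the construction on a random PSD matrix (illustration only, not bergson code):

import torch

torch.manual_seed(0)
A = torch.randn(16, 16, dtype=torch.float64)
M = A @ A.mT + 1e-3 * torch.eye(16, dtype=torch.float64)  # a well-conditioned PSD matrix

eigval, eigvec = torch.linalg.eigh(M)
eigval_clamped = torch.clamp(eigval, min=0.0)
inv_sqrt = 1.0 / (eigval_clamped.sqrt() + torch.finfo(torch.float64).eps)

# Same broadcasting trick as the diff: (V * s) @ V^T == V diag(s) V^T,
# since * and @ share precedence and associate left to right.
P = eigvec * inv_sqrt @ eigvec.mT

# P M P should be close to the identity when M is well conditioned
print(torch.dist(P @ M @ P, torch.eye(16, dtype=torch.float64)))  # ~1e-12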

bergson/build.py

Lines changed: 13 additions & 9 deletions
@@ -168,7 +168,7 @@ def worker(
         save_index=cfg.save_index,
         save_processor=cfg.save_processor,
         drop_columns=cfg.drop_columns,
-        create_custom_query=cfg.in_memory_index,
+        create_custom_query=cfg.create_custom_query,
         module_wise=cfg.module_wise,
         token_batch_size=cfg.token_batch_size,
     )
@@ -197,7 +197,7 @@ def flush():
         # Save a processor state checkpoint after each shard
         save_processor=cfg.save_processor,
         drop_columns=cfg.drop_columns,
-        create_custom_query=cfg.in_memory_index,
+        create_custom_query=cfg.create_custom_query,
         module_wise=cfg.module_wise,
         token_batch_size=cfg.token_batch_size,
     )
@@ -240,15 +240,19 @@ def build_gradient_dataset(cfg: IndexConfig):
     tokenizer.model_max_length = min(tokenizer.model_max_length, cfg.token_batch_size)
 
     # Do all the data loading and preprocessing on the main process
-    ds = load_data_string(cfg.data.dataset, cfg.data.split, streaming=cfg.streaming)
+    if cfg.data.subset:
+        ds = load_data_string(
+            cfg.data.dataset, cfg.data.split, cfg.data.subset, streaming=cfg.streaming
+        )
+    else:
+        ds = load_data_string(cfg.data.dataset, cfg.data.split, streaming=cfg.streaming)
 
     remove_columns = ds.column_names if cfg.drop_columns else None
-    ds = ds.map(
-        tokenize,
-        batched=True,
-        fn_kwargs=dict(args=cfg.data, tokenizer=tokenizer),
-        remove_columns=remove_columns,
-    )
+    if not cfg.skip_tokenization:
+        ds = ds.map(
+            tokenize,
+            batched=True,
+            fn_kwargs=dict(args=cfg.data, tokenizer=tokenizer),
+            remove_columns=remove_columns,
+        )
     if cfg.data.reward_column:
         assert isinstance(ds, Dataset), "Dataset required for advantage estimation"
         ds = ds.add_column(
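For reference, the now-conditional tokenization step is a standard Hugging Face `datasets` batched map. A minimal standalone sketch; the toy dataset and the `tokenize` stand-in are assumptions (bergson's helper also takes an `args` kwarg):

from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
ds = Dataset.from_dict({"text": ["hello world", "token experiment"]})


def tokenize(batch, tokenizer):
    # Stand-in for bergson's tokenize helper: emits token ids per example.
    return tokenizer(batch["text"], truncation=True)


ds = ds.map(
    tokenize,
    batched=True,
    fn_kwargs=dict(tokenizer=tokenizer),
    remove_columns=ds.column_names,  # mirrors drop_columns=True
)
print(ds.column_names)  # ['input_ids', 'attention_mask']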
