Commit a0b4223

improve readability and code logic
1 parent ab069bd commit a0b4223

File tree

torch_incremental_pca/incremental_pca.py

1 file changed: 127 additions, 70 deletions
@@ -1,31 +1,32 @@
-import torch
-from functools import partial
-
 from typing import Optional, Tuple
 
+import torch
+
 
 class IncrementalPCA:
     """
     An implementation of Incremental Principal Components Analysis (IPCA) that leverages PyTorch for GPU acceleration.
+    Adapted from https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/decomposition/_incremental_pca.py
 
-    This class provides methods to fit the model on data incrementally in batches, and to transform new data
-    based on the principal components learned during the fitting process.
+    This class provides methods to fit the model on data incrementally in batches, and to transform new data based on
+    the principal components learned during the fitting process.
 
-    Attributes:
+    Args:
        n_components (int, optional): Number of components to keep. If `None`, it's set to the minimum of the
-                                      number of samples and features. Defaults to None.
+            number of samples and features. Defaults to None.
        copy (bool): If False, input data will be overwritten. Defaults to True.
        batch_size (int, optional): The number of samples to use for each batch. Only needed if self.fit is called.
-                                    If `None`, it's inferred from the data and set to `5 * n_features`. Defaults to None.
-        svd_driver (str, optional): name of the cuSOLVER method to be used for torch.linalg.svd. This keyword
-                                    argument only works on CUDA inputs. Available options are: None, gesvd, gesvdj,
-                                    and gesvda. Defaults to None.
+            If `None`, it's inferred from the data and set to `5 * n_features`. Defaults to None.
+        svd_driver (str, optional): name of the cuSOLVER method to be used for torch.linalg.svd. This keyword
+            argument only works on CUDA inputs. Available options are: None, gesvd, gesvdj, and gesvda. Defaults to
+            None.
        lowrank (bool, optional): Whether to use torch.svd_lowrank instead of torch.linalg.svd which can be faster.
-                                  Defaults to False.
-        lowrank_q (int, optional): For an adequate approximation of n_components, this parameter defaults to
-                                   n_components * 2.
+            Defaults to False.
+        lowrank_q (int, optional): For an adequate approximation of n_components, this parameter defaults to
+            n_components * 2.
        lowrank_niter (int, optional): Number of subspace iterations to conduct for torch.svd_lowrank.
-                                       Defaults to 4.
+            Defaults to 4.
+        lowrank_seed (int, optional): Seed for making results of torch.svd_lowrank reproducible.
    """
 
    def __init__(
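The docstring now documents constructor arguments rather than attributes. A minimal usage sketch of the resulting API (the import path and the data shapes are my assumptions, not part of the commit):

import torch

from torch_incremental_pca.incremental_pca import IncrementalPCA  # assumed import path

X = torch.randn(10_000, 256)

# lowrank_seed is the argument added in this commit; with lowrank=True and
# lowrank_q=None, lowrank_q is derived as n_components * 2 = 64, per the docstring.
ipca = IncrementalPCA(n_components=32, lowrank=True, lowrank_seed=0)
ipca.fit(X)            # batch_size=None -> inferred as 5 * n_features
Z = ipca.transform(X)  # shape: (10000, 32)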
@@ -36,49 +37,81 @@ def __init__(
         svd_driver: Optional[str] = None,
         lowrank: bool = False,
         lowrank_q: Optional[int] = None,
-        lowrank_niter: int = 4
+        lowrank_niter: int = 4,
+        lowrank_seed: Optional[int] = None,
     ):
-        self.n_components_ = n_components
+        self.n_components = n_components
         self.copy = copy
         self.batch_size = batch_size
-
-        if lowrank:
-            if lowrank_q is None:
-                assert n_components is not None, "n_components must be specified when using lowrank mode with lowrank_q=None."
-                lowrank_q = n_components * 2
-            assert lowrank_q >= n_components, "lowrank_q must be greater than or equal to n_components."
-            def svd_fn(X):
-                U, S, V = torch.svd_lowrank(X, q=lowrank_q, niter=lowrank_niter)
-                return U, S, V.mH  # V is returned as a conjugate transpose
-            self._svd_fn = svd_fn
-
-        else:
-            self._svd_fn = partial(torch.linalg.svd, full_matrices=False, driver=svd_driver)
-
-
-    def _validate_data(self, X, dtype=torch.float32) -> torch.Tensor:
+        self.svd_driver = svd_driver
+        self.lowrank = lowrank
+        self.lowrank_q = lowrank_q
+        self.lowrank_niter = lowrank_niter
+        self.lowrank_seed = lowrank_seed
+
+        self.n_features_ = None
+
+        if self.lowrank:
+            self._validate_lowrank_params()
+
+    def _validate_lowrank_params(self):
+        if self.lowrank_q is None:
+            if self.n_components is None:
+                raise ValueError("n_components must be specified when using lowrank mode with lowrank_q=None.")
+            self.lowrank_q = self.n_components * 2
+        elif self.lowrank_q < self.n_components:
+            raise ValueError("lowrank_q must be greater than or equal to n_components.")
+
+    def _svd_fn_full(self, X):
+        return torch.linalg.svd(X, full_matrices=False, driver=self.svd_driver)
+
+    def _svd_fn_lowrank(self, X):
+        seed_enabled = self.lowrank_seed is not None
+        with torch.random.fork_rng(enabled=seed_enabled):
+            if seed_enabled:
+                torch.manual_seed(self.lowrank_seed)
+            U, S, V = torch.svd_lowrank(X, q=self.lowrank_q, niter=self.lowrank_niter)
+            return U, S, V.mH
+
+    def _validate_data(self, X) -> torch.Tensor:
         """
         Validates and converts the input data `X` to the appropriate tensor format.
 
         Args:
             X (torch.Tensor): Input data.
-            dtype (torch.dtype, optional): Desired data type for the tensor. Defaults to torch.float32.
 
         Returns:
             torch.Tensor: Converted to appropriate format.
         """
+        valid_dtypes = [torch.float32, torch.float64]
+
         if not isinstance(X, torch.Tensor):
-            X = torch.tensor(X, dtype=dtype)
+            X = torch.tensor(X, dtype=torch.float32)
         elif self.copy:
             X = X.clone()
 
-        if X.dtype != dtype:
-            X = X.to(dtype)
+        n_samples, n_features = X.shape
+        if self.n_components is None:
+            pass
+        elif self.n_components > n_features:
+            raise ValueError(
+                f"n_components={self.n_components} invalid for n_features={n_features}, "
+                "need more rows than columns for IncrementalPCA processing."
+            )
+        elif self.n_components > n_samples:
+            raise ValueError(
+                f"n_components={self.n_components} must be less or equal to the batch number of samples {n_samples}"
+            )
+
+        if X.dtype not in valid_dtypes:
+            X = X.to(torch.float32)
 
         return X
 
     @staticmethod
-    def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    def _incremental_mean_and_var(
+        X, last_mean, last_variance, last_sample_count
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Computes the incremental mean and variance for the data `X`.
 
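Worth noting: `_svd_fn_lowrank` seeds inside `torch.random.fork_rng`, so reproducibility is opt-in and the caller's global RNG state is left untouched. The same pattern in isolation (the helper name `lowrank_svd` is mine, for illustration):

import torch

def lowrank_svd(X, q, niter=4, seed=None):
    # fork_rng snapshots and restores the global RNG state, but only when enabled
    with torch.random.fork_rng(enabled=seed is not None):
        if seed is not None:
            torch.manual_seed(seed)
        U, S, V = torch.svd_lowrank(X, q=q, niter=niter)
    return U, S, V.mH  # .mH matches the Vt convention of torch.linalg.svd

X = torch.randn(500, 64)
state = torch.random.get_rng_state()
S1 = lowrank_svd(X, q=16, seed=0)[1]
S2 = lowrank_svd(X, q=16, seed=0)[1]
assert torch.allclose(S1, S2)                            # same seed, same result
assert torch.equal(state, torch.random.get_rng_state())  # caller RNG untouched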
@@ -95,12 +128,10 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count) ->
             return last_mean, last_variance, last_sample_count
 
         if last_sample_count > 0:
-            assert (
-                last_mean is not None
-            ), "last_mean should not be None if last_sample_count > 0."
-            assert (
-                last_variance is not None
-            ), "last_variance should not be None if last_sample_count > 0."
+            if last_mean is None:
+                raise ValueError("last_mean should not be None if last_sample_count > 0.")
+            if last_variance is None:
+                raise ValueError("last_variance should not be None if last_sample_count > 0.")
 
         new_sample_count = torch.tensor([X.shape[0]], device=X.device)
         updated_sample_count = last_sample_count + new_sample_count
@@ -128,9 +159,7 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count) ->
         updated_unnormalized_variance = (
             last_unnormalized_variance
             + new_unnormalized_variance
-            + last_over_new_count
-            / updated_sample_count
-            * (last_sum / last_over_new_count - new_sum).square()
+            + last_over_new_count / updated_sample_count * (last_sum / last_over_new_count - new_sum).square()
         )
         updated_variance = updated_unnormalized_variance / updated_sample_count
 
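The reflowed correction term is still the Chan et al. pairwise update: `last_over_new_count / updated_sample_count * (last_sum / last_over_new_count - new_sum).square()` reduces to `n_a * n_b / (n_a + n_b) * (mean_a - mean_b) ** 2`. A self-contained check of the combine step (helper and variable names are mine, not from the commit):

import torch

def combine(mean_a, var_a, n_a, B):
    # Chan-style pairwise update, mirroring the expression reformatted above
    n_b = B.shape[0]
    n = n_a + n_b
    last_sum, new_sum = mean_a * n_a, B.sum(dim=0)
    last_over_new_count = n_a / n_b
    unnorm_var = (
        var_a * n_a                              # last_unnormalized_variance
        + ((B - B.mean(dim=0)) ** 2).sum(dim=0)  # new_unnormalized_variance
        + last_over_new_count / n * (last_sum / last_over_new_count - new_sum).square()
    )
    return (last_sum + new_sum) / n, unnorm_var / n, n

A, B = torch.randn(100, 5).double(), torch.randn(50, 5).double()
mean, var, n = combine(A.mean(dim=0), A.var(dim=0, unbiased=False), 100, B)
X = torch.cat([A, B])
assert torch.allclose(mean, X.mean(dim=0))
assert torch.allclose(var, X.var(dim=0, unbiased=False))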
@@ -146,7 +175,8 @@ def _svd_flip(u, v, u_based_decision=True) -> Tuple[torch.Tensor, torch.Tensor]:
         Args:
             u (torch.Tensor): Left singular vectors tensor.
             v (torch.Tensor): Right singular vectors tensor.
-            u_based_decision (bool, optional): If True, uses the left singular vectors to determine the sign flipping. Defaults to True.
+            u_based_decision (bool, optional): If True, uses the left singular vectors to determine the sign flipping.
+                Defaults to True.
 
         Returns:
             Tuple[torch.Tensor, torch.Tensor]: Adjusted left and right singular vectors tensors.
@@ -157,7 +187,7 @@ def _svd_flip(u, v, u_based_decision=True) -> Tuple[torch.Tensor, torch.Tensor]:
         else:
             max_abs_rows = torch.argmax(torch.abs(v), dim=1)
             signs = torch.sign(v[range(v.shape[0]), max_abs_rows])
-            u *= signs[:u.shape[1]].view(1, -1)
+            u *= signs[: u.shape[1]].view(1, -1)
             v *= signs.view(-1, 1)
         return u, v
 
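Only whitespace changed here, but the function matters: an SVD is unique only up to a per-component sign, and `_svd_flip` pins those signs so partial fits stay comparable across batches. A small check of the v-based branch used above (helper name `svd_flip_v` is mine):

import torch

def svd_flip_v(u, v):
    # same logic as the v-based branch above: make the largest-|entry| of each
    # row of v positive, and flip the matching columns of u to compensate
    max_abs_rows = torch.argmax(torch.abs(v), dim=1)
    signs = torch.sign(v[range(v.shape[0]), max_abs_rows])
    u = u * signs[: u.shape[1]].view(1, -1)
    v = v * signs.view(-1, 1)
    return u, v

X = torch.randn(20, 5).double()
U, S, Vt = torch.linalg.svd(X, full_matrices=False)
U1, Vt1 = svd_flip_v(U, Vt)
U2, Vt2 = svd_flip_v(-U, -Vt)  # a sign-flipped but equally valid SVD
assert torch.allclose(U1, U2) and torch.allclose(Vt1, Vt2)
assert torch.allclose(U1 @ torch.diag(S) @ Vt1, X)  # still reconstructs X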
@@ -176,14 +206,10 @@ def fit(self, X, check_input=True):
         X = self._validate_data(X)
         n_samples, n_features = X.shape
         if self.batch_size is None:
-            self.batch_size_ = 5 * n_features
-        else:
-            self.batch_size_ = self.batch_size
+            self.batch_size = 5 * n_features
 
-        for start in range(0, n_samples, self.batch_size_):
-            end = min(start + self.batch_size_, n_samples)
-            X_batch = X[start:end]
-            self.partial_fit(X_batch, check_input=False)
+        for batch in self.gen_batches(n_samples, self.batch_size, min_batch_size=self.n_components or 0):
+            self.partial_fit(X[batch], check_input=False)
 
         return self
 
@@ -209,8 +235,14 @@ def partial_fit(self, X, check_input=True):
             self.mean_ = None  # Will be initialized properly in _incremental_mean_and_var based on data dimensions
             self.var_ = None  # Will be initialized properly in _incremental_mean_and_var based on data dimensions
             self.n_samples_seen_ = torch.tensor([0], device=X.device)
-            if not self.n_components_:
-                self.n_components_ = min(n_samples, n_features)
+            self.n_features_ = n_features
+            if not self.n_components:
+                self.n_components = min(n_samples, n_features)
+
+        if n_features != self.n_features_:
+            raise ValueError(
+                "Number of features of the new batch does not match the number of features of the first batch."
+            )
 
         col_mean, col_var, n_total_samples = self._incremental_mean_and_var(
             X, self.mean_, self.var_, self.n_samples_seen_
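The new guard makes the feature count sticky after the first batch. A sketch of the expected behavior (import path and shapes are assumptions):

import torch

from torch_incremental_pca.incremental_pca import IncrementalPCA  # assumed import path

ipca = IncrementalPCA(n_components=2)
ipca.partial_fit(torch.randn(100, 8))      # first batch fixes n_features_ = 8
try:
    ipca.partial_fit(torch.randn(100, 6))  # 6 columns != 8
except ValueError as err:
    print(err)  # Number of features of the new batch does not match ...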
@@ -221,9 +253,7 @@
         else:
             col_batch_mean = torch.mean(X, dim=0)
             X -= col_batch_mean
-            mean_correction_factor = torch.sqrt(
-                (self.n_samples_seen_.double() / n_total_samples) * n_samples
-            )
+            mean_correction_factor = torch.sqrt((self.n_samples_seen_.double() / n_total_samples) * n_samples)
             mean_correction = mean_correction_factor * (self.mean_ - col_batch_mean)
             X = torch.vstack(
                 (
@@ -233,20 +263,23 @@
                 )
             )
 
-        U, S, Vt = self._svd_fn(X)
+        if self.lowrank:
+            U, S, Vt = self._svd_fn_lowrank(X)
+        else:
+            U, S, Vt = self._svd_fn_full(X)
         U, Vt = self._svd_flip(U, Vt, u_based_decision=False)
         explained_variance = S**2 / (n_total_samples - 1)
         explained_variance_ratio = S**2 / torch.sum(col_var * n_total_samples)
 
         self.n_samples_seen_ = n_total_samples
-        self.components_ = Vt[:self.n_components_]
-        self.singular_values_ = S[:self.n_components_]
+        self.components_ = Vt[: self.n_components]
+        self.singular_values_ = S[: self.n_components]
         self.mean_ = col_mean
         self.var_ = col_var
-        self.explained_variance_ = explained_variance[:self.n_components_]
-        self.explained_variance_ratio_ = explained_variance_ratio[:self.n_components_]
-        if self.n_components_ not in (n_samples, n_features):
-            self.noise_variance_ = explained_variance[self.n_components_:].mean()
+        self.explained_variance_ = explained_variance[: self.n_components]
+        self.explained_variance_ratio_ = explained_variance_ratio[: self.n_components]
+        if self.n_components not in (n_samples, n_features):
+            self.noise_variance_ = explained_variance[self.n_components :].mean()
         else:
             self.noise_variance_ = torch.tensor(0.0, device=X.device)
         return self
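Dispatching between `_svd_fn_lowrank` and `_svd_fn_full` happens once per batch here. For genuinely low-rank data the two agree closely on the leading spectrum; an illustrative comparison (not from the commit):

import torch

A = torch.randn(300, 8).double() @ torch.randn(8, 40).double()  # rank-8 data
S_full = torch.linalg.svd(A, full_matrices=False).S
_, S_low, _ = torch.svd_lowrank(A, q=16, niter=4)               # q = 2 * rank
assert torch.allclose(S_full[:8], S_low[:8], rtol=1e-6)         # leading values match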
@@ -263,5 +296,29 @@ def transform(self, X) -> torch.Tensor:
         Returns:
             torch.Tensor: Transformed data tensor with shape (n_samples, n_components).
         """
-        X -= self.mean_
-        return torch.mm(X, self.components_.T)
+        X = X - self.mean_
+        return torch.mm(X.double(), self.components_.T).to(X.dtype)
+
+    @staticmethod
+    def gen_batches(n: int, batch_size: int, min_batch_size: int = 0):
+        """Generator to create slices containing `batch_size` elements from 0 to `n`.
+
+        The last slice may contain less than `batch_size` elements, when `batch_size` does not divide `n`.
+
+        Args:
+            n (int): Size of the sequence.
+            batch_size (int): Number of elements in each batch.
+            min_batch_size (int, optional): Minimum number of elements in each batch. Defaults to 0.
+
+        Yields:
+            slice: A slice of `batch_size` elements.
+        """
+        start = 0
+        for _ in range(int(n // batch_size)):
+            end = start + batch_size
+            if end + min_batch_size > n:
+                continue
+            yield slice(start, end)
+            start = end
+        if start < n:
+            yield slice(start, n)

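`gen_batches` is a port of sklearn's helper of the same name. `fit` (earlier in this diff) passes `min_batch_size=self.n_components or 0`, so a tail shorter than `n_components` is folded into the final slice instead of being yielded on its own. Illustrative outputs (import path assumed):

from torch_incremental_pca.incremental_pca import IncrementalPCA  # assumed import path

print(list(IncrementalPCA.gen_batches(10, 4)))
# [slice(0, 4, None), slice(4, 8, None), slice(8, 10, None)]

print(list(IncrementalPCA.gen_batches(10, 4, min_batch_size=3)))
# [slice(0, 4, None), slice(4, 10, None)]  -- the 2-element tail is folded in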