Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ To run basic doublet classification:
import doubletdetection
clf = doubletdetection.BoostClassifier()
# raw_counts is a cells by genes count matrix
labels = clf.fit(raw_counts).predict()
labels = clf.fit(raw_counts).get_predictions()
# higher means more likely to be doublet
scores = clf.doublet_score()
scores = clf.get_doublet_scores()
```

- `raw_counts` is a scRNA-seq count matrix (cells by genes), and is array-like
Expand Down
158 changes: 96 additions & 62 deletions doubletdetection/doubletdetection.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@
import scipy.sparse as sp_sparse
from scipy.sparse import csr_matrix
from scipy.stats import hypergeom
from sklearn.base import BaseEstimator
from sklearn.utils import check_array
from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1
from sklearn.utils.validation import check_is_fitted
from tqdm.auto import tqdm


class BoostClassifier:
class BoostClassifier(BaseEstimator):
"""Classifier for doublets in single-cell RNA-seq data.

Parameters:
Expand Down Expand Up @@ -59,6 +61,10 @@ class BoostClassifier:
of normalized count matrix prior to PCA. Recommended when not using
Phenograph. Defaults to False.
n_jobs (int, optional): Number of jobs to use. Speeds up neighbor computation.
p_thresh (float, optional): hypergeometric test p-value threshold
that determines per iteration doublet calls
voter_thresh (float, optional): fraction of iterations a cell must
be called a doublet

Attributes:
all_log_p_values_ (ndarray): Hypergeometric test natural log p-value per
Expand Down Expand Up @@ -96,6 +102,8 @@ def __init__(
verbose=False,
standard_scaling=False,
n_jobs=1,
p_thresh=1e-7,
voter_thresh=0.9,
):
self.boost_rate = boost_rate
self.replace = replace
Expand All @@ -107,49 +115,18 @@ def __init__(
self.standard_scaling = standard_scaling
self.n_jobs = n_jobs
self.pseudocount = pseudocount
self.n_components = n_components
self.n_top_var_genes = n_top_var_genes
self.clustering_kwargs = clustering_kwargs
self.p_thresh = p_thresh
self.voter_thresh = voter_thresh

if self.clustering_algorithm not in ["louvain", "phenograph", "leiden"]:
raise ValueError(
"Clustering algorithm needs to be one of ['louvain', 'phenograph', 'leiden']"
)
if self.clustering_algorithm == "leiden":
warnings.warn("Leiden clustering is experimental and results have not been validated.")

if self.random_state:
np.random.seed(self.random_state)

if n_components == 30 and n_top_var_genes > 0:
# If user did not change n_components, silently cap it by n_top_var_genes if needed
self.n_components = min(n_components, n_top_var_genes)
else:
self.n_components = n_components
# Floor negative n_top_var_genes by 0
self.n_top_var_genes = max(0, n_top_var_genes)

self.clustering_kwargs = (
{} if not isinstance(clustering_kwargs, dict) else clustering_kwargs
)
self._set_clustering_kwargs()

if not self.replace and self.boost_rate > 0.5:
warn_msg = (
"boost_rate is trimmed to 0.5 when replace=False."
+ " Set replace=True to use greater boost rates."
)
warnings.warn(warn_msg)
self.boost_rate = 0.5

assert (self.n_top_var_genes == 0) or (
self.n_components <= self.n_top_var_genes
), "n_components={0} cannot be larger than n_top_var_genes={1}".format(
n_components, n_top_var_genes
)

def fit(self, raw_counts):
def fit(self, raw_counts, y=None):
"""Fits the classifier on raw_counts.

Args:
raw_counts (array-like): Count matrix, oriented cells by genes.
y: Ignored

Sets:
all_scores_, all_log_p_values_, communities_,
Expand All @@ -165,6 +142,8 @@ def fit(self, raw_counts):
force_all_finite=True,
ensure_2d=True,
dtype="float32",
ensure_min_samples=10,
ensure_min_features=2,
)

if sp_sparse.issparse(raw_counts) is not True:
Expand All @@ -175,21 +154,26 @@ def fit(self, raw_counts):
old_n_jobs = sc.settings.n_jobs
sc.settings.n_jobs = self.n_jobs

if self.n_top_var_genes > 0:
# Floor negative n_top_var_genes by 0
self._n_top_var_genes = max(0, self.n_top_var_genes)
if self._n_top_var_genes > 0:
if self.n_top_var_genes < raw_counts.shape[1]:
gene_variances = (
np.array(raw_counts.power(2).mean(axis=0))
- (np.array(raw_counts.mean(axis=0))) ** 2
)[0]
top_var_indexes = np.argsort(gene_variances)
self.top_var_genes_ = top_var_indexes[-self.n_top_var_genes :]
self.top_var_genes_ = top_var_indexes[-self._n_top_var_genes :]
# csc if faster for column indexing
raw_counts = raw_counts.tocsc()
raw_counts = raw_counts[:, self.top_var_genes_]
raw_counts = raw_counts.tocsr()

self._raw_counts = raw_counts
(self._num_cells, self._num_genes) = self._raw_counts.shape

self._validate_kwargs()

if self.normalizer is None:
# Memoize these; default normalizer treats these invariant for all synths
self._lib_size = np.sum(raw_counts, axis=1).A1
Expand Down Expand Up @@ -226,14 +210,14 @@ def fit(self, raw_counts):

return self

def predict(self, p_thresh=1e-7, voter_thresh=0.9):
def get_predictions(self, p_thresh=None, voter_thresh=None):
"""Produce doublet calls from fitted classifier

Args:
p_thresh (float, optional): hypergeometric test p-value threshold
that determines per iteration doublet calls
that determines per iteration doublet calls. Defaults to self.p_thresh.
voter_thresh (float, optional): fraction of iterations a cell must
be called a doublet
be called a doublet. Defaults to self.voter_thresh.

Sets:
labels_ and voting_average_ if n_iters > 1.
Expand All @@ -242,6 +226,8 @@ def predict(self, p_thresh=1e-7, voter_thresh=0.9):
Returns:
labels_ (ndarray, ndims=1): 0 for singlet, 1 for detected doublet
"""
p_thresh = self.p_thresh if p_thresh is None else p_thresh
voter_thresh = self.voter_thresh if voter_thresh is None else voter_thresh
log_p_thresh = np.log(p_thresh)
if self.n_iters > 1:
with np.errstate(invalid="ignore"): # Silence numpy warning about NaN comparison
Expand All @@ -266,7 +252,7 @@ def predict(self, p_thresh=1e-7, voter_thresh=0.9):

return self.labels_

def doublet_score(self):
def get_doublet_scores(self):
"""Produce doublet scores

The doublet score is the average negative log p-value of doublet enrichment
Expand All @@ -276,6 +262,7 @@ def doublet_score(self):
scores (ndarray, ndims=1): Average negative log p-value over iterations
"""

check_is_fitted(self)
if self.n_iters > 1:
with np.errstate(invalid="ignore"): # Silence numpy warning about NaN comparison
avg_log_p = np.mean(np.ma.masked_invalid(self.all_log_p_values_), axis=0)
Expand All @@ -287,7 +274,7 @@ def doublet_score(self):
def _one_fit(self):
if self.verbose:
print("\nCreating synthetic doublets...")
self._createDoublets()
self._create_doublets()

# Normalize combined augmented set
if self.verbose:
Expand Down Expand Up @@ -321,7 +308,7 @@ def _one_fit(self):
solver = "arpack" if sp_sparse.issparse(aug_counts.X) else "auto"
sc.tl.pca(
aug_counts,
n_comps=self.n_components,
n_comps=self._n_components,
random_state=self.random_state,
svd_solver=solver,
)
Expand All @@ -331,7 +318,7 @@ def _one_fit(self):
f = io.StringIO()
with redirect_stdout(f):
fullcommunities, _, _ = phenograph.cluster(
aug_counts.obsm["X_pca"], n_jobs=self.n_jobs, **self.clustering_kwargs
aug_counts.obsm["X_pca"], n_jobs=self.n_jobs, **self._clustering_kwargs
)
out = f.getvalue()
if self.verbose:
Expand All @@ -351,7 +338,7 @@ def _one_fit(self):
aug_counts,
key_added="clusters",
random_state=self.random_state,
**self.clustering_kwargs,
**self._clustering_kwargs,
)
fullcommunities = np.array(aug_counts.obs["clusters"], dtype=int)
min_ID = min(fullcommunities)
Expand Down Expand Up @@ -395,13 +382,13 @@ def _one_fit(self):

return scores, log_p_values

def _createDoublets(self):
def _create_doublets(self):
"""Create synthetic doublets.

Sets .parents_
"""
# Number of synthetic doublets to add
num_synths = int(self.boost_rate * self._num_cells)
num_synths = int(self._boost_rate * self._num_cells)

# Parent indices
choices = np.random.choice(self._num_cells, size=(num_synths, 2), replace=self.replace)
Expand All @@ -415,25 +402,72 @@ def _createDoublets(self):
self.parents_ = parents

def _set_clustering_kwargs(self):
"""Sets .clustering_kwargs"""
"""Sets ._clustering_kwargs"""
self._clustering_kwargs = {}
if isinstance(self.clustering_kwargs, dict):
self._clustering_kwargs.update(self.clustering_kwargs)
if self.clustering_algorithm == "phenograph":
if "prune" not in self.clustering_kwargs:
self.clustering_kwargs["prune"] = True
self.clustering_kwargs = self.clustering_kwargs
if (self.n_iters == 1) and (self.clustering_kwargs.get("prune") is True):
if "prune" not in self._clustering_kwargs:
self._clustering_kwargs["prune"] = True
if (self.n_iters == 1) and (self._clustering_kwargs.get("prune") is True):
warn_msg = (
"Using phenograph parameter prune=False is strongly recommended when "
+ "running only one iteration. Otherwise, expect many NaN labels."
)
warnings.warn(warn_msg)
else:
if "directed" not in self.clustering_kwargs:
self.clustering_kwargs["directed"] = False
if "resolution" not in self.clustering_kwargs:
self.clustering_kwargs["resolution"] = 4
if "key_added" in self.clustering_kwargs:
if "directed" not in self._clustering_kwargs:
self._clustering_kwargs["directed"] = False
if "resolution" not in self._clustering_kwargs:
self._clustering_kwargs["resolution"] = 4
if "key_added" in self._clustering_kwargs:
raise ValueError("'key_added' param cannot be overriden")
if "random_state" in self.clustering_kwargs:
if "random_state" in self._clustering_kwargs:
raise ValueError(
"'random_state' param cannot be overriden. Please use classifier 'random_state'."
)

def _validate_kwargs(self):
"""
Set attrs for use during fit using validation.

Sklearn doesn't like mutating params, do set private attrs for fit.

self._boost_rate, self._n_components are set here and used in fit.
"""
if self.clustering_algorithm not in ["louvain", "phenograph", "leiden"]:
raise ValueError(
"Clustering algorithm needs to be one of ['louvain', 'phenograph', 'leiden']"
)
if self.clustering_algorithm == "leiden":
warnings.warn("Leiden clustering is experimental and results have not been validated.")

if self.random_state:
np.random.seed(self.random_state)

min_comps = min(self._num_cells, self._num_genes)
self._n_components = self.n_components
if self._n_components == 30 and self._n_top_var_genes > 0:
# If user did not change n_components, silently cap it by n_top_var_genes if needed
self._n_components = min(self._n_components, self._n_top_var_genes)
if self._n_components > min_comps:
# can't use more components than genes
warnings.warn(f"Capping n_components to {min_comps}.")
self._n_components = min_comps

self._set_clustering_kwargs()

self._boost_rate = self.boost_rate
if not self.replace and self.boost_rate > 0.5:
warn_msg = (
"boost_rate is trimmed to 0.5 when replace=False."
+ " Set replace=True to use greater boost rates."
)
warnings.warn(warn_msg)
self._boost_rate = 0.5

assert (self._n_top_var_genes == 0) or (
self._n_components <= self._n_top_var_genes
), "n_components={0} cannot be larger than n_top_var_genes={1}".format(
self._n_components, self._n_top_var_genes
)
25 changes: 20 additions & 5 deletions tests/notebooks/PBMC_10k_vignette.ipynb

Large diffs are not rendered by default.

31 changes: 22 additions & 9 deletions tests/test_package.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
import numpy as np
import pytest
from sklearn.utils.estimator_checks import check_estimator

import doubletdetection


def test_sklearn_estimator():
    """BoostClassifier should satisfy sklearn's estimator contract."""
    params = {
        "n_iters": 2,
        "clustering_algorithm": "louvain",
        "standard_scaling": True,
    }
    check_estimator(doubletdetection.BoostClassifier(**params))


def test_classifier():

counts = np.random.poisson(size=(500, 100))
Expand All @@ -11,38 +20,42 @@ def test_classifier():
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="louvain", standard_scaling=True
)
clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
clf.doublet_score()
clf.fit(counts).get_predictions()
clf.get_doublet_scores()

# with phenograph
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="phenograph", standard_scaling=True
)
clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
clf.doublet_score()
clf.fit(counts).get_predictions()
clf.get_doublet_scores()

# with leiden
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="leiden", standard_scaling=True, random_state=123
)
clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
scores1 = clf.doublet_score()
clf.fit(counts).get_predictions()
scores1 = clf.get_doublet_scores()

# test random state
# with leiden
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="leiden", standard_scaling=True, random_state=123
)
clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
scores2 = clf.doublet_score()
clf.fit(counts).get_predictions()
scores2 = clf.get_doublet_scores()
np.testing.assert_equal(scores1, scores2)

# plotting
doubletdetection.plot.convergence(clf, show=False, p_thresh=1e-16, voter_thresh=0.5)
doubletdetection.plot.convergence(
clf,
show=False,
)
doubletdetection.plot.threshold(clf, show=False, p_step=6)

# test that you can't use random clustering algorithm
with pytest.raises(ValueError):
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="my_clusters", standard_scaling=True
)
clf.fit(counts)