Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ To run basic doublet classification:
import doubletdetection
clf = doubletdetection.BoostClassifier()
# raw_counts is a cells by genes count matrix
labels = clf.fit(raw_counts).predict()
labels = clf.fit(raw_counts).get_predictions()
# higher means more likely to be doublet
scores = clf.doublet_score()
scores = clf.get_doublet_scores()
```

- `raw_counts` is a scRNA-seq count matrix (cells by genes), and is array-like
Expand Down
158 changes: 96 additions & 62 deletions doubletdetection/doubletdetection.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@
import scipy.sparse as sp_sparse
from scipy.sparse import csr_matrix
from scipy.stats import hypergeom
from sklearn.base import BaseEstimator
from sklearn.utils import check_array
from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1
from sklearn.utils.validation import check_is_fitted
from tqdm.auto import tqdm


class BoostClassifier:
class BoostClassifier(BaseEstimator):
"""Classifier for doublets in single-cell RNA-seq data.

Parameters:
Expand Down Expand Up @@ -59,6 +61,10 @@ class BoostClassifier:
of normalized count matrix prior to PCA. Recommended when not using
Phenograph. Defaults to False.
n_jobs (int, optional): Number of jobs to use. Speeds up neighbor computation.
p_thresh (float, optional): hypergeometric test p-value threshold
that determines per iteration doublet calls
voter_thresh (float, optional): fraction of iterations a cell must
be called a doublet

Attributes:
all_log_p_values_ (ndarray): Hypergeometric test natural log p-value per
Expand Down Expand Up @@ -96,6 +102,8 @@ def __init__(
verbose=False,
standard_scaling=False,
n_jobs=1,
p_thresh=1e-7,
voter_thresh=0.9,
):
self.boost_rate = boost_rate
self.replace = replace
Expand All @@ -107,49 +115,18 @@ def __init__(
self.standard_scaling = standard_scaling
self.n_jobs = n_jobs
self.pseudocount = pseudocount
self.n_components = n_components
self.n_top_var_genes = n_top_var_genes
self.clustering_kwargs = clustering_kwargs
self.p_thresh = p_thresh
self.voter_thresh = voter_thresh

if self.clustering_algorithm not in ["louvain", "phenograph", "leiden"]:
raise ValueError(
"Clustering algorithm needs to be one of ['louvain', 'phenograph', 'leiden']"
)
if self.clustering_algorithm == "leiden":
warnings.warn("Leiden clustering is experimental and results have not been validated.")

if self.random_state:
np.random.seed(self.random_state)

if n_components == 30 and n_top_var_genes > 0:
# If user did not change n_components, silently cap it by n_top_var_genes if needed
self.n_components = min(n_components, n_top_var_genes)
else:
self.n_components = n_components
# Floor negative n_top_var_genes by 0
self.n_top_var_genes = max(0, n_top_var_genes)

self.clustering_kwargs = (
{} if not isinstance(clustering_kwargs, dict) else clustering_kwargs
)
self._set_clustering_kwargs()

if not self.replace and self.boost_rate > 0.5:
warn_msg = (
"boost_rate is trimmed to 0.5 when replace=False."
+ " Set replace=True to use greater boost rates."
)
warnings.warn(warn_msg)
self.boost_rate = 0.5

assert (self.n_top_var_genes == 0) or (
self.n_components <= self.n_top_var_genes
), "n_components={0} cannot be larger than n_top_var_genes={1}".format(
n_components, n_top_var_genes
)

def fit(self, raw_counts):
def fit(self, raw_counts, y=None):
"""Fits the classifier on raw_counts.

Args:
raw_counts (array-like): Count matrix, oriented cells by genes.
y: Ignored

Sets:
all_scores_, all_log_p_values_, communities_,
Expand All @@ -165,6 +142,8 @@ def fit(self, raw_counts):
force_all_finite=True,
ensure_2d=True,
dtype="float32",
ensure_min_samples=10,
ensure_min_features=2,
)

if sp_sparse.issparse(raw_counts) is not True:
Expand All @@ -175,21 +154,26 @@ def fit(self, raw_counts):
old_n_jobs = sc.settings.n_jobs
sc.settings.n_jobs = self.n_jobs

if self.n_top_var_genes > 0:
# Floor negative n_top_var_genes by 0
self._n_top_var_genes = max(0, self.n_top_var_genes)
if self._n_top_var_genes > 0:
if self.n_top_var_genes < raw_counts.shape[1]:
gene_variances = (
np.array(raw_counts.power(2).mean(axis=0))
- (np.array(raw_counts.mean(axis=0))) ** 2
)[0]
top_var_indexes = np.argsort(gene_variances)
self.top_var_genes_ = top_var_indexes[-self.n_top_var_genes :]
self.top_var_genes_ = top_var_indexes[-self._n_top_var_genes :]
# csc if faster for column indexing
raw_counts = raw_counts.tocsc()
raw_counts = raw_counts[:, self.top_var_genes_]
raw_counts = raw_counts.tocsr()

self._raw_counts = raw_counts
(self._num_cells, self._num_genes) = self._raw_counts.shape

self._validate_kwargs()

if self.normalizer is None:
# Memoize these; default normalizer treats these invariant for all synths
self._lib_size = np.sum(raw_counts, axis=1).A1
Expand Down Expand Up @@ -226,14 +210,14 @@ def fit(self, raw_counts):

return self

def predict(self, p_thresh=1e-7, voter_thresh=0.9):
def get_predictions(self, p_thresh=None, voter_thresh=None):
"""Produce doublet calls from fitted classifier

Args:
p_thresh (float, optional): hypergeometric test p-value threshold
that determines per iteration doublet calls
that determines per iteration doublet calls. Defaults to self.p_thresh.
voter_thresh (float, optional): fraction of iterations a cell must
be called a doublet
be called a doublet. Defaults to self.voter_thresh.

Sets:
labels_ and voting_average_ if n_iters > 1.
Expand All @@ -242,6 +226,8 @@ def predict(self, p_thresh=1e-7, voter_thresh=0.9):
Returns:
labels_ (ndarray, ndims=1): 0 for singlet, 1 for detected doublet
"""
p_thresh = self.p_thresh if p_thresh is None else p_thresh
voter_thresh = self.voter_thresh if voter_thresh is None else voter_thresh
log_p_thresh = np.log(p_thresh)
if self.n_iters > 1:
with np.errstate(invalid="ignore"): # Silence numpy warning about NaN comparison
Expand All @@ -266,7 +252,7 @@ def predict(self, p_thresh=1e-7, voter_thresh=0.9):

return self.labels_

def doublet_score(self):
def get_doublet_scores(self):
"""Produce doublet scores

The doublet score is the average negative log p-value of doublet enrichment
Expand All @@ -276,6 +262,7 @@ def doublet_score(self):
scores (ndarray, ndims=1): Average negative log p-value over iterations
"""

check_is_fitted(self)
if self.n_iters > 1:
with np.errstate(invalid="ignore"): # Silence numpy warning about NaN comparison
avg_log_p = np.mean(np.ma.masked_invalid(self.all_log_p_values_), axis=0)
Expand All @@ -287,7 +274,7 @@ def doublet_score(self):
def _one_fit(self):
if self.verbose:
print("\nCreating synthetic doublets...")
self._createDoublets()
self._create_doublets()

# Normalize combined augmented set
if self.verbose:
Expand Down Expand Up @@ -321,7 +308,7 @@ def _one_fit(self):
solver = "arpack" if sp_sparse.issparse(aug_counts.X) else "auto"
sc.tl.pca(
aug_counts,
n_comps=self.n_components,
n_comps=self._n_components,
random_state=self.random_state,
svd_solver=solver,
)
Expand All @@ -331,7 +318,7 @@ def _one_fit(self):
f = io.StringIO()
with redirect_stdout(f):
fullcommunities, _, _ = phenograph.cluster(
aug_counts.obsm["X_pca"], n_jobs=self.n_jobs, **self.clustering_kwargs
aug_counts.obsm["X_pca"], n_jobs=self.n_jobs, **self._clustering_kwargs
)
out = f.getvalue()
if self.verbose:
Expand All @@ -351,7 +338,7 @@ def _one_fit(self):
aug_counts,
key_added="clusters",
random_state=self.random_state,
**self.clustering_kwargs,
**self._clustering_kwargs,
)
fullcommunities = np.array(aug_counts.obs["clusters"], dtype=int)
min_ID = min(fullcommunities)
Expand Down Expand Up @@ -395,13 +382,13 @@ def _one_fit(self):

return scores, log_p_values

def _createDoublets(self):
def _create_doublets(self):
"""Create synthetic doublets.

Sets .parents_
"""
# Number of synthetic doublets to add
num_synths = int(self.boost_rate * self._num_cells)
num_synths = int(self._boost_rate * self._num_cells)

# Parent indices
choices = np.random.choice(self._num_cells, size=(num_synths, 2), replace=self.replace)
Expand All @@ -415,25 +402,72 @@ def _createDoublets(self):
self.parents_ = parents

def _set_clustering_kwargs(self):
"""Sets .clustering_kwargs"""
"""Sets ._clustering_kwargs"""
self._clustering_kwargs = {}
if isinstance(self.clustering_kwargs, dict):
self._clustering_kwargs.update(self.clustering_kwargs)
if self.clustering_algorithm == "phenograph":
if "prune" not in self.clustering_kwargs:
self.clustering_kwargs["prune"] = True
self.clustering_kwargs = self.clustering_kwargs
if (self.n_iters == 1) and (self.clustering_kwargs.get("prune") is True):
if "prune" not in self._clustering_kwargs:
self._clustering_kwargs["prune"] = True
if (self.n_iters == 1) and (self._clustering_kwargs.get("prune") is True):
warn_msg = (
"Using phenograph parameter prune=False is strongly recommended when "
+ "running only one iteration. Otherwise, expect many NaN labels."
)
warnings.warn(warn_msg)
else:
if "directed" not in self.clustering_kwargs:
self.clustering_kwargs["directed"] = False
if "resolution" not in self.clustering_kwargs:
self.clustering_kwargs["resolution"] = 4
if "key_added" in self.clustering_kwargs:
if "directed" not in self._clustering_kwargs:
self._clustering_kwargs["directed"] = False
if "resolution" not in self._clustering_kwargs:
self._clustering_kwargs["resolution"] = 4
if "key_added" in self._clustering_kwargs:
raise ValueError("'key_added' param cannot be overriden")
if "random_state" in self.clustering_kwargs:
if "random_state" in self._clustering_kwargs:
raise ValueError(
"'random_state' param cannot be overriden. Please use classifier 'random_state'."
)

def _validate_kwargs(self):
"""
Set attrs for use during fit using validation.

Sklearn doesn't like mutating params, do set private attrs for fit.

self._boost_rate, self._n_components are set here and used in fit.
"""
if self.clustering_algorithm not in ["louvain", "phenograph", "leiden"]:
raise ValueError(
"Clustering algorithm needs to be one of ['louvain', 'phenograph', 'leiden']"
)
if self.clustering_algorithm == "leiden":
warnings.warn("Leiden clustering is experimental and results have not been validated.")

if self.random_state:
np.random.seed(self.random_state)

min_comps = min(self._num_cells, self._num_genes)
self._n_components = self.n_components
if self._n_components == 30 and self._n_top_var_genes > 0:
# If user did not change n_components, silently cap it by n_top_var_genes if needed
self._n_components = min(self._n_components, self._n_top_var_genes)
if self._n_components > min_comps:
# can't use more components than genes
warnings.warn(f"Capping n_components to {min_comps}.")
self._n_components = min_comps

self._set_clustering_kwargs()

self._boost_rate = self.boost_rate
if not self.replace and self.boost_rate > 0.5:
warn_msg = (
"boost_rate is trimmed to 0.5 when replace=False."
+ " Set replace=True to use greater boost rates."
)
warnings.warn(warn_msg)
self._boost_rate = 0.5

assert (self._n_top_var_genes == 0) or (
self._n_components <= self._n_top_var_genes
), "n_components={0} cannot be larger than n_top_var_genes={1}".format(
self._n_components, self._n_top_var_genes
)
25 changes: 20 additions & 5 deletions tests/notebooks/PBMC_10k_vignette.ipynb

Large diffs are not rendered by default.

31 changes: 22 additions & 9 deletions tests/test_package.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
import numpy as np
import pytest
from sklearn.utils.estimator_checks import check_estimator

import doubletdetection


def test_sklearn_estimator():
    """BoostClassifier should satisfy sklearn's estimator contract."""
    params = {
        "n_iters": 2,
        "clustering_algorithm": "louvain",
        "standard_scaling": True,
    }
    check_estimator(doubletdetection.BoostClassifier(**params))


def test_classifier():

counts = np.random.poisson(size=(500, 100))
Expand All @@ -11,38 +20,42 @@ def test_classifier():
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="louvain", standard_scaling=True
)
clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
clf.doublet_score()
clf.fit(counts).get_predictions()
clf.get_doublet_scores()

# with phenograph
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="phenograph", standard_scaling=True
)
clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
clf.doublet_score()
clf.fit(counts).get_predictions()
clf.get_doublet_scores()

# with leiden
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="leiden", standard_scaling=True, random_state=123
)
clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
scores1 = clf.doublet_score()
clf.fit(counts).get_predictions()
scores1 = clf.get_doublet_scores()

# test random state
# with leiden
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="leiden", standard_scaling=True, random_state=123
)
clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
scores2 = clf.doublet_score()
clf.fit(counts).get_predictions()
scores2 = clf.get_doublet_scores()
np.testing.assert_equal(scores1, scores2)

# plotting
doubletdetection.plot.convergence(clf, show=False, p_thresh=1e-16, voter_thresh=0.5)
doubletdetection.plot.convergence(
clf,
show=False,
)
doubletdetection.plot.threshold(clf, show=False, p_step=6)

# test that you can't use random clustering algorithm
with pytest.raises(ValueError):
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="my_clusters", standard_scaling=True
)
clf.fit(counts)