diff --git a/docs/api/scanpy_gpu.md b/docs/api/scanpy_gpu.md index 7e883cef..1347ebc4 100644 --- a/docs/api/scanpy_gpu.md +++ b/docs/api/scanpy_gpu.md @@ -108,6 +108,7 @@ Any transformation of the data matrix that is not a tool. Other than `tools`, pr :toctree: generated/ tl.rank_genes_groups_logreg + tl.rank_genes_groups_wilcoxon ``` ## Plotting diff --git a/src/rapids_singlecell/tools/__init__.py b/src/rapids_singlecell/tools/__init__.py index 9bbff9c3..3cf0f40d 100644 --- a/src/rapids_singlecell/tools/__init__.py +++ b/src/rapids_singlecell/tools/__init__.py @@ -7,7 +7,7 @@ from ._draw_graph import draw_graph from ._embedding_density import embedding_density from ._pymde import mde -from ._rank_gene_groups import rank_genes_groups_logreg +from ._rank_gene_groups import rank_genes_groups_logreg, rank_genes_groups_wilcoxon from ._score_genes import score_genes, score_genes_cell_cycle from ._tsne import tsne from ._umap import umap diff --git a/src/rapids_singlecell/tools/_rank_gene_groups.py b/src/rapids_singlecell/tools/_rank_gene_groups.py index 5ad69e0a..4252e4ab 100644 --- a/src/rapids_singlecell/tools/_rank_gene_groups.py +++ b/src/rapids_singlecell/tools/_rank_gene_groups.py @@ -1,10 +1,13 @@ from __future__ import annotations +import warnings from typing import TYPE_CHECKING, Literal import cupy as cp +import cupyx.scipy.special as cupyx_special import numpy as np import pandas as pd +from statsmodels.stats.multitest import multipletests from rapids_singlecell._compat import DaskArray, _meta_dense @@ -228,3 +231,256 @@ def rank_genes_groups_logreg( } adata.uns["rank_genes_groups"]["scores"] = scores adata.uns["rank_genes_groups"]["names"] = names + + +EPS = 1e-9 + + +def _choose_chunk_size(requested: int | None, n_obs: int, dtype_size: int = 8) -> int: + if requested is not None: + return int(requested) + try: + free_mem, _ = cp.cuda.runtime.memGetInfo() + except cp.cuda.runtime.CUDARuntimeError: + return 500 + bytes_per_gene = n_obs * dtype_size * 4 + if bytes_per_gene == 0: + return 500 + max_genes = int(0.6 * free_mem / bytes_per_gene) + return max(min(max_genes, 1000), 100) + + +def _average_ranks(matrix: cp.ndarray) -> cp.ndarray: + ranks = cp.empty_like(matrix, dtype=cp.float64) + for idx in range(matrix.shape[1]): + column = matrix[:, idx] + sorter = cp.argsort(column) + sorted_column = column[sorter] + unique = cp.concatenate( + (cp.array([True]), sorted_column[1:] != sorted_column[:-1]) + ) + dense = cp.empty(column.size, dtype=cp.int64) + dense[sorter] = cp.cumsum(unique) + boundaries = cp.concatenate((cp.flatnonzero(unique), cp.array([unique.size]))) + ranks[:, idx] = 0.5 * (boundaries[dense] + boundaries[dense - 1] + 1.0) + return ranks + + +def _tie_correction(ranks: cp.ndarray) -> cp.ndarray: + correction = cp.ones(ranks.shape[1], dtype=cp.float64) + for idx in range(ranks.shape[1]): + column = cp.sort(ranks[:, idx]) + boundaries = cp.concatenate( + ( + cp.array([0]), + cp.flatnonzero(column[1:] != column[:-1]) + 1, + cp.array([column.size]), + ) + ) + differences = cp.diff(boundaries).astype(cp.float64) + size = cp.float64(column.size) + if size >= 2: + correction[idx] = 1.0 - (differences**3 - differences).sum() / ( + size**3 - size + ) + return correction + + +def rank_genes_groups_wilcoxon( + adata: AnnData, + groupby: str, + *, + groups: Literal["all"] | Iterable[str] = "all", + use_raw: bool | None = None, + reference: str = "rest", + n_genes: int | None = None, + tie_correct: bool = False, + layer: str | None = None, + chunk_size: int | None = None, + corr_method: str = "benjamini-hochberg", +) -> None: + if corr_method not in {"benjamini-hochberg", "bonferroni"}: + msg = "corr_method must be either 'benjamini-hochberg' or 'bonferroni'." + raise ValueError(msg) + if reference != "rest": + msg = "Only reference='rest' is currently supported for the GPU Wilcoxon test." + raise NotImplementedError(msg) + + if groups == "all" or groups is None: + groups_order = "all" + elif isinstance(groups, str | int): + raise ValueError("Specify a sequence of groups") + else: + groups_order = list(groups) + if isinstance(groups_order[0], int): + groups_order = [str(n) for n in groups_order] + + labels = pd.Series(adata.obs[groupby]).reset_index(drop="True") + groups_order, groups_masks = _select_groups(labels, groups_order) + + group_sizes = groups_masks.sum(axis=1).astype(np.int64) + n_cells = labels.shape[0] + for name, size in zip(groups_order, group_sizes, strict=False): + rest = n_cells - size + if size <= 25 or rest <= 25: + warnings.warn( + f"Group {name} has size {size} (rest {rest}); normal approximation " + "of the Wilcoxon statistic may be inaccurate.", + RuntimeWarning, + ) + + if layer and use_raw is True: + raise ValueError("Cannot specify `layer` and have `use_raw=True`.") + elif layer: + X = adata.layers[layer] + var_names = adata.var_names + elif use_raw is None and adata.raw: + print("defaulting to using `.raw`") + X = adata.raw.X + var_names = adata.raw.var_names + elif use_raw is True: + X = adata.raw.X + var_names = adata.raw.var_names + else: + X = adata.X + var_names = adata.var_names + + if hasattr(X, "toarray"): + X = X.toarray() + + n_cells, n_total_genes = X.shape + n_top = n_total_genes if n_genes is None else min(n_genes, n_total_genes) + + group_matrix = cp.asarray(groups_masks.T, dtype=cp.float64) + group_sizes_dev = cp.asarray(group_sizes, dtype=cp.float64) + rest_sizes = n_cells - group_sizes_dev + + base = adata.uns.get("log1p", {}).get("base") + if base is not None: + log_expm1 = lambda arr: cp.expm1(arr * cp.log(base)) + else: + log_expm1 = cp.expm1 + + chunk_width = _choose_chunk_size(chunk_size, n_cells) + group_keys = [str(key) for key in groups_order] + + scores: dict[str, list[np.ndarray]] = {key: [] for key in group_keys} + logfc: dict[str, list[np.ndarray]] = {key: [] for key in group_keys} + pvals: dict[str, list[np.ndarray]] = {key: [] for key in group_keys} + gene_indices: dict[str, list[np.ndarray]] = {key: [] for key in group_keys} + + for start in range(0, n_total_genes, chunk_width): + stop = min(start + chunk_width, n_total_genes) + block = cp.asarray(X[:, start:stop], dtype=cp.float64) + ranks = _average_ranks(block) + if tie_correct: + tie_corr = _tie_correction(ranks) + else: + tie_corr = cp.ones(ranks.shape[1], dtype=cp.float64) + + rank_sums = group_matrix.T @ ranks + expected = group_sizes_dev[:, None] * (n_cells + 1) / 2.0 + variance = tie_corr[None, :] * group_sizes_dev[:, None] * rest_sizes[:, None] + variance *= (n_cells + 1) / 12.0 + std = cp.sqrt(variance) + z = (rank_sums - expected) / std + cp.nan_to_num(z, copy=False) + p_values = 2.0 * (1.0 - cupyx_special.ndtr(cp.abs(z))) + + group_sums = group_matrix.T @ block + group_means = group_sums / group_sizes_dev[:, None] + total_mean = cp.mean(block, axis=0) + rest_sum = total_mean * n_cells - group_means * group_sizes_dev[:, None] + mean_rest = rest_sum / rest_sizes[:, None] + numerator = log_expm1(group_means) + EPS + denominator = log_expm1(mean_rest) + EPS + log_fold = cp.log2(numerator / denominator) + + indices = np.arange(start, stop, dtype=int) + z_host = z.get() + p_host = p_values.get() + logfc_host = log_fold.get() + + for idx, key in enumerate(group_keys): + scores[key].append(z_host[idx]) + logfc[key].append(logfc_host[idx]) + pvals[key].append(p_host[idx]) + gene_indices[key].append(indices) + + var_array = np.asarray(var_names) + structured = {} + for key in group_keys: + all_scores = ( + np.concatenate(scores[key]) if scores[key] else np.empty(0, dtype=float) + ) + all_logfc = ( + np.concatenate(logfc[key]) if logfc[key] else np.empty(0, dtype=float) + ) + all_pvals = ( + np.concatenate(pvals[key]) if pvals[key] else np.empty(0, dtype=float) + ) + all_genes = ( + np.concatenate(gene_indices[key]) + if gene_indices[key] + else np.empty(0, dtype=int) + ) + + clean = np.array(all_pvals, copy=True) + clean[np.isnan(clean)] = 1.0 + if clean.size and corr_method == "benjamini-hochberg": + _, adjusted, _, _ = multipletests(clean, alpha=0.05, method="fdr_bh") + elif clean.size: + adjusted = np.minimum(clean * n_total_genes, 1.0) + else: + adjusted = np.array([], dtype=float) + + if all_scores.size: + order = np.argsort(all_scores)[::-1] + else: + order = np.empty(0, dtype=int) + keep = order[: min(n_top, order.size)] + + structured[key] = { + "scores": all_scores[keep].astype(np.float32, copy=False), + "logfc": all_logfc[keep].astype(np.float32, copy=False), + "pvals": clean[keep].astype(np.float64, copy=False), + "pvals_adj": adjusted[keep].astype(np.float64, copy=False), + "names": var_array[all_genes[keep]].astype("U50", copy=False), + } + + dtype_scores = [(key, "float32") for key in group_keys] + dtype_names = [(key, "U50") for key in group_keys] + dtype_logfc = [(key, "float32") for key in group_keys] + dtype_pvals = [(key, "float64") for key in group_keys] + + adata.uns["rank_genes_groups"] = { + "params": { + "groupby": groupby, + "method": "wilcoxon", + "reference": reference, + "use_raw": use_raw, + "tie_correct": tie_correct, + "layer": layer, + "corr_method": corr_method, + }, + "scores": np.rec.fromarrays( + [structured[key]["scores"] for key in group_keys], + dtype=dtype_scores, + ), + "names": np.rec.fromarrays( + [structured[key]["names"] for key in group_keys], + dtype=dtype_names, + ), + "logfoldchanges": np.rec.fromarrays( + [structured[key]["logfc"] for key in group_keys], + dtype=dtype_logfc, + ), + "pvals": np.rec.fromarrays( + [structured[key]["pvals"] for key in group_keys], + dtype=dtype_pvals, + ), + "pvals_adj": np.rec.fromarrays( + [structured[key]["pvals_adj"] for key in group_keys], + dtype=dtype_pvals, + ), + } diff --git a/tests/test_rank_genes_groups_wilcoxon.py b/tests/test_rank_genes_groups_wilcoxon.py new file mode 100644 index 00000000..29ce9245 --- /dev/null +++ b/tests/test_rank_genes_groups_wilcoxon.py @@ -0,0 +1,164 @@ +from __future__ import annotations + +import numpy as np +import pytest +import scanpy as sc + +import rapids_singlecell as rsc + +cp = pytest.importorskip("cupy") + + +def _require_cuda(): + try: + if cp.cuda.runtime.getDeviceCount() < 1: + pytest.skip("No CUDA devices available for Wilcoxon test.") + except cp.cuda.runtime.CUDARuntimeError: + pytest.skip("CUDA runtime unavailable for Wilcoxon test.") + + +def test_rank_genes_groups_wilcoxon_matches_scanpy_output(): + _require_cuda() + adata_gpu = sc.datasets.blobs(n_variables=6, n_centers=3, n_observations=200) + adata_gpu.obs["blobs"] = adata_gpu.obs["blobs"].astype("category") + adata_cpu = adata_gpu.copy() + + rsc.tl.rank_genes_groups_wilcoxon( + adata_gpu, "blobs", use_raw=False, n_genes=3, corr_method="benjamini-hochberg" + ) + sc.tl.rank_genes_groups( + adata_cpu, + "blobs", + method="wilcoxon", + use_raw=False, + n_genes=3, + tie_correct=False, + ) + + gpu_result = adata_gpu.uns["rank_genes_groups"] + cpu_result = adata_cpu.uns["rank_genes_groups"] + + assert gpu_result["names"].dtype.names == cpu_result["names"].dtype.names + for group in gpu_result["names"].dtype.names: + assert list(gpu_result["names"][group]) == list(cpu_result["names"][group]) + + for field in ("scores", "logfoldchanges", "pvals", "pvals_adj"): + gpu_field = gpu_result[field] + cpu_field = cpu_result[field] + assert gpu_field.dtype.names == cpu_field.dtype.names + for group in gpu_field.dtype.names: + gpu_values = np.asarray(gpu_field[group], dtype=float) + cpu_values = np.asarray(cpu_field[group], dtype=float) + np.testing.assert_allclose( + gpu_values, cpu_values, rtol=1e-5, atol=1e-6, equal_nan=True + ) + + params = gpu_result["params"] + assert params["use_raw"] is False + assert params["corr_method"] == "benjamini-hochberg" + assert params["tie_correct"] is False + assert params["layer"] is None + + +def test_rank_genes_groups_wilcoxon_honors_layer_and_use_raw(): + _require_cuda() + base = sc.datasets.blobs(n_variables=5, n_centers=3, n_observations=150) + base.obs["blobs"] = base.obs["blobs"].astype("category") + base.layers["signal"] = base.X.copy() + + reference = base.copy() + rsc.tl.rank_genes_groups_wilcoxon(reference, "blobs", use_raw=False) + reference_names = reference.uns["rank_genes_groups"]["names"].copy() + + rng = np.random.default_rng(0) + perturbed_matrix = base.X.copy() + perturbed_matrix[rng.integers(0, 2, perturbed_matrix.shape, dtype=bool)] = 0.0 + + layered = base.copy() + layered.X = perturbed_matrix + rsc.tl.rank_genes_groups_wilcoxon(layered, "blobs", layer="signal", use_raw=False) + layered_names = layered.uns["rank_genes_groups"]["names"].copy() + + no_layer = base.copy() + no_layer.X = perturbed_matrix + rsc.tl.rank_genes_groups_wilcoxon(no_layer, "blobs", use_raw=False) + no_layer_names = no_layer.uns["rank_genes_groups"]["names"].copy() + + assert layered_names.dtype.names == reference_names.dtype.names + for group in reference_names.dtype.names: + assert tuple(layered_names[group]) == tuple(reference_names[group]) + differences = [ + tuple(no_layer_names[group]) != tuple(reference_names[group]) + for group in reference_names.dtype.names + ] + assert any(differences) + + +def test_rank_genes_groups_wilcoxon_subset_and_bonferroni(): + _require_cuda() + adata = sc.datasets.blobs(n_variables=5, n_centers=4, n_observations=150) + adata.obs["blobs"] = adata.obs["blobs"].astype("category") + + rsc.tl.rank_genes_groups_wilcoxon( + adata, + "blobs", + groups=["0", "2"], + use_raw=False, + n_genes=2, + corr_method="bonferroni", + ) + + result = adata.uns["rank_genes_groups"] + assert result["scores"].dtype.names == ("0", "2") + assert result["names"].dtype.names == ("0", "2") + for group in result["names"].dtype.names: + observed = np.asarray(result["names"][group]) + assert observed.size == 2 + for group in result["pvals_adj"].dtype.names: + adjusted = np.asarray(result["pvals_adj"][group]) + assert np.all(adjusted <= 1.0) + + +def test_rank_genes_groups_wilcoxon_with_renamed_categories(): + _require_cuda() + adata = sc.datasets.blobs(n_variables=4, n_centers=3, n_observations=200) + adata.obs["blobs"] = adata.obs["blobs"].astype("category") + + rsc.tl.rank_genes_groups_wilcoxon(adata, "blobs") + names = adata.uns["rank_genes_groups"]["names"] + assert names.dtype.names == ("0", "1", "2") + first_run = tuple(names[0]) + + adata.rename_categories("blobs", ["Zero", "One", "Two"]) + assert tuple(adata.uns["rank_genes_groups"]["names"][0]) == first_run + + rsc.tl.rank_genes_groups_wilcoxon(adata, "blobs") + renamed_names = adata.uns["rank_genes_groups"]["names"] + assert tuple(renamed_names[0]) == first_run + assert renamed_names.dtype.names == ("Zero", "One", "Two") + + +def test_rank_genes_groups_wilcoxon_with_unsorted_groups(): + _require_cuda() + adata = sc.datasets.blobs(n_variables=6, n_centers=4, n_observations=180) + adata.obs["blobs"] = adata.obs["blobs"].astype("category") + bdata = adata.copy() + + rsc.tl.rank_genes_groups_wilcoxon(adata, "blobs", groups=["0", "2", "3"]) + rsc.tl.rank_genes_groups_wilcoxon(bdata, "blobs", groups=["3", "0", "2"]) + + assert set(adata.uns["rank_genes_groups"]["names"].dtype.names) == {"0", "2", "3"} + assert set(bdata.uns["rank_genes_groups"]["names"].dtype.names) == {"0", "2", "3"} + + for field in ("scores", "logfoldchanges", "pvals", "pvals_adj"): + np.testing.assert_allclose( + np.asarray(adata.uns["rank_genes_groups"][field]["3"], dtype=float), + np.asarray(bdata.uns["rank_genes_groups"][field]["3"], dtype=float), + rtol=1e-5, + atol=1e-6, + equal_nan=True, + ) + + assert tuple(adata.uns["rank_genes_groups"]["names"]["3"]) == tuple( + bdata.uns["rank_genes_groups"]["names"]["3"] + )