From 5232e12dca4ba2d24bc9935cd43243a42fa8fa35 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Wed, 11 Feb 2026 20:56:58 +0100 Subject: [PATCH 01/30] typo --- src/annbatch/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/annbatch/io.py b/src/annbatch/io.py index a22aa5d8..5ea008cf 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -385,7 +385,7 @@ def __iter__(self) -> Generator[zarr.Group]: @property def is_empty(self) -> bool: - """Wether or not there is an existing store at the group location.""" + """Whether or not there is an existing store at the group location.""" return ( (not (V1_ENCODING.items() <= self._group.attrs.items()) or len(self._dataset_keys) == 0) if isinstance(self._group, zarr.Group) From 5d14e5c71e4c12daf1d5158238f6478b20de5bcb Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Wed, 11 Feb 2026 20:59:22 +0100 Subject: [PATCH 02/30] _collection_added' defined outside --- src/annbatch/loader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/annbatch/loader.py b/src/annbatch/loader.py index 973a47d9..4c23504b 100644 --- a/src/annbatch/loader.py +++ b/src/annbatch/loader.py @@ -161,6 +161,7 @@ class Loader[ _batch_sampler: Sampler _concat_strategy: None | concat_strategies = None _dataset_intervals: pd.IntervalIndex | None = None + _collection_added: bool = False def __init__( self, @@ -312,7 +313,7 @@ def use_collection( """ if collection.is_empty: raise ValueError("DatasetCollection is empty") - if getattr(self, "_collection_added", False): + if self._collection_added: raise RuntimeError( "You should not add multiple collections, independently shuffled - please preshuffle multiple collections, use `add_anndatas` manually if you know what you are doing, or open an issue if you believe that this should be supported at an API level higher than `add_anndatas`." ) From 2558b4017d85aa66fcf7bf696c2b873bf6c2ee97 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Wed, 11 Feb 2026 21:01:26 +0100 Subject: [PATCH 03/30] consistent naming with add_anndatas --- src/annbatch/io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/annbatch/io.py b/src/annbatch/io.py index 5ea008cf..d5b906e4 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -393,7 +393,7 @@ def is_empty(self) -> bool: ) @_with_settings - def add_adatas( + def add_anndatas( self, adata_paths: Iterable[zarr.Group | h5py.Group | PathLike[str] | str], *, @@ -473,7 +473,7 @@ def add_adatas( ... "path/to/second_adata.h5ad", ... "path/to/third_adata.h5ad", ... ] - >>> DatasetCollection("path/to/output/zarr_store.zarr").add_adatas( + >>> DatasetCollection("path/to/output/zarr_store.zarr").add_anndatas( ... datasets, ... load_adata=read_lazy_x_and_obs_only, ...) @@ -647,7 +647,7 @@ def _add_to_collection( Whether or not to shuffle when adding. Otherwise, the incoming data will just be split up and appended. """ if self.is_empty: - raise ValueError("Store is empty. Please run `DatasetCollection.add` first.") + raise ValueError("Store is empty. Please run `DatasetCollection.add_anndatas` first.") # Check for mismatched keys among the inputs. _check_for_mismatched_keys(adata_paths, load_adata=load_adata) From 36af588245f581edfe960a1b955a6aff1bafefc9 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Wed, 11 Feb 2026 21:04:45 +0100 Subject: [PATCH 04/30] ruff format --- tests/test_dataset.py | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 02b56b09..8a80924a 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -104,30 +104,24 @@ def concat(datas: list[Data | ad.AnnData]) -> ListData | list[ad.AnnData]: "gen_loader", [ pytest.param( - lambda collection, - shuffle, - use_zarrs, - chunk_size=chunk_size, - preload_nchunks=preload_nchunks, - open_func=open_func, - batch_size=batch_size, - preload_to_gpu=preload_to_gpu, - concat_strategy=concat_strategy: Loader( - shuffle=shuffle, - chunk_size=chunk_size, - preload_nchunks=preload_nchunks, - return_index=True, - batch_size=batch_size, - preload_to_gpu=preload_to_gpu, - to_torch=False, - concat_strategy=concat_strategy, - ).use_collection( - collection, - **( - {"load_adata": lambda group: open_func(group, use_zarrs=use_zarrs, use_anndata=True)} - if open_func is not None - else {} - ), + lambda collection, shuffle, use_zarrs, chunk_size=chunk_size, preload_nchunks=preload_nchunks, open_func=open_func, batch_size=batch_size, preload_to_gpu=preload_to_gpu, concat_strategy=concat_strategy: ( + Loader( + shuffle=shuffle, + chunk_size=chunk_size, + preload_nchunks=preload_nchunks, + return_index=True, + batch_size=batch_size, + preload_to_gpu=preload_to_gpu, + to_torch=False, + concat_strategy=concat_strategy, + ).use_collection( + collection, + **( + {"load_adata": lambda group: open_func(group, use_zarrs=use_zarrs, use_anndata=True)} + if open_func is not None + else {} + ), + ) ), id=f"chunk_size={chunk_size}-preload_nchunks={preload_nchunks}-open_func={open_func.__name__[5:] if open_func is not None else 'None'}-batch_size={batch_size}{'-cupy' if preload_to_gpu else ''}-concat_strategy={concat_strategy}", # type: ignore[attr-defined] marks=skip_if_no_cupy, From d841e13a60648db0aa2559aa21fd19a05a0a3303 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Wed, 11 Feb 2026 21:06:10 +0100 Subject: [PATCH 05/30] typo2 --- src/annbatch/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/annbatch/loader.py b/src/annbatch/loader.py index 4c23504b..2df59954 100644 --- a/src/annbatch/loader.py +++ b/src/annbatch/loader.py @@ -305,7 +305,7 @@ def use_collection( Parameters ---------- collection - The collection who on-disk datasets should be used in this loader. + The collection whose on-disk datasets should be used in this loader. load_adata A custom load function - recall that whatever is found in :attr:`~anndata.AnnData.X` and :attr:`~anndata.AnnData.obs` will be yielded in batches. Default is to just load `X` and all of `obs`. From 304dcbbcdd6a7bfd5e81ffdc38a91b90553d7ba3 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Wed, 11 Feb 2026 21:38:53 +0100 Subject: [PATCH 06/30] adapt add_anndatas change to tests --- tests/conftest.py | 2 +- tests/test_preshuffle.py | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 45be4996..0c9843c2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -114,7 +114,7 @@ def simple_collection( ) -> tuple[DatasetCollection, ad.AnnData]: zarr_stores = sorted(f for f in adata_with_zarr_path_same_var_space[1].iterdir() if f.is_dir()) output_path = Path(tmpdir_factory.mktemp("zarr_folder")) / "simple_fixture.zarr" - collection = DatasetCollection(output_path).add_adatas( + collection = DatasetCollection(output_path).add_anndatas( zarr_stores, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, diff --git a/tests/test_preshuffle.py b/tests/test_preshuffle.py index 680d6fe1..5a0fae4d 100644 --- a/tests/test_preshuffle.py +++ b/tests/test_preshuffle.py @@ -51,7 +51,7 @@ def test_store_creation_warnings_with_different_keys(elem_name: Literal["obsm", adata_1.write_h5ad(path_1) adata_2.write_h5ad(path_2) with pytest.warns(UserWarning, match=rf"Found {elem_name} keys.* not present in all anndatas"): - DatasetCollection(tmp_path / "collection.zarr").add_adatas( + DatasetCollection(tmp_path / "collection.zarr").add_anndatas( [path_1, path_2], zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -69,7 +69,7 @@ def test_store_creation_no_warnings_with_custom_load(tmp_path: Path): path_2 = tmp_path / "with_extra_key.h5ad" adata_1.write_h5ad(path_1) adata_2.write_h5ad(path_2) - collection = DatasetCollection(tmp_path / "collection.zarr").add_adatas( + collection = DatasetCollection(tmp_path / "collection.zarr").add_anndatas( [path_1, path_2], zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -91,7 +91,7 @@ def test_store_creation_path_added_to_obs(tmp_path: Path): adata_2.write_h5ad(path_2) paths = [path_1, path_2] output_dir = tmp_path / "path_src_collection.zarr" - collection = DatasetCollection(output_dir).add_adatas( + collection = DatasetCollection(output_dir).add_anndatas( paths, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -120,7 +120,7 @@ def test_store_addition_different_keys( adata_orig.write_h5ad(orig_path) output_path = tmp_path / "zarr_store_addition_different_keys.zarr" collection = DatasetCollection(output_path) - collection.add_adatas( + collection.add_anndatas( [orig_path], zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -136,7 +136,7 @@ def test_store_addition_different_keys( additional_path = tmp_path / "with_extra_key.h5ad" adata.write_h5ad(additional_path) with pytest.warns(UserWarning, match=rf"Found {elem_name} keys.* not present in all anndatas"): - collection.add_adatas( + collection.add_anndatas( [additional_path], load_adata=load_adata, zarr_sparse_chunk_size=10, @@ -169,7 +169,7 @@ def test_store_creation_default( else r"Loading h5ad is currently not supported", ): kwargs = {} if is_zarr else {"is_collection_h5ad": True} - collection = DatasetCollection(output_path, **kwargs).add_adatas( + collection = DatasetCollection(output_path, **kwargs).add_anndatas( [adata_with_h5_path_different_var_space[1] / f for f in h5_files if str(f).endswith(".h5ad")] ) assert isinstance( @@ -201,7 +201,7 @@ def test_store_creation( adata_with_h5_path_different_var_space[1].parent / f"zarr_store_creation_test_{shuffle}_{'default_read' if load_adata is None else 'custom_read'}.zarr" ) - collection = DatasetCollection(output_path).add_adatas( + collection = DatasetCollection(output_path).add_anndatas( [adata_with_h5_path_different_var_space[1] / f for f in h5_files if str(f).endswith(".h5ad")], var_subset=var_subset, zarr_sparse_chunk_size=10, @@ -292,7 +292,7 @@ def test_mismatched_raw_concat( h5_files = sorted(adata_with_h5_path_different_var_space[1].iterdir()) output_path = adata_with_h5_path_different_var_space[1].parent / "zarr_store_creation_test_heterogeneous.zarr" h5_paths = [adata_with_h5_path_different_var_space[1] / f for f in h5_files if str(f).endswith(".h5ad")] - collection = DatasetCollection(output_path).add_adatas( + collection = DatasetCollection(output_path).add_anndatas( h5_paths, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -337,7 +337,7 @@ def test_store_extension( additional = all_h5_paths[4:] # don't add everything to get a "different" var space # create new store collection = DatasetCollection(store_path) - collection.add_adatas( + collection.add_anndatas( original, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -348,7 +348,7 @@ def test_store_extension( shuffle=True, ) # add h5ads to existing store - collection.add_adatas( + collection.add_anndatas( additional, load_adata=load_adata, zarr_sparse_chunk_size=10, @@ -379,5 +379,5 @@ def test_empty(tmp_path: Path): assert collection.is_empty # Doesn't matter what errors as long as this function runs, but not to completion with pytest.raises(TypeError): - collection.add_adatas() + collection.add_anndatas() assert not (V1_ENCODING.items() <= g.attrs.items()) From 4958749997a24144928785d61e70721ab6d5d00f Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Wed, 11 Feb 2026 22:07:27 +0100 Subject: [PATCH 07/30] add torch and h5py to mypy ignore_missing_imports --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 79dafe73..c06c9bd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -180,7 +180,7 @@ omit = [ ] [[tool.mypy.overrides]] -module = [ "anndata.*", "cupyx.*", "cupy.*" ] +module = [ "anndata.*", "cupyx.*", "cupy.*", "torch.*", "h5py.*" ] ignore_missing_imports = true [tool.cruft] From 830f2d4ff88b81cf8ddc6cc6d2eec9e7300cd12e Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Wed, 11 Feb 2026 22:09:17 +0100 Subject: [PATCH 08/30] fix Mapping.copy() call in write_sharded callback --- src/annbatch/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/annbatch/io.py b/src/annbatch/io.py index d5b906e4..87114af2 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -104,7 +104,7 @@ def callback( iospec: ad.experimental.IOSpec, ): # Ensure we're not overriding anything here - dataset_kwargs = dataset_kwargs.copy() + dataset_kwargs = dict(dataset_kwargs) if iospec.encoding_type in {"array"} and ( any(n in store.name for n in {"obsm", "layers", "obsp"}) or "X" == elem_name ): From 6d6067ac05bb2cdb8950b767a0477028b8334cfa Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Wed, 11 Feb 2026 22:09:32 +0100 Subject: [PATCH 09/30] wrap categories in pd.Index for Categorical.from_codes --- src/annbatch/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/annbatch/io.py b/src/annbatch/io.py index 87114af2..2d4f963e 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -182,7 +182,7 @@ def _lazy_load_anndatas[T: zarr.Group | h5py.Group | PathLike[str] | str]( adata = load_adata(path) # Track the source file for this given anndata object adata.obs["src_path"] = pd.Categorical.from_codes( - np.ones((adata.shape[0],), dtype="int") * i, categories=[str(p) for p in paths] + np.ones((adata.shape[0],), dtype="int") * i, categories=pd.Index([str(p) for p in paths]) ) # Concatenating Dataset2D drops categoricals so we need to track them if isinstance(adata.obs, Dataset2D): From 0f5aa1df5a4d8ffdd760ffaa63e5ac44e6bd1040 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Wed, 11 Feb 2026 22:09:40 +0100 Subject: [PATCH 10/30] add asserts for match/case narrowing and rename idxs variable --- src/annbatch/io.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/annbatch/io.py b/src/annbatch/io.py index 2d4f963e..fb094d4d 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -239,11 +239,13 @@ def _create_chunks_for_shuffling( if use_single_chunking: return [np.concatenate(idxs)] # unfortunately, this is the only way to prevent numpy.split from trying to np.array the idxs list, which can have uneven elements. - idxs = np.array([slice(int(idx[0]), int(idx[-1] + 1)) for idx in idxs]) + idxs_as_slices = np.array([slice(int(idx[0]), int(idx[-1] + 1)) for idx in idxs]) return [ np.concatenate([np.arange(s.start, s.stop) for s in idx]) for idx in ( - split_given_size(idxs, n_slices_per_dataset) if n_chunkings is None else np.array_split(idxs, n_chunkings) + split_given_size(idxs_as_slices, n_slices_per_dataset) + if n_chunkings is None + else np.array_split(idxs_as_slices, n_chunkings) ) ] From 12830d5082521d6087c58983b81e4362e8089b79 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Wed, 11 Feb 2026 22:14:42 +0100 Subject: [PATCH 11/30] is none == is none works better with mypy --- src/annbatch/io.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/annbatch/io.py b/src/annbatch/io.py index fb094d4d..f563f5e7 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -226,15 +226,14 @@ def _create_chunks_for_shuffling( idxs = split_given_size(np.arange(n_obs), shuffle_chunk_size) if shuffle: random.shuffle(idxs) - match shuffle_n_obs_per_dataset is not None, n_chunkings is not None: - case True, False: - n_slices_per_dataset = int(shuffle_n_obs_per_dataset // shuffle_chunk_size) - use_single_chunking = n_obs <= shuffle_n_obs_per_dataset or n_slices_per_dataset <= 1 - case False, True: - n_slices_per_dataset = (n_obs // n_chunkings) // shuffle_chunk_size - use_single_chunking = n_chunkings == 1 - case _, _: - raise ValueError("Cannot provide both shuffle_n_obs_per_dataset and n_chunkings or neither") + if (shuffle_n_obs_per_dataset is None) == (n_chunkings is None): + raise ValueError("Cannot provide both shuffle_n_obs_per_dataset and n_chunkings or neither") + elif shuffle_n_obs_per_dataset is not None: + n_slices_per_dataset = int(shuffle_n_obs_per_dataset // shuffle_chunk_size) + use_single_chunking = n_obs <= shuffle_n_obs_per_dataset or n_slices_per_dataset <= 1 + else: # n_chunkings is not None + n_slices_per_dataset = (n_obs // n_chunkings) // shuffle_chunk_size + use_single_chunking = n_chunkings == 1 # In this case `shuffle_n_obs_per_dataset` is bigger than the size of the dataset or the slice size is probably too big. if use_single_chunking: return [np.concatenate(idxs)] From a60774cca976ca33a0f400786f3ad8bf5e7b3925 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Thu, 12 Feb 2026 17:08:12 +0100 Subject: [PATCH 12/30] other add_anndatas renames + changelog --- CHANGELOG.md | 1 + README.md | 2 +- docs/index.md | 2 +- docs/notebooks/example.ipynb | 12 ++++++------ 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 090222bd..5e6f9165 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning][]. ## [0.0.8] - {class}`~annbatch.Loader` acccepts an `rng` argument now +- Renamed {meth}`annbatch.DatasetCollection.add_adatas` to {meth}`annbatch.DatasetCollection.add_anndatas` to be consistent with the rest of the API. ## [0.0.7] diff --git a/README.md b/README.md index 5128594a..7f11e7ea 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ zarr.config.set( # Create a collection at the given path. The subgroups will all be anndata stores. collection = DatasetCollection("path/to/output/collection.zarr") -collection.add_adatas( +collection.add_anndatas( adata_paths=[ "path/to/your/file1.h5ad", "path/to/your/file2.h5ad" diff --git a/docs/index.md b/docs/index.md index d94024b5..6263f2e4 100644 --- a/docs/index.md +++ b/docs/index.md @@ -9,7 +9,7 @@ Let's go through the above example: ### Preprocessing ```python -colleciton = DatasetCollection("path/to/output/store.zarr").add_adatas( +colleciton = DatasetCollection("path/to/output/store.zarr").add_anndatas( adata_paths=[ "path/to/your/file1.h5ad", "path/to/your/file2.h5ad" diff --git a/docs/notebooks/example.ipynb b/docs/notebooks/example.ipynb index f7085129..86efd256 100644 --- a/docs/notebooks/example.ipynb +++ b/docs/notebooks/example.ipynb @@ -133,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "tags": [ "hide-output" @@ -198,7 +198,7 @@ "\n", "\n", "collection = DatasetCollection(zarr.open(\"annbatch_collection\", mode=\"w\"))\n", - "collection.add_adatas(\n", + "collection.add_anndatas(\n", " # List all the h5ad files you want to include in the collection\n", " adata_paths=[\"866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad\", \"f81463b8-4986-4904-a0ea-20ff02cbb317.h5ad\"],\n", " # Path to store the output collection\n", @@ -328,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "tags": [ "hide-output" @@ -363,7 +363,7 @@ } ], "source": [ - "collection.add_adatas(\n", + "collection.add_anndatas(\n", " adata_paths=[\n", " \"866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad\",\n", " ],\n", @@ -381,7 +381,7 @@ ], "metadata": { "kernelspec": { - "display_name": "venv", + "display_name": "annbatch", "language": "python", "name": "python3" }, @@ -395,7 +395,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.12.12" } }, "nbformat": 4, From 9c495b3d9e9c644692349175966dda2bdf2d0697 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 16:08:32 +0000 Subject: [PATCH 13/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_dataset.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 8a80924a..1d010c36 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -104,7 +104,15 @@ def concat(datas: list[Data | ad.AnnData]) -> ListData | list[ad.AnnData]: "gen_loader", [ pytest.param( - lambda collection, shuffle, use_zarrs, chunk_size=chunk_size, preload_nchunks=preload_nchunks, open_func=open_func, batch_size=batch_size, preload_to_gpu=preload_to_gpu, concat_strategy=concat_strategy: ( + lambda collection, + shuffle, + use_zarrs, + chunk_size=chunk_size, + preload_nchunks=preload_nchunks, + open_func=open_func, + batch_size=batch_size, + preload_to_gpu=preload_to_gpu, + concat_strategy=concat_strategy: ( Loader( shuffle=shuffle, chunk_size=chunk_size, From 7f33e7852059d99370b11e2892166efcc46fe9a8 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 13 Feb 2026 12:16:11 +0100 Subject: [PATCH 14/30] Revert "is none == is none works better with mypy" This reverts commit 12830d5082521d6087c58983b81e4362e8089b79. --- src/annbatch/io.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/annbatch/io.py b/src/annbatch/io.py index f563f5e7..fb094d4d 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -226,14 +226,15 @@ def _create_chunks_for_shuffling( idxs = split_given_size(np.arange(n_obs), shuffle_chunk_size) if shuffle: random.shuffle(idxs) - if (shuffle_n_obs_per_dataset is None) == (n_chunkings is None): - raise ValueError("Cannot provide both shuffle_n_obs_per_dataset and n_chunkings or neither") - elif shuffle_n_obs_per_dataset is not None: - n_slices_per_dataset = int(shuffle_n_obs_per_dataset // shuffle_chunk_size) - use_single_chunking = n_obs <= shuffle_n_obs_per_dataset or n_slices_per_dataset <= 1 - else: # n_chunkings is not None - n_slices_per_dataset = (n_obs // n_chunkings) // shuffle_chunk_size - use_single_chunking = n_chunkings == 1 + match shuffle_n_obs_per_dataset is not None, n_chunkings is not None: + case True, False: + n_slices_per_dataset = int(shuffle_n_obs_per_dataset // shuffle_chunk_size) + use_single_chunking = n_obs <= shuffle_n_obs_per_dataset or n_slices_per_dataset <= 1 + case False, True: + n_slices_per_dataset = (n_obs // n_chunkings) // shuffle_chunk_size + use_single_chunking = n_chunkings == 1 + case _, _: + raise ValueError("Cannot provide both shuffle_n_obs_per_dataset and n_chunkings or neither") # In this case `shuffle_n_obs_per_dataset` is bigger than the size of the dataset or the slice size is probably too big. if use_single_chunking: return [np.concatenate(idxs)] From 462ca485f4c2850eb6102176ff4a728441305f57 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 13 Feb 2026 12:22:10 +0100 Subject: [PATCH 15/30] update changelogs --- CHANGELOG.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e6f9165..ad727e18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning][]. ## [0.0.8] - {class}`~annbatch.Loader` acccepts an `rng` argument now -- Renamed {meth}`annbatch.DatasetCollection.add_adatas` to {meth}`annbatch.DatasetCollection.add_anndatas` to be consistent with the rest of the API. +- Renamed `annbatch.DatasetCollection.add_adatas` to {meth}`annbatch.DatasetCollection.add_anndatas` to be consistent with the rest of the API. ## [0.0.7] @@ -36,7 +36,7 @@ and this project adheres to [Semantic Versioning][]. ## [0.0.4] -- Load into memory nullables/categoricals from `obs` by default when shuffling (i.e., no custom `load_adata` argument to {meth}`annbatch.DatasetCollection.add_adatas`) +- Load into memory nullables/categoricals from `obs` by default when shuffling (i.e., no custom `load_adata` argument to {meth}`annbatch.DatasetCollection.add_anndatas` (formerly `add_adatas`)) ## [0.0.3] @@ -50,9 +50,9 @@ and this project adheres to [Semantic Versioning][]. ### Breaking - `ZarrSparseDataset` and `ZarrDenseDataset` have been conslidated into {class}`annbatch.Loader` -- `create_anndata_collection` and `add_to_collection` have been moved into the {meth}`annbatch.DatasetCollection.add_adatas` method -- Default reading of input data is now fully lazy in {meth}`annbatch.DatasetCollection.add_adatas`, and therefore the shuffle process may now be slower although have better memory properties. Use `load_adata` argument in {meth}`annbatch.DatasetCollection.add_adatas` to customize this behavior. -- Files shuffled under the old `create_anndata_collection` will not be recognized by {class}`annbatch.DatasetCollection` and therefore are not usable with the new {class}`annbatch.Loader.use_collection` API. At the moment, the file metadata we maintain is only for internal purposes - however, if you wish to migrate to be able to use {class}`annbatch.DatasetCollection` in conjunction with {class}`annbatch.Loader.use_collection`, the root folder of the old collection must have attrs `{"encoding-type": "annbatch-preshuffled", "encoding-version": "0.1.0"}` and be a {class}`zarr.Group`. The subfolders (i.e., datasets) must be called `dataset_([0-9]*)`. Otherwise you can use the {meth}`annbatch.Loader.add_anndatas` as before. +- `create_anndata_collection` and `add_to_collection` have been moved into the {meth}`annbatch.DatasetCollection.add_anndatas` (formerly `add_adatas`) method +- Default reading of input data is now fully lazy in {meth}`annbatch.DatasetCollection.add_anndatas` (formerly `add_adatas`), and therefore the shuffle process may now be slower although have better memory properties. Use `load_adata` argument in {meth}`annbatch.DatasetCollection.add_anndatas` (formerly `add_adatas`) to customize this behavior. +- Files shuffled under the old `create_anndata_collection` will not be recognized by {class}`annbatch.DatasetCollection` and therefore are not usable with the new {class}`annbatch.Loader.use_collection` API. At the moment, the file metadata we maintain is only for internal purposes - however, if you wish to migrate to be able to use {class}`annbatch.DatasetCollection` in conjunction with {class}`annbatch.Loader.use_collection`, the root folder of the old collection must have attrs `{"encoding-type": "annbatch-preshuffled", "encoding-version": "0.1.0"}` and be a {class}`zarr.Group`. The subfolders (i.e., datasets) must be called `dataset_([0-9]*)`. Otherwise you can use the {meth}`annbatch.DatasetCollection.add_anndatas` (formerly `add_adatas`) as before. ### Changed From 400ee88a98588f7134c77359cfc572c4fed5bd47 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 13 Feb 2026 12:22:46 +0100 Subject: [PATCH 16/30] updatechangelog again --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad727e18..0259c504 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning][]. ## [0.0.8] - {class}`~annbatch.Loader` acccepts an `rng` argument now + +### Breaking - Renamed `annbatch.DatasetCollection.add_adatas` to {meth}`annbatch.DatasetCollection.add_anndatas` to be consistent with the rest of the API. ## [0.0.7] From fea4e70237b55aa73d265d5fc511f93b21ff353a Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 13 Feb 2026 20:29:47 +0100 Subject: [PATCH 17/30] update changelog --- CHANGELOG.md | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0259c504..44f38e4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,13 +8,15 @@ and this project adheres to [Semantic Versioning][]. [keep a changelog]: https://keepachangelog.com/en/1.0.0/ [semantic versioning]: https://semver.org/spec/v2.0.0.html -## [0.0.8] - -- {class}`~annbatch.Loader` acccepts an `rng` argument now +## [0.0.9] ### Breaking - Renamed `annbatch.DatasetCollection.add_adatas` to {meth}`annbatch.DatasetCollection.add_anndatas` to be consistent with the rest of the API. +## [0.0.8] + +- {class}`~annbatch.Loader` acccepts an `rng` argument now + ## [0.0.7] - Make the in-memory concatenation strategy configurable for {meth}`annbatch.Loader.__iter__` via a `concat_strategy` argument to `__init__` - sparse on-disk will concatenated then shuffled/yielded (faster, higher memory usage) but dense will be shuffled and then concated/yielded (lower memory usage). @@ -38,7 +40,7 @@ and this project adheres to [Semantic Versioning][]. ## [0.0.4] -- Load into memory nullables/categoricals from `obs` by default when shuffling (i.e., no custom `load_adata` argument to {meth}`annbatch.DatasetCollection.add_anndatas` (formerly `add_adatas`)) +- Load into memory nullables/categoricals from `obs` by default when shuffling (i.e., no custom `load_adata` argument to `annbatch.DatasetCollection.add_adatas`) ## [0.0.3] @@ -52,9 +54,9 @@ and this project adheres to [Semantic Versioning][]. ### Breaking - `ZarrSparseDataset` and `ZarrDenseDataset` have been conslidated into {class}`annbatch.Loader` -- `create_anndata_collection` and `add_to_collection` have been moved into the {meth}`annbatch.DatasetCollection.add_anndatas` (formerly `add_adatas`) method -- Default reading of input data is now fully lazy in {meth}`annbatch.DatasetCollection.add_anndatas` (formerly `add_adatas`), and therefore the shuffle process may now be slower although have better memory properties. Use `load_adata` argument in {meth}`annbatch.DatasetCollection.add_anndatas` (formerly `add_adatas`) to customize this behavior. -- Files shuffled under the old `create_anndata_collection` will not be recognized by {class}`annbatch.DatasetCollection` and therefore are not usable with the new {class}`annbatch.Loader.use_collection` API. At the moment, the file metadata we maintain is only for internal purposes - however, if you wish to migrate to be able to use {class}`annbatch.DatasetCollection` in conjunction with {class}`annbatch.Loader.use_collection`, the root folder of the old collection must have attrs `{"encoding-type": "annbatch-preshuffled", "encoding-version": "0.1.0"}` and be a {class}`zarr.Group`. The subfolders (i.e., datasets) must be called `dataset_([0-9]*)`. Otherwise you can use the {meth}`annbatch.DatasetCollection.add_anndatas` (formerly `add_adatas`) as before. +- `create_anndata_collection` and `add_to_collection` have been moved into the `annbatch.DatasetCollection.add_adatas` method +- Default reading of input data is now fully lazy in `annbatch.DatasetCollection.add_adatas`, and therefore the shuffle process may now be slower although have better memory properties. Use `load_adata` argument in `annbatch.DatasetCollection.add_adatas` to customize this behavior. +- Files shuffled under the old `create_anndata_collection` will not be recognized by {class}`annbatch.DatasetCollection` and therefore are not usable with the new {class}`annbatch.Loader.use_collection` API. At the moment, the file metadata we maintain is only for internal purposes - however, if you wish to migrate to be able to use {class}`annbatch.DatasetCollection` in conjunction with {class}`annbatch.Loader.use_collection`, the root folder of the old collection must have attrs `{"encoding-type": "annbatch-preshuffled", "encoding-version": "0.1.0"}` and be a {class}`zarr.Group`. The subfolders (i.e., datasets) must be called `dataset_([0-9]*)`. Otherwise you can use the `annbatch.DatasetCollection.add_adatas` as before. ### Changed From ac14eb9914015dbec7778dc3a776e4e2484ba7a9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Feb 2026 12:45:29 +0000 Subject: [PATCH 18/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pyproject.toml | 1 + tests/test_dataset.py | 10 +--------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9d8585c5..1114211d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -167,6 +167,7 @@ run.omit = [ run.patch = [ "subprocess" ] run.source = [ "annbatch" ] +[tool.mypy] [[tool.mypy.overrides]] overrides = [ { module = [ "anndata.*", "cupyx.*", "cupy.*", "torch.*", "h5py.*" ], ignore_missing_imports = true } ] diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 1d010c36..8a80924a 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -104,15 +104,7 @@ def concat(datas: list[Data | ad.AnnData]) -> ListData | list[ad.AnnData]: "gen_loader", [ pytest.param( - lambda collection, - shuffle, - use_zarrs, - chunk_size=chunk_size, - preload_nchunks=preload_nchunks, - open_func=open_func, - batch_size=batch_size, - preload_to_gpu=preload_to_gpu, - concat_strategy=concat_strategy: ( + lambda collection, shuffle, use_zarrs, chunk_size=chunk_size, preload_nchunks=preload_nchunks, open_func=open_func, batch_size=batch_size, preload_to_gpu=preload_to_gpu, concat_strategy=concat_strategy: ( Loader( shuffle=shuffle, chunk_size=chunk_size, From 8d2387117749c424ece246b465bf4754f12d0c28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Selman=20=C3=96zleyen?= <32667648+selmanozleyen@users.noreply.github.com> Date: Tue, 24 Feb 2026 11:08:52 +0100 Subject: [PATCH 19/30] Update src/annbatch/io.py Co-authored-by: Ilan Gold --- src/annbatch/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/annbatch/io.py b/src/annbatch/io.py index fb094d4d..e5b0d1bb 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -397,7 +397,7 @@ def is_empty(self) -> bool: @_with_settings def add_anndatas( self, - adata_paths: Iterable[zarr.Group | h5py.Group | PathLike[str] | str], + anndata_paths: Iterable[zarr.Group | h5py.Group | PathLike[str] | str], *, load_adata: Callable[[zarr.Group | h5py.Group | PathLike[str] | str], ad.AnnData] = _default_load_adata, var_subset: Iterable[str] | None = None, From 0129dc816ff688420118dc082d7c478f63b6963f Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Tue, 24 Feb 2026 11:12:59 +0100 Subject: [PATCH 20/30] anndata_paths --- CHANGELOG.md | 3 ++- README.md | 2 +- docs/index.md | 2 +- docs/notebooks/example.ipynb | 6 +++--- src/annbatch/io.py | 22 +++++++++++----------- 5 files changed, 18 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 44f38e4e..3561f2b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning][]. ## [0.0.9] ### Breaking -- Renamed `annbatch.DatasetCollection.add_adatas` to {meth}`annbatch.DatasetCollection.add_anndatas` to be consistent with the rest of the API. +- Renamed `annbatch.DatasetCollection.add_adatas` to {meth}`annbatch.DatasetCollection.add_anndatas` and renamed `adata_paths` argument to `anndata_paths` to be consistent with the rest of the API. ## [0.0.8] @@ -51,6 +51,7 @@ and this project adheres to [Semantic Versioning][]. ## [0.0.2] + ### Breaking - `ZarrSparseDataset` and `ZarrDenseDataset` have been conslidated into {class}`annbatch.Loader` diff --git a/README.md b/README.md index b8caba79..7452c0df 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ zarr.config.set( # Create a collection at the given path. The subgroups will all be anndata stores. collection = DatasetCollection("path/to/output/collection.zarr") collection.add_anndatas( - adata_paths=[ + anndata_paths=[ "path/to/your/file1.h5ad", "path/to/your/file2.h5ad" ], diff --git a/docs/index.md b/docs/index.md index 08e1335c..d720d3d7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,7 +10,7 @@ Let's go through the above example: ```python colleciton = DatasetCollection("path/to/output/store.zarr").add_anndatas( - adata_paths=[ + anndata_paths=[ "path/to/your/file1.h5ad", "path/to/your/file2.h5ad" ], diff --git a/docs/notebooks/example.ipynb b/docs/notebooks/example.ipynb index 86efd256..9e97921e 100644 --- a/docs/notebooks/example.ipynb +++ b/docs/notebooks/example.ipynb @@ -118,7 +118,7 @@ "metadata": {}, "source": [ "The conversion code will take care of the following things:\n", - "* Align (outer join) the gene spaces across all datasets listed in `adata_paths`\n", + "* Align (outer join) the gene spaces across all datasets listed in `anndata_paths`\n", " * The gene spaces are outer-joined based on the gene names provided in the `var_names` field of the individual `AnnData` objects.\n", " * If you want to subset to specific gene space, you can provide a list of gene names via the `var_subset` parameter.\n", "* Shuffle the cells across all datasets (this works on larger than memory datasets as well).\n", @@ -200,7 +200,7 @@ "collection = DatasetCollection(zarr.open(\"annbatch_collection\", mode=\"w\"))\n", "collection.add_anndatas(\n", " # List all the h5ad files you want to include in the collection\n", - " adata_paths=[\"866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad\", \"f81463b8-4986-4904-a0ea-20ff02cbb317.h5ad\"],\n", + " anndata_paths=[\"866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad\", \"f81463b8-4986-4904-a0ea-20ff02cbb317.h5ad\"],\n", " # Path to store the output collection\n", " shuffle=True, # Whether to pre-shuffle the cells of the collection\n", " n_obs_per_dataset=2_097_152, # Number of cells per dataset shard, this number is much higher than available in these datasets but is generally a good target\n", @@ -364,7 +364,7 @@ ], "source": [ "collection.add_anndatas(\n", - " adata_paths=[\n", + " anndata_paths=[\n", " \"866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad\",\n", " ],\n", " load_adata=read_lazy_x_and_obs_only,\n", diff --git a/src/annbatch/io.py b/src/annbatch/io.py index e5b0d1bb..767324a6 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -425,12 +425,12 @@ def add_anndatas( Parameters ---------- - adata_paths + anndata_paths Paths to the AnnData files used to create the zarr store. load_adata Function to customize (lazy-)loading the invidiual input anndata files. By default, :func:`anndata.experimental.read_lazy` is used with categoricals/nullables read into memory and `(-1)` chunks for `obs`. If you only need a subset of the input anndata files' elems (e.g., only `X` and certain `obs` columns), you can provide a custom function here to speed up loading and harmonize your data. - Beware that concatenating nullables/categoricals (i.e., what happens if `len(adata_paths) > 1` internally in this function) from {class}`anndata.experimental.backed.Dataset2D` `obs` is very time consuming - consider loading these into memory if you use this argument. + Beware that concatenating nullables/categoricals (i.e., what happens if `len(anndata_paths) > 1` internally in this function) from {class}`anndata.experimental.backed.Dataset2D` `obs` is very time consuming - consider loading these into memory if you use this argument. var_subset Subset of gene names to include in the store. If None, all genes are included. Genes are subset based on the `var_names` attribute of the concatenated AnnData object. @@ -483,7 +483,7 @@ def add_anndatas( if shuffle_chunk_size > n_obs_per_dataset: raise ValueError("Cannot have a large slice size than observations per dataset") shared_kwargs = { - "adata_paths": adata_paths, + "anndata_paths": anndata_paths, "load_adata": load_adata, "zarr_sparse_chunk_size": zarr_sparse_chunk_size, "zarr_sparse_shard_size": zarr_sparse_shard_size, @@ -503,7 +503,7 @@ def add_anndatas( def _create_collection( self, *, - adata_paths: Iterable[PathLike[str]] | Iterable[str], + anndata_paths: Iterable[PathLike[str]] | Iterable[str], load_adata: Callable[[PathLike[str] | str], ad.AnnData] = _default_load_adata, var_subset: Iterable[str] | None = None, zarr_sparse_chunk_size: int = 32768, @@ -528,7 +528,7 @@ def _create_collection( Parameters ---------- - adata_paths + anndata_paths Paths to the AnnData files used to create the zarr store. load_adata Function to customize lazy-loading the invidiual input anndata files. By default, :func:`anndata.experimental.read_lazy` is used. @@ -563,8 +563,8 @@ def _create_collection( """ if not self.is_empty: raise RuntimeError("Cannot create a collection at a location that already has a shuffled collection") - _check_for_mismatched_keys(adata_paths, load_adata=load_adata) - adata_concat = _lazy_load_anndatas(adata_paths, load_adata=load_adata) + _check_for_mismatched_keys(anndata_paths, load_adata=load_adata) + adata_concat = _lazy_load_anndatas(anndata_paths, load_adata=load_adata) adata_concat.obs_names_make_unique() n_obs_per_dataset = min(adata_concat.shape[0], n_obs_per_dataset) chunks = _create_chunks_for_shuffling( @@ -606,7 +606,7 @@ def _create_collection( def _add_to_collection( self, *, - adata_paths: Iterable[PathLike[str]] | Iterable[str], + anndata_paths: Iterable[PathLike[str]] | Iterable[str], load_adata: Callable[[PathLike[str] | str], ad.AnnData] = ad.read_h5ad, zarr_sparse_chunk_size: int = 32768, zarr_sparse_shard_size: int = 134_217_728, @@ -623,7 +623,7 @@ def _add_to_collection( Parameters ---------- - adata_paths + anndata_paths Paths to the anndata files to be appended to the collection of output chunks. load_adata Function to customize loading the invidiual input anndata files. By default, :func:`anndata.read_h5ad` is used. @@ -651,9 +651,9 @@ def _add_to_collection( if self.is_empty: raise ValueError("Store is empty. Please run `DatasetCollection.add_anndatas` first.") # Check for mismatched keys among the inputs. - _check_for_mismatched_keys(adata_paths, load_adata=load_adata) + _check_for_mismatched_keys(anndata_paths, load_adata=load_adata) - adata_concat = _lazy_load_anndatas(adata_paths, load_adata=load_adata) + adata_concat = _lazy_load_anndatas(anndata_paths, load_adata=load_adata) if math.ceil(adata_concat.shape[0] / shuffle_chunk_size) < len(self._dataset_keys): raise ValueError( f"Use a shuffle size small enough to distribute the input data with {adata_concat.shape[0]} obs across {len(self._dataset_keys)} anndata stores." From d8c9577e2c2448b132e597f97627628bc4811521 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Tue, 24 Feb 2026 11:18:47 +0100 Subject: [PATCH 21/30] load_adata to load_anndata --- CHANGELOG.md | 1 + README.md | 4 ++-- docs/notebooks/example.ipynb | 12 ++++++------ src/annbatch/io.py | 38 ++++++++++++++++++------------------ src/annbatch/loader.py | 11 +++++++---- tests/test_dataset.py | 4 ++-- tests/test_preshuffle.py | 26 ++++++++++++------------ 7 files changed, 50 insertions(+), 46 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3561f2b4..62dc1bdc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning][]. ### Breaking - Renamed `annbatch.DatasetCollection.add_adatas` to {meth}`annbatch.DatasetCollection.add_anndatas` and renamed `adata_paths` argument to `anndata_paths` to be consistent with the rest of the API. +- In similar fashion, renamed `load_adata` argument to `load_anndata` in {meth}`annbatch.DatasetCollection.add_anndatas` and {meth}`annbatch.DatasetCollection.use_collection`. ## [0.0.8] diff --git a/README.md b/README.md index 7452c0df..c701dc87 100644 --- a/README.md +++ b/README.md @@ -127,9 +127,9 @@ with ad.settings.override(remove_unused_categories=False): preload_nchunks=256, ) # `use_collection` automatically uses the on-disk `X` and full `obs` in the `Loader` - # but the `load_adata` arg can override this behavior + # but the `load_anndata` arg can override this behavior # (see `custom_load_func` above for an example of customization). - ds = ds.use_collection(collection, load_adata = custom_load_func) + ds = ds.use_collection(collection, load_anndata = custom_load_func) # Iterate over dataloader (plugin replacement for torch.utils.DataLoader) for batch in ds: diff --git a/docs/notebooks/example.ipynb b/docs/notebooks/example.ipynb index 9e97921e..52da2652 100644 --- a/docs/notebooks/example.ipynb +++ b/docs/notebooks/example.ipynb @@ -178,7 +178,7 @@ "\n", "\n", "# For CELLxGENE data, the raw counts can either be found under .raw.X or under .X (if .raw is not supplied).\n", - "# To have a store that only contains raw counts, we can write the following load_adata function\n", + "# To have a store that only contains raw counts, we can write the following load_anndata function\n", "def read_lazy_x_and_obs_only(path) -> ad.AnnData:\n", " \"\"\"Custom load function to only load raw counts from CxG data.\"\"\"\n", " # IMPORTANT: Large data should always be loaded lazily to reduce the memory footprint\n", @@ -205,7 +205,7 @@ " shuffle=True, # Whether to pre-shuffle the cells of the collection\n", " n_obs_per_dataset=2_097_152, # Number of cells per dataset shard, this number is much higher than available in these datasets but is generally a good target\n", " var_subset=None, # Optionally subset the collection to a specific gene space\n", - " load_adata=read_lazy_x_and_obs_only,\n", + " load_anndata=read_lazy_x_and_obs_only,\n", ")" ] }, @@ -227,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "tags": [ "hide-output" @@ -251,7 +251,7 @@ "from annbatch import Loader\n", "\n", "\n", - "def _load_adata(g: zarr.Group) -> ad.AnnData:\n", + "def _load_anndata(g: zarr.Group) -> ad.AnnData:\n", " return ad.AnnData(X=ad.io.sparse_dataset(g[\"X\"]), obs=ad.experimental.read_lazy(g).obs[[\"cell_type\"]].to_memory())\n", "\n", "\n", @@ -265,7 +265,7 @@ ")\n", "\n", "# Add in the shuffled data that should be used for training.\n", - "ds.use_collection(collection, load_adata=_load_adata)" + "ds.use_collection(collection, load_anndata=_load_anndata)" ] }, { @@ -367,7 +367,7 @@ " anndata_paths=[\n", " \"866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad\",\n", " ],\n", - " load_adata=read_lazy_x_and_obs_only,\n", + " load_anndata=read_lazy_x_and_obs_only,\n", ")" ] }, diff --git a/src/annbatch/io.py b/src/annbatch/io.py index 767324a6..00656b76 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -33,7 +33,7 @@ V1_ENCODING = {"encoding-type": "annbatch-preshuffled", "encoding-version": "0.1.0"} -def _default_load_adata[T: zarr.Group | h5py.Group | PathLike[str] | str](x: T) -> ad.AnnData: +def _default_load_anndata[T: zarr.Group | h5py.Group | PathLike[str] | str](x: T) -> ad.AnnData: adata = ad.experimental.read_lazy(x, load_annotation_index=False) if not isinstance(x, zarr.Group | h5py.Group): group = ( @@ -138,7 +138,7 @@ def callback( def _check_for_mismatched_keys[T: zarr.Group | h5py.Group | PathLike[str] | str]( paths_or_anndatas: Iterable[T | ad.AnnData], *, - load_adata: Callable[[T], ad.AnnData] = lambda x: ad.experimental.read_lazy(x, load_annotation_index=False), + load_anndata: Callable[[T], ad.AnnData] = lambda x: ad.experimental.read_lazy(x, load_annotation_index=False), ): num_raw_in_adata = 0 found_keys: dict[str, defaultdict[str, int]] = { @@ -148,7 +148,7 @@ def _check_for_mismatched_keys[T: zarr.Group | h5py.Group | PathLike[str] | str] } for path_or_anndata in tqdm(paths_or_anndatas, desc="checking for mismatched keys"): if not isinstance(path_or_anndata, ad.AnnData): - adata = load_adata(path_or_anndata) + adata = load_anndata(path_or_anndata) else: adata = path_or_anndata for elem_name, key_count in found_keys.items(): @@ -160,26 +160,26 @@ def _check_for_mismatched_keys[T: zarr.Group | h5py.Group | PathLike[str] | str] num_raw_in_adata += 1 if num_raw_in_adata != (num_anndatas := len(list(paths_or_anndatas))) and num_raw_in_adata != 0: warnings.warn( - f"Found raw keys not present in all anndatas {paths_or_anndatas}, consider deleting raw or moving it to a shared layer/X location via `load_adata`", + f"Found raw keys not present in all anndatas {paths_or_anndatas}, consider deleting raw or moving it to a shared layer/X location via `load_anndata`", stacklevel=2, ) for elem_name, key_count in found_keys.items(): elem_keys_mismatched = [key for key, count in key_count.items() if (count != num_anndatas and count != 0)] if len(elem_keys_mismatched) > 0: warnings.warn( - f"Found {elem_name} keys {elem_keys_mismatched} not present in all anndatas {paths_or_anndatas}, consider stopping and using the `load_adata` argument to alter {elem_name} accordingly.", + f"Found {elem_name} keys {elem_keys_mismatched} not present in all anndatas {paths_or_anndatas}, consider stopping and using the `load_anndata` argument to alter {elem_name} accordingly.", stacklevel=2, ) def _lazy_load_anndatas[T: zarr.Group | h5py.Group | PathLike[str] | str]( paths: Iterable[T], - load_adata: Callable[[T], ad.AnnData] = _default_load_adata, + load_anndata: Callable[[T], ad.AnnData] = _default_load_anndata, ): adatas = [] categoricals_in_all_adatas: dict[str, pd.Index] = {} for i, path in tqdm(enumerate(paths), desc="loading"): - adata = load_adata(path) + adata = load_anndata(path) # Track the source file for this given anndata object adata.obs["src_path"] = pd.Categorical.from_codes( np.ones((adata.shape[0],), dtype="int") * i, categories=pd.Index([str(p) for p in paths]) @@ -399,7 +399,7 @@ def add_anndatas( self, anndata_paths: Iterable[zarr.Group | h5py.Group | PathLike[str] | str], *, - load_adata: Callable[[zarr.Group | h5py.Group | PathLike[str] | str], ad.AnnData] = _default_load_adata, + load_anndata: Callable[[zarr.Group | h5py.Group | PathLike[str] | str], ad.AnnData] = _default_load_anndata, var_subset: Iterable[str] | None = None, zarr_sparse_chunk_size: int = 32768, zarr_sparse_shard_size: int = 134_217_728, @@ -427,7 +427,7 @@ def add_anndatas( ---------- anndata_paths Paths to the AnnData files used to create the zarr store. - load_adata + load_anndata Function to customize (lazy-)loading the invidiual input anndata files. By default, :func:`anndata.experimental.read_lazy` is used with categoricals/nullables read into memory and `(-1)` chunks for `obs`. If you only need a subset of the input anndata files' elems (e.g., only `X` and certain `obs` columns), you can provide a custom function here to speed up loading and harmonize your data. Beware that concatenating nullables/categoricals (i.e., what happens if `len(anndata_paths) > 1` internally in this function) from {class}`anndata.experimental.backed.Dataset2D` `obs` is very time consuming - consider loading these into memory if you use this argument. @@ -477,14 +477,14 @@ def add_anndatas( ... ] >>> DatasetCollection("path/to/output/zarr_store.zarr").add_anndatas( ... datasets, - ... load_adata=read_lazy_x_and_obs_only, + ... load_anndata=read_lazy_x_and_obs_only, ...) """ if shuffle_chunk_size > n_obs_per_dataset: raise ValueError("Cannot have a large slice size than observations per dataset") shared_kwargs = { "anndata_paths": anndata_paths, - "load_adata": load_adata, + "load_anndata": load_anndata, "zarr_sparse_chunk_size": zarr_sparse_chunk_size, "zarr_sparse_shard_size": zarr_sparse_shard_size, "zarr_dense_chunk_size": zarr_dense_chunk_size, @@ -504,7 +504,7 @@ def _create_collection( self, *, anndata_paths: Iterable[PathLike[str]] | Iterable[str], - load_adata: Callable[[PathLike[str] | str], ad.AnnData] = _default_load_adata, + load_anndata: Callable[[PathLike[str] | str], ad.AnnData] = _default_load_anndata, var_subset: Iterable[str] | None = None, zarr_sparse_chunk_size: int = 32768, zarr_sparse_shard_size: int = 134_217_728, @@ -530,7 +530,7 @@ def _create_collection( ---------- anndata_paths Paths to the AnnData files used to create the zarr store. - load_adata + load_anndata Function to customize lazy-loading the invidiual input anndata files. By default, :func:`anndata.experimental.read_lazy` is used. If you only need a subset of the input anndata files' elems (e.g., only `X` and `obs`), you can provide a custom function here to speed up loading and harmonize your data. The input to the function is a path to an anndata file, and the output is an anndata object which has `X` as a :class:`dask.array.Array`. @@ -563,8 +563,8 @@ def _create_collection( """ if not self.is_empty: raise RuntimeError("Cannot create a collection at a location that already has a shuffled collection") - _check_for_mismatched_keys(anndata_paths, load_adata=load_adata) - adata_concat = _lazy_load_anndatas(anndata_paths, load_adata=load_adata) + _check_for_mismatched_keys(anndata_paths, load_anndata=load_anndata) + adata_concat = _lazy_load_anndatas(anndata_paths, load_anndata=load_anndata) adata_concat.obs_names_make_unique() n_obs_per_dataset = min(adata_concat.shape[0], n_obs_per_dataset) chunks = _create_chunks_for_shuffling( @@ -607,7 +607,7 @@ def _add_to_collection( self, *, anndata_paths: Iterable[PathLike[str]] | Iterable[str], - load_adata: Callable[[PathLike[str] | str], ad.AnnData] = ad.read_h5ad, + load_anndata: Callable[[PathLike[str] | str], ad.AnnData] = ad.read_h5ad, zarr_sparse_chunk_size: int = 32768, zarr_sparse_shard_size: int = 134_217_728, zarr_dense_chunk_size: int = 1024, @@ -625,7 +625,7 @@ def _add_to_collection( ---------- anndata_paths Paths to the anndata files to be appended to the collection of output chunks. - load_adata + load_anndata Function to customize loading the invidiual input anndata files. By default, :func:`anndata.read_h5ad` is used. If you only need a subset of the input anndata files' elems (e.g., only `X` and `obs`), you can provide a custom function here to speed up loading and harmonize your data. The input to the function is a path to an anndata file, and the output is an anndata object. @@ -651,9 +651,9 @@ def _add_to_collection( if self.is_empty: raise ValueError("Store is empty. Please run `DatasetCollection.add_anndatas` first.") # Check for mismatched keys among the inputs. - _check_for_mismatched_keys(anndata_paths, load_adata=load_adata) + _check_for_mismatched_keys(anndata_paths, load_anndata=load_anndata) - adata_concat = _lazy_load_anndatas(anndata_paths, load_adata=load_adata) + adata_concat = _lazy_load_anndatas(anndata_paths, load_anndata=load_anndata) if math.ceil(adata_concat.shape[0] / shuffle_chunk_size) < len(self._dataset_keys): raise ValueError( f"Use a shuffle size small enough to distribute the input data with {adata_concat.shape[0]} obs across {len(self._dataset_keys)} anndata stores." diff --git a/src/annbatch/loader.py b/src/annbatch/loader.py index 2df59954..29811872 100644 --- a/src/annbatch/loader.py +++ b/src/annbatch/loader.py @@ -296,7 +296,10 @@ def batch_sampler(self) -> Sampler: return self._batch_sampler def use_collection( - self, collection: DatasetCollection, *, load_adata: Callable[[zarr.Group], ad.AnnData] = load_x_and_obs_and_var + self, + collection: DatasetCollection, + *, + load_anndata: Callable[[zarr.Group], ad.AnnData] = load_x_and_obs_and_var, ) -> Self: """Load from an existing :class:`annbatch.DatasetCollection`. @@ -306,10 +309,10 @@ def use_collection( ---------- collection The collection whose on-disk datasets should be used in this loader. - load_adata + load_anndata A custom load function - recall that whatever is found in :attr:`~anndata.AnnData.X` and :attr:`~anndata.AnnData.obs` will be yielded in batches. Default is to just load `X` and all of `obs`. - This default behavior can degrade performance if you don't need all columns in `obs` - it is recommended to use the `load_adata` argument. + This default behavior can degrade performance if you don't need all columns in `obs` - it is recommended to use the `load_anndata` argument. """ if collection.is_empty: raise ValueError("DatasetCollection is empty") @@ -317,7 +320,7 @@ def use_collection( raise RuntimeError( "You should not add multiple collections, independently shuffled - please preshuffle multiple collections, use `add_anndatas` manually if you know what you are doing, or open an issue if you believe that this should be supported at an API level higher than `add_anndatas`." ) - adatas = [load_adata(g) for g in collection] + adatas = [load_anndata(g) for g in collection] self.add_anndatas(adatas) self._collection_added = True return self diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 8a80924a..bfff82be 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -117,7 +117,7 @@ def concat(datas: list[Data | ad.AnnData]) -> ListData | list[ad.AnnData]: ).use_collection( collection, **( - {"load_adata": lambda group: open_func(group, use_zarrs=use_zarrs, use_anndata=True)} + {"load_anndata": lambda group: open_func(group, use_zarrs=use_zarrs, use_anndata=True)} if open_func is not None else {} ), @@ -519,7 +519,7 @@ def test_no_obs_no_var(simple_collection: tuple[ad.AnnData, DatasetCollection]): batch_size=20, ).use_collection( simple_collection[1], - load_adata=lambda g: ad.AnnData(X=ad.io.sparse_dataset(g["layers"]["sparse"])), + load_anndata=lambda g: ad.AnnData(X=ad.io.sparse_dataset(g["layers"]["sparse"])), ) assert next(iter(ds))["obs"] is None diff --git a/tests/test_preshuffle.py b/tests/test_preshuffle.py index 5a0fae4d..722164ee 100644 --- a/tests/test_preshuffle.py +++ b/tests/test_preshuffle.py @@ -77,7 +77,7 @@ def test_store_creation_no_warnings_with_custom_load(tmp_path: Path): zarr_dense_shard_size=10, n_obs_per_dataset=10, shuffle_chunk_size=5, - load_adata=lambda x: ad.AnnData(X=ad.io.read_elem(h5py.File(x)["X"])), + load_anndata=lambda x: ad.AnnData(X=ad.io.read_elem(h5py.File(x)["X"])), ) assert len(ad.read_zarr(next(iter(collection))).layers.keys()) == 0 @@ -109,11 +109,11 @@ def test_store_creation_path_added_to_obs(tmp_path: Path): @pytest.mark.parametrize("elem_name", ["obsm", "layers", "raw", "obs"]) -@pytest.mark.parametrize("load_adata", [ad.read_h5ad, ad.experimental.read_lazy]) +@pytest.mark.parametrize("load_anndata", [ad.read_h5ad, ad.experimental.read_lazy]) def test_store_addition_different_keys( elem_name: Literal["obsm", "layers", "raw"], tmp_path: Path, - load_adata: Callable[[PathLike[str] | str], ad.AnnData], + load_anndata: Callable[[PathLike[str] | str], ad.AnnData], ): adata_orig = ad.AnnData(X=np.random.randn(100, 20)) orig_path = tmp_path / "orig.h5ad" @@ -138,7 +138,7 @@ def test_store_addition_different_keys( with pytest.warns(UserWarning, match=rf"Found {elem_name} keys.* not present in all anndatas"): collection.add_anndatas( [additional_path], - load_adata=load_adata, + load_anndata=load_anndata, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, zarr_dense_chunk_size=5, @@ -188,18 +188,18 @@ def test_store_creation_default( @pytest.mark.parametrize("shuffle", [pytest.param(True, id="shuffle"), pytest.param(False, id="no_shuffle")]) @pytest.mark.parametrize( - "load_adata", [pytest.param(None, id="default_read"), pytest.param(ad.experimental.read_lazy, id="fully_lazy")] + "load_anndata", [pytest.param(None, id="default_read"), pytest.param(ad.experimental.read_lazy, id="fully_lazy")] ) def test_store_creation( adata_with_h5_path_different_var_space: tuple[ad.AnnData, Path], shuffle: bool, - load_adata: Callable[[str], ad.AnnData], + load_anndata: Callable[[str], ad.AnnData], ): var_subset = [f"gene_{i}" for i in range(100)] h5_files = sorted(adata_with_h5_path_different_var_space[1].iterdir()) output_path = ( adata_with_h5_path_different_var_space[1].parent - / f"zarr_store_creation_test_{shuffle}_{'default_read' if load_adata is None else 'custom_read'}.zarr" + / f"zarr_store_creation_test_{shuffle}_{'default_read' if load_anndata is None else 'custom_read'}.zarr" ) collection = DatasetCollection(output_path).add_anndatas( [adata_with_h5_path_different_var_space[1] / f for f in h5_files if str(f).endswith(".h5ad")], @@ -211,7 +211,7 @@ def test_store_creation( n_obs_per_dataset=50, shuffle_chunk_size=10, shuffle=shuffle, - **({"load_adata": load_adata} if load_adata is not None else {}), + **({"load_anndata": load_anndata} if load_anndata is not None else {}), ) assert not DatasetCollection(output_path).is_empty assert V1_ENCODING.items() <= zarr.open(output_path).attrs.items() @@ -301,7 +301,7 @@ def test_mismatched_raw_concat( n_obs_per_dataset=30, shuffle_chunk_size=10, shuffle=False, # don't shuffle -> want to check if the right attributes get taken - load_adata=_read_lazy_x_and_obs_only_from_raw, + load_anndata=_read_lazy_x_and_obs_only_from_raw, ) adatas_orig = [] @@ -324,14 +324,14 @@ def test_mismatched_raw_concat( np.testing.assert_array_equal(adata_orig.X.toarray(), adata.X.toarray()) -@pytest.mark.parametrize("load_adata", [ad.read_h5ad, ad.experimental.read_lazy]) +@pytest.mark.parametrize("load_anndata", [ad.read_h5ad, ad.experimental.read_lazy]) def test_store_extension( adata_with_h5_path_different_var_space: tuple[ad.AnnData, Path], - load_adata: Callable[[PathLike[str] | str], ad.AnnData], + load_anndata: Callable[[PathLike[str] | str], ad.AnnData], ): all_h5_paths = sorted(p for p in adata_with_h5_path_different_var_space[1].iterdir() if p.suffix == ".h5ad") store_path = ( - adata_with_h5_path_different_var_space[1].parent / f"zarr_store_extension_test_{load_adata.__name__}.zarr" + adata_with_h5_path_different_var_space[1].parent / f"zarr_store_extension_test_{load_anndata.__name__}.zarr" ) original = all_h5_paths additional = all_h5_paths[4:] # don't add everything to get a "different" var space @@ -350,7 +350,7 @@ def test_store_extension( # add h5ads to existing store collection.add_anndatas( additional, - load_adata=load_adata, + load_anndata=load_anndata, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, zarr_dense_chunk_size=5, From 50293fba1f1f9c99cd9679ceb0de9c2c7a33daf0 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Tue, 24 Feb 2026 11:33:07 +0100 Subject: [PATCH 22/30] fix mistake --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62dc1bdc..5eb4bf65 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning][]. ### Breaking - Renamed `annbatch.DatasetCollection.add_adatas` to {meth}`annbatch.DatasetCollection.add_anndatas` and renamed `adata_paths` argument to `anndata_paths` to be consistent with the rest of the API. -- In similar fashion, renamed `load_adata` argument to `load_anndata` in {meth}`annbatch.DatasetCollection.add_anndatas` and {meth}`annbatch.DatasetCollection.use_collection`. +- In similar fashion, renamed `load_adata` argument to `load_anndata` in {meth}`annbatch.DatasetCollection.add_anndatas` and {meth}`annbatch.Loader.use_collection`. ## [0.0.8] From a485103c7572fb0a079b593cdf01cc71e9dc4b77 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Wed, 25 Feb 2026 15:10:44 +0100 Subject: [PATCH 23/30] rename from anndata to adata --- README.md | 10 +++--- docs/index.md | 4 +-- src/annbatch/io.py | 76 ++++++++++++++++++++-------------------- src/annbatch/loader.py | 20 +++++------ tests/conftest.py | 2 +- tests/test_dataset.py | 8 ++--- tests/test_preshuffle.py | 48 ++++++++++++------------- 7 files changed, 84 insertions(+), 84 deletions(-) diff --git a/README.md b/README.md index c701dc87..e7052e12 100644 --- a/README.md +++ b/README.md @@ -86,8 +86,8 @@ zarr.config.set( # Create a collection at the given path. The subgroups will all be anndata stores. collection = DatasetCollection("path/to/output/collection.zarr") -collection.add_anndatas( - anndata_paths=[ +collection.add_adata( + adata_paths=[ "path/to/your/file1.h5ad", "path/to/your/file2.h5ad" ], @@ -98,7 +98,7 @@ collection.add_anndatas( Data loading: > [!IMPORTANT] -> Without custom loading via {meth}`annbatch.Loader.use_collection` or `load_anndata{s}` or `load_dataset{s}`, *all* columns of the (obs) {class}`pandas.DataFrame` will be loaded and yielded potentially degrading performance. +> Without custom loading via {meth}`annbatch.Loader.use_collection` or `load_adata{s}` or `load_dataset{s}`, *all* columns of the (obs) {class}`pandas.DataFrame` will be loaded and yielded potentially degrading performance. ```python from pathlib import Path @@ -127,9 +127,9 @@ with ad.settings.override(remove_unused_categories=False): preload_nchunks=256, ) # `use_collection` automatically uses the on-disk `X` and full `obs` in the `Loader` - # but the `load_anndata` arg can override this behavior + # but the `load_adata` arg can override this behavior # (see `custom_load_func` above for an example of customization). - ds = ds.use_collection(collection, load_anndata = custom_load_func) + ds = ds.use_collection(collection, load_adata = custom_load_func) # Iterate over dataloader (plugin replacement for torch.utils.DataLoader) for batch in ds: diff --git a/docs/index.md b/docs/index.md index d720d3d7..dd8b4a89 100644 --- a/docs/index.md +++ b/docs/index.md @@ -9,8 +9,8 @@ Let's go through the above example: ### Preprocessing ```python -colleciton = DatasetCollection("path/to/output/store.zarr").add_anndatas( - anndata_paths=[ +colleciton = DatasetCollection("path/to/output/store.zarr").add_adata( + adata_paths=[ "path/to/your/file1.h5ad", "path/to/your/file2.h5ad" ], diff --git a/src/annbatch/io.py b/src/annbatch/io.py index 00656b76..bddaa3a8 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -33,7 +33,7 @@ V1_ENCODING = {"encoding-type": "annbatch-preshuffled", "encoding-version": "0.1.0"} -def _default_load_anndata[T: zarr.Group | h5py.Group | PathLike[str] | str](x: T) -> ad.AnnData: +def _default_load_adata[T: zarr.Group | h5py.Group | PathLike[str] | str](x: T) -> ad.AnnData: adata = ad.experimental.read_lazy(x, load_annotation_index=False) if not isinstance(x, zarr.Group | h5py.Group): group = ( @@ -136,9 +136,9 @@ def callback( def _check_for_mismatched_keys[T: zarr.Group | h5py.Group | PathLike[str] | str]( - paths_or_anndatas: Iterable[T | ad.AnnData], + paths_or_adata: Iterable[T | ad.AnnData], *, - load_anndata: Callable[[T], ad.AnnData] = lambda x: ad.experimental.read_lazy(x, load_annotation_index=False), + load_adata: Callable[[T], ad.AnnData] = lambda x: ad.experimental.read_lazy(x, load_annotation_index=False), ): num_raw_in_adata = 0 found_keys: dict[str, defaultdict[str, int]] = { @@ -146,9 +146,9 @@ def _check_for_mismatched_keys[T: zarr.Group | h5py.Group | PathLike[str] | str] "obsm": defaultdict(lambda: 0), "obs": defaultdict(lambda: 0), } - for path_or_anndata in tqdm(paths_or_anndatas, desc="checking for mismatched keys"): + for path_or_anndata in tqdm(paths_or_adata, desc="Checking for mismatched keys"): if not isinstance(path_or_anndata, ad.AnnData): - adata = load_anndata(path_or_anndata) + adata = load_adata(path_or_anndata) else: adata = path_or_anndata for elem_name, key_count in found_keys.items(): @@ -158,28 +158,28 @@ def _check_for_mismatched_keys[T: zarr.Group | h5py.Group | PathLike[str] | str] key_count[key] += 1 if adata.raw is not None: num_raw_in_adata += 1 - if num_raw_in_adata != (num_anndatas := len(list(paths_or_anndatas))) and num_raw_in_adata != 0: + if num_raw_in_adata != (num_anndatas := len(list(paths_or_adata))) and num_raw_in_adata != 0: warnings.warn( - f"Found raw keys not present in all anndatas {paths_or_anndatas}, consider deleting raw or moving it to a shared layer/X location via `load_anndata`", + f"Found raw keys not present in all anndatas {paths_or_adata}, consider deleting raw or moving it to a shared layer/X location via `load_adata`", stacklevel=2, ) for elem_name, key_count in found_keys.items(): elem_keys_mismatched = [key for key, count in key_count.items() if (count != num_anndatas and count != 0)] if len(elem_keys_mismatched) > 0: warnings.warn( - f"Found {elem_name} keys {elem_keys_mismatched} not present in all anndatas {paths_or_anndatas}, consider stopping and using the `load_anndata` argument to alter {elem_name} accordingly.", + f"Found {elem_name} keys {elem_keys_mismatched} not present in all anndatas {paths_or_adata}, consider stopping and using the `load_adata` argument to alter {elem_name} accordingly.", stacklevel=2, ) -def _lazy_load_anndatas[T: zarr.Group | h5py.Group | PathLike[str] | str]( +def _lazy_load_adata[T: zarr.Group | h5py.Group | PathLike[str] | str]( paths: Iterable[T], - load_anndata: Callable[[T], ad.AnnData] = _default_load_anndata, + load_adata: Callable[[T], ad.AnnData] = _default_load_adata, ): adatas = [] categoricals_in_all_adatas: dict[str, pd.Index] = {} - for i, path in tqdm(enumerate(paths), desc="loading"): - adata = load_anndata(path) + for i, path in tqdm(enumerate(paths), total=len(paths), desc="Lazy loading adata"): + adata = load_adata(path) # Track the source file for this given anndata object adata.obs["src_path"] = pd.Categorical.from_codes( np.ones((adata.shape[0],), dtype="int") * i, categories=pd.Index([str(p) for p in paths]) @@ -395,11 +395,11 @@ def is_empty(self) -> bool: ) @_with_settings - def add_anndatas( + def add_adata( self, - anndata_paths: Iterable[zarr.Group | h5py.Group | PathLike[str] | str], + adata_paths: Iterable[zarr.Group | h5py.Group | PathLike[str] | str], *, - load_anndata: Callable[[zarr.Group | h5py.Group | PathLike[str] | str], ad.AnnData] = _default_load_anndata, + load_adata: Callable[[zarr.Group | h5py.Group | PathLike[str] | str], ad.AnnData] = _default_load_adata, var_subset: Iterable[str] | None = None, zarr_sparse_chunk_size: int = 32768, zarr_sparse_shard_size: int = 134_217_728, @@ -411,7 +411,7 @@ def add_anndatas( shuffle_chunk_size: int = 1000, shuffle: bool = True, ) -> Self: - """Take AnnData paths and create or add to an on-disk set of AnnData datasets with uniform var spaces at the desired path (with `n_obs_per_dataset` rows per dataset if running for the first time). + """Take adata paths and create or add to an on-disk set of AnnData datasets with uniform var spaces at the desired path (with `n_obs_per_dataset` rows per dataset if running for the first time). The set of AnnData datasets is collectively referred to as a "collection" where each dataset is called `dataset_i.{zarr,h5ad}`. The main purpose of this function is to create shuffled sharded zarr datasets, which is the default behavior of this function. @@ -425,12 +425,12 @@ def add_anndatas( Parameters ---------- - anndata_paths + adata_paths Paths to the AnnData files used to create the zarr store. - load_anndata + load_adata Function to customize (lazy-)loading the invidiual input anndata files. By default, :func:`anndata.experimental.read_lazy` is used with categoricals/nullables read into memory and `(-1)` chunks for `obs`. If you only need a subset of the input anndata files' elems (e.g., only `X` and certain `obs` columns), you can provide a custom function here to speed up loading and harmonize your data. - Beware that concatenating nullables/categoricals (i.e., what happens if `len(anndata_paths) > 1` internally in this function) from {class}`anndata.experimental.backed.Dataset2D` `obs` is very time consuming - consider loading these into memory if you use this argument. + Beware that concatenating nullables/categoricals (i.e., what happens if `len(adata_paths) > 1` internally in this function) from {class}`anndata.experimental.backed.Dataset2D` `obs` is very time consuming - consider loading these into memory if you use this argument. var_subset Subset of gene names to include in the store. If None, all genes are included. Genes are subset based on the `var_names` attribute of the concatenated AnnData object. @@ -475,16 +475,16 @@ def add_anndatas( ... "path/to/second_adata.h5ad", ... "path/to/third_adata.h5ad", ... ] - >>> DatasetCollection("path/to/output/zarr_store.zarr").add_anndatas( + >>> DatasetCollection("path/to/output/zarr_store.zarr").add_adata( ... datasets, - ... load_anndata=read_lazy_x_and_obs_only, + ... load_adata=read_lazy_x_and_obs_only, ...) """ if shuffle_chunk_size > n_obs_per_dataset: raise ValueError("Cannot have a large slice size than observations per dataset") shared_kwargs = { - "anndata_paths": anndata_paths, - "load_anndata": load_anndata, + "adata_paths": adata_paths, + "load_adata": load_adata, "zarr_sparse_chunk_size": zarr_sparse_chunk_size, "zarr_sparse_shard_size": zarr_sparse_shard_size, "zarr_dense_chunk_size": zarr_dense_chunk_size, @@ -503,8 +503,8 @@ def add_anndatas( def _create_collection( self, *, - anndata_paths: Iterable[PathLike[str]] | Iterable[str], - load_anndata: Callable[[PathLike[str] | str], ad.AnnData] = _default_load_anndata, + adata_paths: Iterable[PathLike[str]] | Iterable[str], + load_adata: Callable[[PathLike[str] | str], ad.AnnData] = _default_load_adata, var_subset: Iterable[str] | None = None, zarr_sparse_chunk_size: int = 32768, zarr_sparse_shard_size: int = 134_217_728, @@ -528,9 +528,9 @@ def _create_collection( Parameters ---------- - anndata_paths + adata_paths Paths to the AnnData files used to create the zarr store. - load_anndata + load_adata Function to customize lazy-loading the invidiual input anndata files. By default, :func:`anndata.experimental.read_lazy` is used. If you only need a subset of the input anndata files' elems (e.g., only `X` and `obs`), you can provide a custom function here to speed up loading and harmonize your data. The input to the function is a path to an anndata file, and the output is an anndata object which has `X` as a :class:`dask.array.Array`. @@ -563,8 +563,8 @@ def _create_collection( """ if not self.is_empty: raise RuntimeError("Cannot create a collection at a location that already has a shuffled collection") - _check_for_mismatched_keys(anndata_paths, load_anndata=load_anndata) - adata_concat = _lazy_load_anndatas(anndata_paths, load_anndata=load_anndata) + _check_for_mismatched_keys(adata_paths, load_adata=load_adata) + adata_concat = _lazy_load_adata(adata_paths, load_adata=load_adata) adata_concat.obs_names_make_unique() n_obs_per_dataset = min(adata_concat.shape[0], n_obs_per_dataset) chunks = _create_chunks_for_shuffling( @@ -573,7 +573,7 @@ def _create_collection( if var_subset is None: var_subset = adata_concat.var_names - for i, chunk in enumerate(tqdm(chunks, desc="processing chunks")): + for i, chunk in enumerate(tqdm(chunks, desc="Creating collection")): var_mask = adata_concat.var_names.isin(var_subset) # np.sort: It's more efficient to access elements sequentially from dask arrays # The data will be shuffled later on, we just want the elements at this point @@ -606,8 +606,8 @@ def _create_collection( def _add_to_collection( self, *, - anndata_paths: Iterable[PathLike[str]] | Iterable[str], - load_anndata: Callable[[PathLike[str] | str], ad.AnnData] = ad.read_h5ad, + adata_paths: Iterable[PathLike[str]] | Iterable[str], + load_adata: Callable[[PathLike[str] | str], ad.AnnData] = ad.read_h5ad, zarr_sparse_chunk_size: int = 32768, zarr_sparse_shard_size: int = 134_217_728, zarr_dense_chunk_size: int = 1024, @@ -623,9 +623,9 @@ def _add_to_collection( Parameters ---------- - anndata_paths + adata_paths Paths to the anndata files to be appended to the collection of output chunks. - load_anndata + load_adata Function to customize loading the invidiual input anndata files. By default, :func:`anndata.read_h5ad` is used. If you only need a subset of the input anndata files' elems (e.g., only `X` and `obs`), you can provide a custom function here to speed up loading and harmonize your data. The input to the function is a path to an anndata file, and the output is an anndata object. @@ -649,11 +649,11 @@ def _add_to_collection( Whether or not to shuffle when adding. Otherwise, the incoming data will just be split up and appended. """ if self.is_empty: - raise ValueError("Store is empty. Please run `DatasetCollection.add_anndatas` first.") + raise ValueError("Store is empty. Please run `DatasetCollection.add_adata` first.") # Check for mismatched keys among the inputs. - _check_for_mismatched_keys(anndata_paths, load_anndata=load_anndata) + _check_for_mismatched_keys(adata_paths, load_adata=load_adata) - adata_concat = _lazy_load_anndatas(anndata_paths, load_anndata=load_anndata) + adata_concat = _lazy_load_adata(adata_paths, load_adata=load_adata) if math.ceil(adata_concat.shape[0] / shuffle_chunk_size) < len(self._dataset_keys): raise ValueError( f"Use a shuffle size small enough to distribute the input data with {adata_concat.shape[0]} obs across {len(self._dataset_keys)} anndata stores." @@ -667,7 +667,7 @@ def _add_to_collection( adata_concat.obs_names_make_unique() for dataset, chunk in tqdm( - zip(self._dataset_keys, chunks, strict=True), total=len(self._dataset_keys), desc="processing chunks" + zip(self._dataset_keys, chunks, strict=True), total=len(self._dataset_keys), desc="Extending collection" ): adata_dataset = ad.io.read_elem(self._group[dataset]) subset_adata = _to_categorical_obs( diff --git a/src/annbatch/loader.py b/src/annbatch/loader.py index 29811872..b65b9da7 100644 --- a/src/annbatch/loader.py +++ b/src/annbatch/loader.py @@ -299,45 +299,45 @@ def use_collection( self, collection: DatasetCollection, *, - load_anndata: Callable[[zarr.Group], ad.AnnData] = load_x_and_obs_and_var, + load_adata: Callable[[zarr.Group], ad.AnnData] = load_x_and_obs_and_var, ) -> Self: """Load from an existing :class:`annbatch.DatasetCollection`. - This function can only be called once. If you want to manually add more data, use :meth:`Loader.add_anndatas` or open an issue. + This function can only be called once. If you want to manually add more data, use :meth:`Loader.add_adata` or open an issue. Parameters ---------- collection The collection whose on-disk datasets should be used in this loader. - load_anndata + load_adata A custom load function - recall that whatever is found in :attr:`~anndata.AnnData.X` and :attr:`~anndata.AnnData.obs` will be yielded in batches. Default is to just load `X` and all of `obs`. - This default behavior can degrade performance if you don't need all columns in `obs` - it is recommended to use the `load_anndata` argument. + This default behavior can degrade performance if you don't need all columns in `obs` - it is recommended to use the `load_adata` argument. """ if collection.is_empty: raise ValueError("DatasetCollection is empty") if self._collection_added: raise RuntimeError( - "You should not add multiple collections, independently shuffled - please preshuffle multiple collections, use `add_anndatas` manually if you know what you are doing, or open an issue if you believe that this should be supported at an API level higher than `add_anndatas`." + "You should not add multiple collections, independently shuffled - please preshuffle multiple collections, use `add_adata` manually if you know what you are doing, or open an issue if you believe that this should be supported at an API level higher than `add_adata`." ) - adatas = [load_anndata(g) for g in collection] - self.add_anndatas(adatas) + adatas = [load_adata(g) for g in collection] + self.add_adata(adatas) self._collection_added = True return self @validate_sampler - def add_anndatas( + def add_adata( self, adatas: list[ad.AnnData], ) -> Self: - """Append anndatas to this dataset. + """Append adata to this dataset. Parameters ---------- adatas List of :class:`anndata.AnnData` objects, with :class:`zarr.Array` or :class:`anndata.abc.CSRDataset` as the data matrix in :attr:`~anndata.AnnData.X`, and :attr:`~anndata.AnnData.obs` containing annotations to yield in a :class:`pandas.DataFrame`. """ - check_lt_1([len(adatas)], ["Number of anndatas"]) + check_lt_1([len(adatas)], ["Number of adata"]) for adata in adatas: dataset, obs, var = self._prepare_dataset_obs_and_var(adata) self._add_dataset_unchecked(dataset, obs, var) diff --git a/tests/conftest.py b/tests/conftest.py index 0c9843c2..06f2e77f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -114,7 +114,7 @@ def simple_collection( ) -> tuple[DatasetCollection, ad.AnnData]: zarr_stores = sorted(f for f in adata_with_zarr_path_same_var_space[1].iterdir() if f.is_dir()) output_path = Path(tmpdir_factory.mktemp("zarr_folder")) / "simple_fixture.zarr" - collection = DatasetCollection(output_path).add_anndatas( + collection = DatasetCollection(output_path).add_adata( zarr_stores, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, diff --git a/tests/test_dataset.py b/tests/test_dataset.py index bfff82be..6e044a2e 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -117,7 +117,7 @@ def concat(datas: list[Data | ad.AnnData]) -> ListData | list[ad.AnnData]: ).use_collection( collection, **( - {"load_anndata": lambda group: open_func(group, use_zarrs=use_zarrs, use_anndata=True)} + {"load_adata": lambda group: open_func(group, use_zarrs=use_zarrs, use_anndata=True)} if open_func is not None else {} ), @@ -519,7 +519,7 @@ def test_no_obs_no_var(simple_collection: tuple[ad.AnnData, DatasetCollection]): batch_size=20, ).use_collection( simple_collection[1], - load_anndata=lambda g: ad.AnnData(X=ad.io.sparse_dataset(g["layers"]["sparse"])), + load_adata=lambda g: ad.AnnData(X=ad.io.sparse_dataset(g["layers"]["sparse"])), ) assert next(iter(ds))["obs"] is None @@ -558,10 +558,10 @@ def test_mismatched_var_raises_error(tmp_path: Path, subtests): with pytest.raises(ValueError, match="All datasets must have identical var DataFrames"): loader.add_anndata(adata2_on_disk) - with subtests.test(msg="add_anndatas"): + with subtests.test(msg="add_adata"): loader = Loader(chunk_size=10, preload_nchunks=4, batch_size=20) with pytest.raises(ValueError, match="All datasets must have identical var DataFrames"): - loader.add_anndatas([adata1_on_disk, adata2_on_disk]) + loader.add_adata([adata1_on_disk, adata2_on_disk]) with subtests.test(msg="add_dataset"): loader = Loader(chunk_size=10, preload_nchunks=4, batch_size=20) diff --git a/tests/test_preshuffle.py b/tests/test_preshuffle.py index 722164ee..ada5072f 100644 --- a/tests/test_preshuffle.py +++ b/tests/test_preshuffle.py @@ -51,7 +51,7 @@ def test_store_creation_warnings_with_different_keys(elem_name: Literal["obsm", adata_1.write_h5ad(path_1) adata_2.write_h5ad(path_2) with pytest.warns(UserWarning, match=rf"Found {elem_name} keys.* not present in all anndatas"): - DatasetCollection(tmp_path / "collection.zarr").add_anndatas( + DatasetCollection(tmp_path / "collection.zarr").add_adata( [path_1, path_2], zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -69,7 +69,7 @@ def test_store_creation_no_warnings_with_custom_load(tmp_path: Path): path_2 = tmp_path / "with_extra_key.h5ad" adata_1.write_h5ad(path_1) adata_2.write_h5ad(path_2) - collection = DatasetCollection(tmp_path / "collection.zarr").add_anndatas( + collection = DatasetCollection(tmp_path / "collection.zarr").add_adata( [path_1, path_2], zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -77,7 +77,7 @@ def test_store_creation_no_warnings_with_custom_load(tmp_path: Path): zarr_dense_shard_size=10, n_obs_per_dataset=10, shuffle_chunk_size=5, - load_anndata=lambda x: ad.AnnData(X=ad.io.read_elem(h5py.File(x)["X"])), + load_adata=lambda x: ad.AnnData(X=ad.io.read_elem(h5py.File(x)["X"])), ) assert len(ad.read_zarr(next(iter(collection))).layers.keys()) == 0 @@ -91,7 +91,7 @@ def test_store_creation_path_added_to_obs(tmp_path: Path): adata_2.write_h5ad(path_2) paths = [path_1, path_2] output_dir = tmp_path / "path_src_collection.zarr" - collection = DatasetCollection(output_dir).add_anndatas( + collection = DatasetCollection(output_dir).add_adata( paths, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -109,18 +109,18 @@ def test_store_creation_path_added_to_obs(tmp_path: Path): @pytest.mark.parametrize("elem_name", ["obsm", "layers", "raw", "obs"]) -@pytest.mark.parametrize("load_anndata", [ad.read_h5ad, ad.experimental.read_lazy]) +@pytest.mark.parametrize("load_adata", [ad.read_h5ad, ad.experimental.read_lazy]) def test_store_addition_different_keys( elem_name: Literal["obsm", "layers", "raw"], tmp_path: Path, - load_anndata: Callable[[PathLike[str] | str], ad.AnnData], + load_adata: Callable[[PathLike[str] | str], ad.AnnData], ): adata_orig = ad.AnnData(X=np.random.randn(100, 20)) orig_path = tmp_path / "orig.h5ad" adata_orig.write_h5ad(orig_path) output_path = tmp_path / "zarr_store_addition_different_keys.zarr" collection = DatasetCollection(output_path) - collection.add_anndatas( + collection.add_adata( [orig_path], zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -136,9 +136,9 @@ def test_store_addition_different_keys( additional_path = tmp_path / "with_extra_key.h5ad" adata.write_h5ad(additional_path) with pytest.warns(UserWarning, match=rf"Found {elem_name} keys.* not present in all anndatas"): - collection.add_anndatas( + collection.add_adata( [additional_path], - load_anndata=load_anndata, + load_adata=load_adata, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, zarr_dense_chunk_size=5, @@ -169,7 +169,7 @@ def test_store_creation_default( else r"Loading h5ad is currently not supported", ): kwargs = {} if is_zarr else {"is_collection_h5ad": True} - collection = DatasetCollection(output_path, **kwargs).add_anndatas( + collection = DatasetCollection(output_path, **kwargs).add_adata( [adata_with_h5_path_different_var_space[1] / f for f in h5_files if str(f).endswith(".h5ad")] ) assert isinstance( @@ -188,20 +188,20 @@ def test_store_creation_default( @pytest.mark.parametrize("shuffle", [pytest.param(True, id="shuffle"), pytest.param(False, id="no_shuffle")]) @pytest.mark.parametrize( - "load_anndata", [pytest.param(None, id="default_read"), pytest.param(ad.experimental.read_lazy, id="fully_lazy")] + "load_adata", [pytest.param(None, id="default_read"), pytest.param(ad.experimental.read_lazy, id="fully_lazy")] ) def test_store_creation( adata_with_h5_path_different_var_space: tuple[ad.AnnData, Path], shuffle: bool, - load_anndata: Callable[[str], ad.AnnData], + load_adata: Callable[[str], ad.AnnData], ): var_subset = [f"gene_{i}" for i in range(100)] h5_files = sorted(adata_with_h5_path_different_var_space[1].iterdir()) output_path = ( adata_with_h5_path_different_var_space[1].parent - / f"zarr_store_creation_test_{shuffle}_{'default_read' if load_anndata is None else 'custom_read'}.zarr" + / f"zarr_store_creation_test_{shuffle}_{'default_read' if load_adata is None else 'custom_read'}.zarr" ) - collection = DatasetCollection(output_path).add_anndatas( + collection = DatasetCollection(output_path).add_adata( [adata_with_h5_path_different_var_space[1] / f for f in h5_files if str(f).endswith(".h5ad")], var_subset=var_subset, zarr_sparse_chunk_size=10, @@ -211,7 +211,7 @@ def test_store_creation( n_obs_per_dataset=50, shuffle_chunk_size=10, shuffle=shuffle, - **({"load_anndata": load_anndata} if load_anndata is not None else {}), + **({"load_adata": load_adata} if load_adata is not None else {}), ) assert not DatasetCollection(output_path).is_empty assert V1_ENCODING.items() <= zarr.open(output_path).attrs.items() @@ -292,7 +292,7 @@ def test_mismatched_raw_concat( h5_files = sorted(adata_with_h5_path_different_var_space[1].iterdir()) output_path = adata_with_h5_path_different_var_space[1].parent / "zarr_store_creation_test_heterogeneous.zarr" h5_paths = [adata_with_h5_path_different_var_space[1] / f for f in h5_files if str(f).endswith(".h5ad")] - collection = DatasetCollection(output_path).add_anndatas( + collection = DatasetCollection(output_path).add_adata( h5_paths, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -301,7 +301,7 @@ def test_mismatched_raw_concat( n_obs_per_dataset=30, shuffle_chunk_size=10, shuffle=False, # don't shuffle -> want to check if the right attributes get taken - load_anndata=_read_lazy_x_and_obs_only_from_raw, + load_adata=_read_lazy_x_and_obs_only_from_raw, ) adatas_orig = [] @@ -324,20 +324,20 @@ def test_mismatched_raw_concat( np.testing.assert_array_equal(adata_orig.X.toarray(), adata.X.toarray()) -@pytest.mark.parametrize("load_anndata", [ad.read_h5ad, ad.experimental.read_lazy]) +@pytest.mark.parametrize("load_adata", [ad.read_h5ad, ad.experimental.read_lazy]) def test_store_extension( adata_with_h5_path_different_var_space: tuple[ad.AnnData, Path], - load_anndata: Callable[[PathLike[str] | str], ad.AnnData], + load_adata: Callable[[PathLike[str] | str], ad.AnnData], ): all_h5_paths = sorted(p for p in adata_with_h5_path_different_var_space[1].iterdir() if p.suffix == ".h5ad") store_path = ( - adata_with_h5_path_different_var_space[1].parent / f"zarr_store_extension_test_{load_anndata.__name__}.zarr" + adata_with_h5_path_different_var_space[1].parent / f"zarr_store_extension_test_{load_adata.__name__}.zarr" ) original = all_h5_paths additional = all_h5_paths[4:] # don't add everything to get a "different" var space # create new store collection = DatasetCollection(store_path) - collection.add_anndatas( + collection.add_adata( original, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -348,9 +348,9 @@ def test_store_extension( shuffle=True, ) # add h5ads to existing store - collection.add_anndatas( + collection.add_adata( additional, - load_anndata=load_anndata, + load_adata=load_adata, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, zarr_dense_chunk_size=5, @@ -379,5 +379,5 @@ def test_empty(tmp_path: Path): assert collection.is_empty # Doesn't matter what errors as long as this function runs, but not to completion with pytest.raises(TypeError): - collection.add_anndatas() + collection.add_adata() assert not (V1_ENCODING.items() <= g.attrs.items()) From b1af3b64c0cd7dcbd85976fb5ac36bc685b4e5f1 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Wed, 25 Feb 2026 15:11:34 +0100 Subject: [PATCH 24/30] update changelog --- CHANGELOG.md | 7 +++++-- src/annbatch/io.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5eb4bf65..d04722c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,8 +11,11 @@ and this project adheres to [Semantic Versioning][]. ## [0.0.9] ### Breaking -- Renamed `annbatch.DatasetCollection.add_adatas` to {meth}`annbatch.DatasetCollection.add_anndatas` and renamed `adata_paths` argument to `anndata_paths` to be consistent with the rest of the API. -- In similar fashion, renamed `load_adata` argument to `load_anndata` in {meth}`annbatch.DatasetCollection.add_anndatas` and {meth}`annbatch.Loader.use_collection`. +- Renamed `annbatch.DatasetCollection.add_adatas` to {meth}`annbatch.DatasetCollection.add_adata`. +- Renamed `annbatch.Loader.add_anndatas` to {meth}`annbatch.Loader.add_adata`. + +### Fixed +- Formatted progress bar descriptions to be more readable. ## [0.0.8] diff --git a/src/annbatch/io.py b/src/annbatch/io.py index bddaa3a8..fb670b24 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -411,7 +411,7 @@ def add_adata( shuffle_chunk_size: int = 1000, shuffle: bool = True, ) -> Self: - """Take adata paths and create or add to an on-disk set of AnnData datasets with uniform var spaces at the desired path (with `n_obs_per_dataset` rows per dataset if running for the first time). + """Take AnnData paths and create or add to an on-disk set of AnnData datasets with uniform var spaces at the desired path (with `n_obs_per_dataset` rows per dataset if running for the first time). The set of AnnData datasets is collectively referred to as a "collection" where each dataset is called `dataset_i.{zarr,h5ad}`. The main purpose of this function is to create shuffled sharded zarr datasets, which is the default behavior of this function. From 8716f947d149d743c42c1356f351b27504ee8b3f Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Thu, 26 Feb 2026 13:41:32 +0100 Subject: [PATCH 25/30] Apply suggestions from code review --- docs/notebooks/example.ipynb | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/notebooks/example.ipynb b/docs/notebooks/example.ipynb index 52da2652..2141539d 100644 --- a/docs/notebooks/example.ipynb +++ b/docs/notebooks/example.ipynb @@ -118,7 +118,7 @@ "metadata": {}, "source": [ "The conversion code will take care of the following things:\n", - "* Align (outer join) the gene spaces across all datasets listed in `anndata_paths`\n", + "* Align (outer join) the gene spaces across all datasets listed in `adata_paths`\n", " * The gene spaces are outer-joined based on the gene names provided in the `var_names` field of the individual `AnnData` objects.\n", " * If you want to subset to specific gene space, you can provide a list of gene names via the `var_subset` parameter.\n", "* Shuffle the cells across all datasets (this works on larger than memory datasets as well).\n", @@ -178,7 +178,7 @@ "\n", "\n", "# For CELLxGENE data, the raw counts can either be found under .raw.X or under .X (if .raw is not supplied).\n", - "# To have a store that only contains raw counts, we can write the following load_anndata function\n", + "# To have a store that only contains raw counts, we can write the following `load_adata` function\n", "def read_lazy_x_and_obs_only(path) -> ad.AnnData:\n", " \"\"\"Custom load function to only load raw counts from CxG data.\"\"\"\n", " # IMPORTANT: Large data should always be loaded lazily to reduce the memory footprint\n", @@ -198,14 +198,14 @@ "\n", "\n", "collection = DatasetCollection(zarr.open(\"annbatch_collection\", mode=\"w\"))\n", - "collection.add_anndatas(\n", + "collection.add_adatas(\n", " # List all the h5ad files you want to include in the collection\n", - " anndata_paths=[\"866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad\", \"f81463b8-4986-4904-a0ea-20ff02cbb317.h5ad\"],\n", + " adata_paths=[\"866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad\", \"f81463b8-4986-4904-a0ea-20ff02cbb317.h5ad\"],\n", " # Path to store the output collection\n", " shuffle=True, # Whether to pre-shuffle the cells of the collection\n", " n_obs_per_dataset=2_097_152, # Number of cells per dataset shard, this number is much higher than available in these datasets but is generally a good target\n", " var_subset=None, # Optionally subset the collection to a specific gene space\n", - " load_anndata=read_lazy_x_and_obs_only,\n", + " load_adata=read_lazy_x_and_obs_only,\n", ")" ] }, @@ -251,7 +251,7 @@ "from annbatch import Loader\n", "\n", "\n", - "def _load_anndata(g: zarr.Group) -> ad.AnnData:\n", + "def _load_adata(g: zarr.Group) -> ad.AnnData:\n", " return ad.AnnData(X=ad.io.sparse_dataset(g[\"X\"]), obs=ad.experimental.read_lazy(g).obs[[\"cell_type\"]].to_memory())\n", "\n", "\n", @@ -363,11 +363,11 @@ } ], "source": [ - "collection.add_anndatas(\n", - " anndata_paths=[\n", + "collection.add_adatas(\n", + " adata_paths=[\n", " \"866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad\",\n", " ],\n", - " load_anndata=read_lazy_x_and_obs_only,\n", + " load_adata=read_lazy_x_and_obs_only,\n", ")" ] }, From c885049ed6e8a42c879c1915c503a043e1ab5c76 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Thu, 26 Feb 2026 13:43:48 +0100 Subject: [PATCH 26/30] Apply suggestion from @ilan-gold --- docs/notebooks/example.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/notebooks/example.ipynb b/docs/notebooks/example.ipynb index 2141539d..7bf3c822 100644 --- a/docs/notebooks/example.ipynb +++ b/docs/notebooks/example.ipynb @@ -265,7 +265,7 @@ ")\n", "\n", "# Add in the shuffled data that should be used for training.\n", - "ds.use_collection(collection, load_anndata=_load_anndata)" + "ds.use_collection(collection, load_adata=_load_adata)" ] }, { From 4305a8b82f8b29bffb5cfeb5ffccd8289bd2534e Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Tue, 3 Mar 2026 11:13:53 +0100 Subject: [PATCH 27/30] fix after merge conflict --- CHANGELOG.md | 5 ++--- src/annbatch/loader.py | 18 +++++++++--------- tests/test_dataset.py | 12 ++++++------ 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 00c1937c..831e622b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,12 +11,11 @@ and this project adheres to [Semantic Versioning][]. ## [0.0.9] ### Breaking -- Renamed `annbatch.DatasetCollection.add_adatas` to {meth}`annbatch.DatasetCollection.add_adata`. -- Renamed `annbatch.Loader.add_anndatas` to {meth}`annbatch.Loader.add_adata`. +- Renamed `annbatch.Loader.add_anndatas` to {meth}`annbatch.Loader.add_adatas`. +- Renamed `annbatch.Loader.add_anndata` to {meth}`annbatch.Loader.add_adata`. ### Fixed - Formatted progress bar descriptions to be more readable. -======= - {class}`annbatch.DatasetCollection` now accepts a `rng` argument to the {meth}`annbatch.DatasetCollection.add_adatas` method. diff --git a/src/annbatch/loader.py b/src/annbatch/loader.py index b65b9da7..0e969c74 100644 --- a/src/annbatch/loader.py +++ b/src/annbatch/loader.py @@ -131,7 +131,7 @@ class Loader[ batch_size=4096, chunk_size=32, preload_nchunks=512, - ).add_anndata(my_anndata) + ).add_adata(my_anndata) >>> for batch in ds: # optionally convert to dense # batch = batch.to_dense() @@ -303,7 +303,7 @@ def use_collection( ) -> Self: """Load from an existing :class:`annbatch.DatasetCollection`. - This function can only be called once. If you want to manually add more data, use :meth:`Loader.add_adata` or open an issue. + This function can only be called once. If you want to manually add more data, use :meth:`Loader.add_adatas` or open an issue. Parameters ---------- @@ -318,33 +318,33 @@ def use_collection( raise ValueError("DatasetCollection is empty") if self._collection_added: raise RuntimeError( - "You should not add multiple collections, independently shuffled - please preshuffle multiple collections, use `add_adata` manually if you know what you are doing, or open an issue if you believe that this should be supported at an API level higher than `add_adata`." + "You should not add multiple collections, independently shuffled - please preshuffle multiple collections, use `add_adatas` manually if you know what you are doing, or open an issue if you believe that this should be supported at an API level higher than `add_adatas`." ) adatas = [load_adata(g) for g in collection] - self.add_adata(adatas) + self.add_adatas(adatas) self._collection_added = True return self @validate_sampler - def add_adata( + def add_adatas( self, adatas: list[ad.AnnData], ) -> Self: - """Append adata to this dataset. + """Append adatas to this dataset. Parameters ---------- adatas List of :class:`anndata.AnnData` objects, with :class:`zarr.Array` or :class:`anndata.abc.CSRDataset` as the data matrix in :attr:`~anndata.AnnData.X`, and :attr:`~anndata.AnnData.obs` containing annotations to yield in a :class:`pandas.DataFrame`. """ - check_lt_1([len(adatas)], ["Number of adata"]) + check_lt_1([len(adatas)], ["Number of adatas"]) for adata in adatas: dataset, obs, var = self._prepare_dataset_obs_and_var(adata) self._add_dataset_unchecked(dataset, obs, var) return self - def add_anndata(self, adata: ad.AnnData) -> Self: - """Append an anndata to this dataset. + def add_adata(self, adata: ad.AnnData) -> Self: + """Append an adata to this dataset. Parameters ---------- diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 6e044a2e..18684717 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -552,16 +552,16 @@ def test_mismatched_var_raises_error(tmp_path: Path, subtests): var=adata2.var, ) - with subtests.test(msg="add_anndata"): + with subtests.test(msg="add_adata"): loader = Loader(chunk_size=10, preload_nchunks=4, batch_size=20) - loader.add_anndata(adata1_on_disk) + loader.add_adata(adata1_on_disk) with pytest.raises(ValueError, match="All datasets must have identical var DataFrames"): - loader.add_anndata(adata2_on_disk) + loader.add_adata(adata2_on_disk) - with subtests.test(msg="add_adata"): + with subtests.test(msg="add_adatas"): loader = Loader(chunk_size=10, preload_nchunks=4, batch_size=20) with pytest.raises(ValueError, match="All datasets must have identical var DataFrames"): - loader.add_adata([adata1_on_disk, adata2_on_disk]) + loader.add_adatas([adata1_on_disk, adata2_on_disk]) with subtests.test(msg="add_dataset"): loader = Loader(chunk_size=10, preload_nchunks=4, batch_size=20) @@ -585,7 +585,7 @@ def test_preload_dtype(tmp_path: Path, dtype_in: np.dtype, expected: np.dtype): z = zarr.open(tmp_path / "foo.zarr") write_sharded(z, ad.AnnData(X=sp.random(100, 10, dtype=dtype_in, format="csr", rng=np.random.default_rng()))) adata = ad.AnnData(X=ad.io.sparse_dataset(z["X"])) - loader = Loader(preload_to_gpu=True, batch_size=10, chunk_size=10, preload_nchunks=2, to_torch=False).add_anndata( + loader = Loader(preload_to_gpu=True, batch_size=10, chunk_size=10, preload_nchunks=2, to_torch=False).add_adata( adata ) assert next(iter(loader))["X"].dtype == expected From 33061e6a217f2d0d06f83cef90d31b691ce6b7e4 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Tue, 3 Mar 2026 11:20:38 +0100 Subject: [PATCH 28/30] undo dataset collection changes --- src/annbatch/io.py | 4 ++-- tests/test_preshuffle.py | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/annbatch/io.py b/src/annbatch/io.py index e935e6f8..f158fe70 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -395,7 +395,7 @@ def is_empty(self) -> bool: ) @_with_settings - def add_adata( + def add_adatas( self, adata_paths: Iterable[zarr.Group | h5py.Group | PathLike[str] | str], *, @@ -478,7 +478,7 @@ def add_adata( ... "path/to/second_adata.h5ad", ... "path/to/third_adata.h5ad", ... ] - >>> DatasetCollection("path/to/output/zarr_store.zarr").add_adata( + >>> DatasetCollection("path/to/output/zarr_store.zarr").add_adatas( ... datasets, ... load_adata=read_lazy_x_and_obs_only, ...) diff --git a/tests/test_preshuffle.py b/tests/test_preshuffle.py index 8066f425..da06723d 100644 --- a/tests/test_preshuffle.py +++ b/tests/test_preshuffle.py @@ -51,7 +51,7 @@ def test_store_creation_warnings_with_different_keys(elem_name: Literal["obsm", adata_1.write_h5ad(path_1) adata_2.write_h5ad(path_2) with pytest.warns(UserWarning, match=rf"Found {elem_name} keys.* not present in all anndatas"): - DatasetCollection(tmp_path / "collection.zarr").add_adata( + DatasetCollection(tmp_path / "collection.zarr").add_adatas( [path_1, path_2], zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -69,7 +69,7 @@ def test_store_creation_no_warnings_with_custom_load(tmp_path: Path): path_2 = tmp_path / "with_extra_key.h5ad" adata_1.write_h5ad(path_1) adata_2.write_h5ad(path_2) - collection = DatasetCollection(tmp_path / "collection.zarr").add_adata( + collection = DatasetCollection(tmp_path / "collection.zarr").add_adatas( [path_1, path_2], zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -91,7 +91,7 @@ def test_store_creation_path_added_to_obs(tmp_path: Path): adata_2.write_h5ad(path_2) paths = [path_1, path_2] output_dir = tmp_path / "path_src_collection.zarr" - collection = DatasetCollection(output_dir).add_adata( + collection = DatasetCollection(output_dir).add_adatas( paths, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -120,7 +120,7 @@ def test_store_addition_different_keys( adata_orig.write_h5ad(orig_path) output_path = tmp_path / "zarr_store_addition_different_keys.zarr" collection = DatasetCollection(output_path) - collection.add_adata( + collection.add_adatas( [orig_path], zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -136,7 +136,7 @@ def test_store_addition_different_keys( additional_path = tmp_path / "with_extra_key.h5ad" adata.write_h5ad(additional_path) with pytest.warns(UserWarning, match=rf"Found {elem_name} keys.* not present in all anndatas"): - collection.add_adata( + collection.add_adatas( [additional_path], load_adata=load_adata, zarr_sparse_chunk_size=10, @@ -169,7 +169,7 @@ def test_store_creation_default( else r"Loading h5ad is currently not supported", ): kwargs = {} if is_zarr else {"is_collection_h5ad": True} - collection = DatasetCollection(output_path, **kwargs).add_adata( + collection = DatasetCollection(output_path, **kwargs).add_adatas( [adata_with_h5_path_different_var_space[1] / f for f in h5_files if str(f).endswith(".h5ad")] ) assert isinstance( @@ -201,7 +201,7 @@ def test_store_creation( adata_with_h5_path_different_var_space[1].parent / f"zarr_store_creation_test_{shuffle}_{'default_read' if load_adata is None else 'custom_read'}.zarr" ) - collection = DatasetCollection(output_path).add_adata( + collection = DatasetCollection(output_path).add_adatas( [adata_with_h5_path_different_var_space[1] / f for f in h5_files if str(f).endswith(".h5ad")], var_subset=var_subset, zarr_sparse_chunk_size=10, @@ -292,7 +292,7 @@ def test_mismatched_raw_concat( h5_files = sorted(adata_with_h5_path_different_var_space[1].iterdir()) output_path = adata_with_h5_path_different_var_space[1].parent / "zarr_store_creation_test_heterogeneous.zarr" h5_paths = [adata_with_h5_path_different_var_space[1] / f for f in h5_files if str(f).endswith(".h5ad")] - collection = DatasetCollection(output_path).add_adata( + collection = DatasetCollection(output_path).add_adatas( h5_paths, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -337,7 +337,7 @@ def test_store_extension( additional = all_h5_paths[4:] # don't add everything to get a "different" var space # create new store collection = DatasetCollection(store_path) - collection.add_adata( + collection.add_adatas( original, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, @@ -348,7 +348,7 @@ def test_store_extension( shuffle=True, ) # add h5ads to existing store - collection.add_adata( + collection.add_adatas( additional, load_adata=load_adata, zarr_sparse_chunk_size=10, @@ -379,7 +379,7 @@ def test_empty(tmp_path: Path): assert collection.is_empty # Doesn't matter what errors as long as this function runs, but not to completion with pytest.raises(TypeError): - collection.add_adata() + collection.add_adatas() assert not (V1_ENCODING.items() <= g.attrs.items()) From 9da30cac44e1c8b79a02ca7e10e8da95eceacb7d Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Tue, 3 Mar 2026 11:22:56 +0100 Subject: [PATCH 29/30] conftest --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 06f2e77f..45be4996 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -114,7 +114,7 @@ def simple_collection( ) -> tuple[DatasetCollection, ad.AnnData]: zarr_stores = sorted(f for f in adata_with_zarr_path_same_var_space[1].iterdir() if f.is_dir()) output_path = Path(tmpdir_factory.mktemp("zarr_folder")) / "simple_fixture.zarr" - collection = DatasetCollection(output_path).add_adata( + collection = DatasetCollection(output_path).add_adatas( zarr_stores, zarr_sparse_chunk_size=10, zarr_sparse_shard_size=20, From 5098d5ca093c2d1f12d7baef80348529781e90cc Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Tue, 3 Mar 2026 12:07:40 +0100 Subject: [PATCH 30/30] Update src/annbatch/io.py --- src/annbatch/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/annbatch/io.py b/src/annbatch/io.py index f158fe70..d5d131f6 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -665,7 +665,7 @@ def _add_to_collection( Whether or not to shuffle when adding. Otherwise, the incoming data will just be split up and appended. """ if self.is_empty: - raise ValueError("Store is empty. Please run `DatasetCollection.add_adata` first.") + raise ValueError("Store is empty. Please run `DatasetCollection.add_adatas` first.") # Check for mismatched keys among the inputs. _check_for_mismatched_keys(adata_paths, load_adata=load_adata)