diff --git a/.github/workflows/test.yaml b/.github/workflows/test-cpu.yaml similarity index 98% rename from .github/workflows/test.yaml rename to .github/workflows/test-cpu.yaml index eda403b..46b8901 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test-cpu.yaml @@ -26,7 +26,7 @@ jobs: outputs: envs: ${{ steps.get-envs.outputs.envs }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: filter: blob:none fetch-depth: 0 diff --git a/.github/workflows/test-gpu.yaml b/.github/workflows/test-gpu.yaml index a7d7b3c..68319e7 100644 --- a/.github/workflows/test-gpu.yaml +++ b/.github/workflows/test-gpu.yaml @@ -42,7 +42,7 @@ jobs: - name: Install Python uses: actions/setup-python@v5 with: - python-version: "3.13" + python-version: "3.12" - name: Install uv uses: astral-sh/setup-uv@v7 @@ -50,7 +50,7 @@ jobs: cache-dependency-glob: pyproject.toml - name: Install fknni - run: uv pip install --system -e ".[test]" + run: uv pip install --system -e ".[test,faissgpu]" - name: Pip list run: pip list diff --git a/pyproject.toml b/pyproject.toml index 419a0b5..6f11b7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,6 @@ classifiers = [ "Programming Language :: Python :: 3.13", ] dependencies = [ - "faiss-cpu", "lamin-utils", "pandas", "scikit-learn", @@ -45,6 +44,8 @@ optional-dependencies.doc = [ "sphinxcontrib-bibtex>=1", "sphinxext-opengraph", ] +optional-dependencies.faisscpu = [ "faiss-cpu" ] +optional-dependencies.faissgpu = [ "faiss-gpu-cu12" ] optional-dependencies.rapids12 = [ "cudf-cu12>=25.10", "cugraph-cu12>=25.10", @@ -88,7 +89,7 @@ deps = [ "pre" ] python = [ "3.13" ] [tool.hatch.envs.hatch-test] -features = [ "dev", "test" ] +features = [ "dev", "test", "faisscpu" ] [tool.hatch.envs.hatch-test.overrides] # If the matrix variable `deps` is set to "pre", @@ -142,6 +143,7 @@ testpaths = [ "tests" ] xfail_strict = true addopts = [ "--import-mode=importlib", # allow using test files with same name + "-m not gpu", ] 
markers = [ "gpu: mark test to run on GPU", diff --git a/src/fknni/faiss/faiss.py b/src/fknni/faiss/faiss.py index 6d97a73..1df044e 100644 --- a/src/fknni/faiss/faiss.py +++ b/src/fknni/faiss/faiss.py @@ -9,6 +9,13 @@ from numpy import dtype from sklearn.base import BaseEstimator, TransformerMixin +try: + import faiss + + HAS_FAISS_GPU = hasattr(faiss, "StandardGpuResources") +except ImportError: + raise ImportError("faiss-cpu or faiss-gpu required") from None + class FaissImputer(BaseEstimator, TransformerMixin): """Imputer for completing missing values using Faiss, incorporating weighted averages based on distance.""" @@ -23,6 +30,7 @@ def __init__( index_factory: str = "Flat", min_data_ratio: float = 0.25, temporal_mode: Literal["flatten", "per_variable"] = "flatten", + use_gpu: bool = False, ): """Initializes FaissImputer with specified parameters that are used for the imputation. @@ -39,6 +47,7 @@ def __init__( temporal_mode: How to handle 3D temporal data. 'flatten' treats all (variable, timestep) pairs as independent features (fast but allows temporal leakage). 'per_variable' imputes each variable independently across time (slower but respects temporal causality). + use_gpu: Whether to train using GPU. """ if n_neighbors < 1: raise ValueError("n_neighbors must be at least 1.") @@ -47,6 +56,10 @@ def __init__( if temporal_mode not in {"flatten", "per_variable"}: raise ValueError("Unknown temporal_mode. 
Choose one of 'flatten', 'per_variable'") + self.use_gpu = use_gpu + if use_gpu and not HAS_FAISS_GPU: + raise ValueError("use_gpu=True requires faiss-gpu package, install with: pip install faiss-gpu-cu12") from None + self.missing_values = missing_values self.n_neighbors = n_neighbors self.metric = metric @@ -236,6 +249,11 @@ def _features_indices_sorted_descending_on_nan(self) -> list[int]: def _train(self, x_train: np.ndarray) -> faiss.Index: index = faiss.index_factory(x_train.shape[1], self.index_factory) index.metric_type = faiss.METRIC_L2 if self.metric == "l2" else faiss.METRIC_INNER_PRODUCT + + if self.use_gpu: + res = faiss.StandardGpuResources() + index = faiss.index_cpu_to_gpu(res, 0, index) + index.train(x_train) index.add(x_train) return index diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/compare_predictions.py b/tests/compare_predictions.py new file mode 100644 index 0000000..397550d --- /dev/null +++ b/tests/compare_predictions.py @@ -0,0 +1,46 @@ +import numpy as np + + +def _are_ndarrays_equal(arr1: np.ndarray, arr2: np.ndarray) -> np.bool_: + """Check if two arrays are equal member-wise. + + Note: Two NaN are considered equal. + + Args: + arr1: First array to compare + arr2: Second array to compare + + Returns: + True if the two arrays are equal member-wise + """ + return np.all(np.equal(arr1, arr2, dtype=object) | ((arr1 != arr1) & (arr2 != arr2))) + + +def _base_check_imputation( + data_original: np.ndarray, + data_imputed: np.ndarray, +): + """Provides the following base checks: + - Imputation doesn't leave any NaN behind + - Imputation doesn't modify any data that wasn't NaN + + Args: + data_original: Dataset before imputation + data_imputed: Dataset after imputation + + Raises: + AssertionError: If any of the checks fail. 
+ """ + if data_original.shape != data_imputed.shape: + raise AssertionError("The shapes of the two datasets do not match") + + # Ensure no NaN remains in the imputed dataset + if np.isnan(data_imputed).any(): + raise AssertionError("NaN found in the imputed dataset.") + + # Ensure imputation does not alter non-NaN values in the imputed columns + imputed_non_nan_mask = ~np.isnan(data_original) + if not _are_ndarrays_equal(data_original[imputed_non_nan_mask], data_imputed[imputed_non_nan_mask]): + raise AssertionError("Non-NaN values in imputed columns were modified.") + + return diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..fe25307 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,19 @@ +import numpy as np +import pandas as pd +import pytest + + +@pytest.fixture +def rng(): + return np.random.default_rng(0) + + +@pytest.fixture +def simple_test_df(rng): + data = pd.DataFrame(rng.integers(0, 100, size=(10, 5)), columns=list("ABCDE")) + data_missing = data.copy() + indices = [(i, j) for i in range(data.shape[0]) for j in range(data.shape[1])] + rng.shuffle(indices) + for i, j in indices[:5]: + data_missing.iat[i, j] = np.nan + return data.to_numpy(), data_missing.to_numpy() diff --git a/tests/cpu/conftest.py b/tests/cpu/conftest.py deleted file mode 100644 index 0e05b5a..0000000 --- a/tests/cpu/conftest.py +++ /dev/null @@ -1,7 +0,0 @@ -import numpy as np -import pytest - - -@pytest.fixture -def rng(): - return np.random.default_rng(0) diff --git a/tests/cpu/test_faiss_imputation.py b/tests/cpu/test_faiss_imputation.py index 4040e63..2c5c668 100644 --- a/tests/cpu/test_faiss_imputation.py +++ b/tests/cpu/test_faiss_imputation.py @@ -1,22 +1,11 @@ import numpy as np -import pandas as pd import pytest from sklearn.datasets import make_regression +from tests.compare_predictions import _base_check_imputation from fknni.faiss.faiss import FaissImputer -@pytest.fixture -def simple_test_df(rng): - data = 
pd.DataFrame(rng.integers(0, 100, size=(10, 5)), columns=list("ABCDE")) - data_missing = data.copy() - indices = [(i, j) for i in range(data.shape[0]) for j in range(data.shape[1])] - rng.shuffle(indices) - for i, j in indices[:5]: - data_missing.iat[i, j] = np.nan - return data.to_numpy(), data_missing.to_numpy() - - @pytest.fixture def regression_dataset(rng): X, y = make_regression(n_samples=100, n_features=20, random_state=42) @@ -28,36 +17,6 @@ def regression_dataset(rng): return X, X_missing, y -def _base_check_imputation( - data_original: np.ndarray, - data_imputed: np.ndarray, -): - """Provides the following base checks: - - Imputation doesn't leave any NaN behind - - Imputation doesn't modify any data that wasn't NaN - - Args: - data_before_imputation: Dataset before imputation - data_after_imputation: Dataset after imputation - - Raises: - AssertionError: If any of the checks fail. - """ - if data_original.shape != data_imputed.shape: - raise AssertionError("The shapes of the two datasets do not match") - - # Ensure no NaN remains in the imputed dataset - if np.isnan(data_imputed).any(): - raise AssertionError("NaN found in imputed columns of layer_after.") - - # Ensure imputation does not alter non-NaN values in the imputed columns - imputed_non_nan_mask = ~np.isnan(data_original) - if not _are_ndarrays_equal(data_original[imputed_non_nan_mask], data_imputed[imputed_non_nan_mask]): - raise AssertionError("Non-NaN values in imputed columns were modified.") - - return - - def test_median_imputation(simple_test_df): """Tests if median imputation successfully fills all NaN values""" data, data_missing = simple_test_df @@ -222,18 +181,3 @@ def test_invalid_temporal_mode(): """Tests if imputer raises error for invalid temporal_mode""" with pytest.raises(ValueError): FaissImputer(temporal_mode="invalid") - - -def _are_ndarrays_equal(arr1: np.ndarray, arr2: np.ndarray) -> np.bool_: - """Check if two arrays are equal member-wise. 
- - Note: Two NaN are considered equal. - - Args: - arr1: First array to compare - arr2: Second array to compare - - Returns: - True if the two arrays are equal member-wise - """ - return np.all(np.equal(arr1, arr2, dtype=object) | ((arr1 != arr1) & (arr2 != arr2))) diff --git a/tests/gpu/test_gpu.py b/tests/gpu/test_gpu.py index f3891cc..5ea13b4 100644 --- a/tests/gpu/test_gpu.py +++ b/tests/gpu/test_gpu.py @@ -1,6 +1,13 @@ import pytest +from tests.compare_predictions import _base_check_imputation + +from fknni.faiss.faiss import FaissImputer @pytest.mark.gpu -def test_gpu(): - assert 1 + 1 == 2 +def test_median_imputation(simple_test_df): + """Tests if median imputation successfully fills all NaN values""" + data, data_missing = simple_test_df + data_original = data_missing.copy() + data_imputed = FaissImputer(n_neighbors=5, strategy="median", use_gpu=True).fit_transform(data_missing) + _base_check_imputation(data_original, data_imputed)