Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
outputs:
envs: ${{ steps.get-envs.outputs.envs }}
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5
with:
filter: blob:none
fetch-depth: 0
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/test-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,15 @@ jobs:
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.13"
python-version: "3.12"

- name: Install uv
uses: astral-sh/setup-uv@v7
with:
cache-dependency-glob: pyproject.toml

- name: Install fknni
run: uv pip install --system -e ".[test]"
run: uv pip install --system -e ".[test,faissgpu]"
- name: Pip list
run: pip list

Expand Down
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ classifiers = [
"Programming Language :: Python :: 3.13",
]
dependencies = [
"faiss-cpu",
"lamin-utils",
"pandas",
"scikit-learn",
Expand All @@ -45,6 +44,8 @@ optional-dependencies.doc = [
"sphinxcontrib-bibtex>=1",
"sphinxext-opengraph",
]
optional-dependencies.faisscpu = [ "faiss-cpu" ]
optional-dependencies.faissgpu = [ "faiss-gpu-cu12" ]
optional-dependencies.rapids12 = [
"cudf-cu12>=25.10",
"cugraph-cu12>=25.10",
Expand Down Expand Up @@ -88,7 +89,7 @@ deps = [ "pre" ]
python = [ "3.13" ]

[tool.hatch.envs.hatch-test]
features = [ "dev", "test" ]
features = [ "dev", "test", "faisscpu" ]

[tool.hatch.envs.hatch-test.overrides]
# If the matrix variable `deps` is set to "pre",
Expand Down Expand Up @@ -142,6 +143,7 @@ testpaths = [ "tests" ]
xfail_strict = true
addopts = [
"--import-mode=importlib", # allow using test files with same name
"-m not gpu",
]
markers = [
"gpu: mark test to run on GPU",
Expand Down
18 changes: 18 additions & 0 deletions src/fknni/faiss/faiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@
from numpy import dtype
from sklearn.base import BaseEstimator, TransformerMixin

# faiss is distributed as separate CPU and GPU wheels; import whichever one
# is installed and record whether GPU support is available (only GPU builds
# expose StandardGpuResources).
try:
    import faiss

    HAS_FAISS_GPU = hasattr(faiss, "StandardGpuResources")
except ImportError:
    # Point at the packages this project actually supports (see pyproject
    # extras `faisscpu` / `faissgpu`); the old `faiss-gpu` wheel is not it.
    raise ImportError("faiss is required: install 'faiss-cpu' or 'faiss-gpu-cu12'") from None


class FaissImputer(BaseEstimator, TransformerMixin):
"""Imputer for completing missing values using Faiss, incorporating weighted averages based on distance."""
Expand All @@ -23,6 +30,7 @@ def __init__(
index_factory: str = "Flat",
min_data_ratio: float = 0.25,
temporal_mode: Literal["flatten", "per_variable"] = "flatten",
use_gpu: bool = False,
):
"""Initializes FaissImputer with specified parameters that are used for the imputation.

Expand All @@ -39,6 +47,7 @@ def __init__(
temporal_mode: How to handle 3D temporal data. 'flatten' treats all (variable, timestep) pairs as
independent features (fast but allows temporal leakage).
'per_variable' imputes each variable independently across time (slower but respects temporal causality).
use_gpu: Whether to train using GPU.
"""
if n_neighbors < 1:
raise ValueError("n_neighbors must be at least 1.")
Expand All @@ -47,6 +56,10 @@ def __init__(
if temporal_mode not in {"flatten", "per_variable"}:
raise ValueError("Unknown temporal_mode. Choose one of 'flatten', 'per_variable'")

self.use_gpu = use_gpu
if use_gpu and not HAS_FAISS_GPU:
raise ValueError("use_gpu=True requires faiss-gpu package, install with: pip install faiss-gpu") from None

self.missing_values = missing_values
self.n_neighbors = n_neighbors
self.metric = metric
Expand Down Expand Up @@ -236,6 +249,11 @@ def _features_indices_sorted_descending_on_nan(self) -> list[int]:
def _train(self, x_train: np.ndarray) -> faiss.Index:
    """Build, train, and populate a faiss index over *x_train*.

    Args:
        x_train: 2D float array of training rows; shape[1] is the index dimension.

    Returns:
        A trained faiss index with all rows of ``x_train`` added
        (a GPU-resident index when ``self.use_gpu`` is set).
    """
    index = faiss.index_factory(x_train.shape[1], self.index_factory)
    # Any metric string other than "l2" falls through to inner product.
    index.metric_type = faiss.METRIC_L2 if self.metric == "l2" else faiss.METRIC_INNER_PRODUCT

    if self.use_gpu:
        # Move the CPU-built index onto GPU device 0 before training.
        # NOTE(review): device is hard-coded to 0 — confirm single-GPU is intended.
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)

    index.train(x_train)
    index.add(x_train)
    return index
Expand Down
Empty file added tests/__init__.py
Empty file.
46 changes: 46 additions & 0 deletions tests/compare_predictions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import numpy as np


def _are_ndarrays_equal(arr1: np.ndarray, arr2: np.ndarray) -> np.bool_:
"""Check if two arrays are equal member-wise.

Note: Two NaN are considered equal.

Args:
arr1: First array to compare
arr2: Second array to compare

Returns:
True if the two arrays are equal member-wise
"""
return np.all(np.equal(arr1, arr2, dtype=object) | ((arr1 != arr1) & (arr2 != arr2)))


def _base_check_imputation(
data_original: np.ndarray,
data_imputed: np.ndarray,
):
"""Provides the following base checks:
- Imputation doesn't leave any NaN behind
- Imputation doesn't modify any data that wasn't NaN

Args:
data_before_imputation: Dataset before imputation
data_after_imputation: Dataset after imputation

Raises:
AssertionError: If any of the checks fail.
"""
if data_original.shape != data_imputed.shape:
raise AssertionError("The shapes of the two datasets do not match")

# Ensure no NaN remains in the imputed dataset
if np.isnan(data_imputed).any():
raise AssertionError("NaN found in imputed columns of layer_after.")

# Ensure imputation does not alter non-NaN values in the imputed columns
imputed_non_nan_mask = ~np.isnan(data_original)
if not _are_ndarrays_equal(data_original[imputed_non_nan_mask], data_imputed[imputed_non_nan_mask]):
raise AssertionError("Non-NaN values in imputed columns were modified.")

return
19 changes: 19 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import numpy as np
import pandas as pd
import pytest


@pytest.fixture
def rng():
    """Seeded random generator so tests are reproducible."""
    return np.random.default_rng(0)


@pytest.fixture
def simple_test_df(rng):
    """Return a (full, with_missing) pair of 10x5 arrays.

    Five randomly chosen cells of the copy are set to NaN; the original
    integer data is returned unchanged for comparison.
    """
    data = pd.DataFrame(rng.integers(0, 100, size=(10, 5)), columns=list("ABCDE"))
    # Cast to float before inserting NaN: assigning NaN into an int64 column
    # via .iat is deprecated in pandas 2.x and raises in pandas 3. The
    # resulting .to_numpy() is float64 either way, so callers are unaffected.
    data_missing = data.astype(np.float64)
    indices = [(i, j) for i in range(data.shape[0]) for j in range(data.shape[1])]
    rng.shuffle(indices)
    for i, j in indices[:5]:
        data_missing.iat[i, j] = np.nan
    return data.to_numpy(), data_missing.to_numpy()
7 changes: 0 additions & 7 deletions tests/cpu/conftest.py

This file was deleted.

58 changes: 1 addition & 57 deletions tests/cpu/test_faiss_imputation.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,11 @@
import numpy as np
import pandas as pd
import pytest
from sklearn.datasets import make_regression
from tests.compare_predictions import _base_check_imputation

from fknni.faiss.faiss import FaissImputer


@pytest.fixture
def simple_test_df(rng):
data = pd.DataFrame(rng.integers(0, 100, size=(10, 5)), columns=list("ABCDE"))
data_missing = data.copy()
indices = [(i, j) for i in range(data.shape[0]) for j in range(data.shape[1])]
rng.shuffle(indices)
for i, j in indices[:5]:
data_missing.iat[i, j] = np.nan
return data.to_numpy(), data_missing.to_numpy()


@pytest.fixture
def regression_dataset(rng):
X, y = make_regression(n_samples=100, n_features=20, random_state=42)
Expand All @@ -28,36 +17,6 @@ def regression_dataset(rng):
return X, X_missing, y


def _base_check_imputation(
data_original: np.ndarray,
data_imputed: np.ndarray,
):
"""Provides the following base checks:
- Imputation doesn't leave any NaN behind
- Imputation doesn't modify any data that wasn't NaN

Args:
data_before_imputation: Dataset before imputation
data_after_imputation: Dataset after imputation

Raises:
AssertionError: If any of the checks fail.
"""
if data_original.shape != data_imputed.shape:
raise AssertionError("The shapes of the two datasets do not match")

# Ensure no NaN remains in the imputed dataset
if np.isnan(data_imputed).any():
raise AssertionError("NaN found in imputed columns of layer_after.")

# Ensure imputation does not alter non-NaN values in the imputed columns
imputed_non_nan_mask = ~np.isnan(data_original)
if not _are_ndarrays_equal(data_original[imputed_non_nan_mask], data_imputed[imputed_non_nan_mask]):
raise AssertionError("Non-NaN values in imputed columns were modified.")

return


def test_median_imputation(simple_test_df):
"""Tests if median imputation successfully fills all NaN values"""
data, data_missing = simple_test_df
Expand Down Expand Up @@ -222,18 +181,3 @@ def test_invalid_temporal_mode():
"""Tests if imputer raises error for invalid temporal_mode"""
with pytest.raises(ValueError):
FaissImputer(temporal_mode="invalid")


def _are_ndarrays_equal(arr1: np.ndarray, arr2: np.ndarray) -> np.bool_:
"""Check if two arrays are equal member-wise.

Note: Two NaN are considered equal.

Args:
arr1: First array to compare
arr2: Second array to compare

Returns:
True if the two arrays are equal member-wise
"""
return np.all(np.equal(arr1, arr2, dtype=object) | ((arr1 != arr1) & (arr2 != arr2)))
11 changes: 9 additions & 2 deletions tests/gpu/test_gpu.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
import pytest
from tests.compare_predictions import _base_check_imputation

from fknni.faiss.faiss import FaissImputer


@pytest.mark.gpu
def test_median_imputation(simple_test_df):
    """Tests if median imputation successfully fills all NaN values"""
    data, data_missing = simple_test_df
    data_original = data_missing.copy()
    # Validate the imputer's RETURN value: sklearn-style fit_transform
    # returns the imputed array and does not guarantee in-place mutation of
    # its input, so checking `data_missing` afterwards would only pass by
    # accident.
    data_imputed = FaissImputer(n_neighbors=5, strategy="median", use_gpu=True).fit_transform(data_missing)
    _base_check_imputation(data_original, data_imputed)
Loading