Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,6 @@ setuptools = "*"
# test
pytest = "*"
pytest-cov = "*"
polars = "*"
# linting
pre-commit = "*"
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ classifiers = [

dependencies = [
"numpy",
"pandas",
"scikit-learn"
]
dynamic = ["version"]
Expand Down
82 changes: 38 additions & 44 deletions src/tclf/classical_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,16 @@
import numpy as np
import numpy.typing as npt
import pandas as pd
from scipy import sparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_random_state
from sklearn.utils.validation import (
_check_sample_weight,
_get_feature_names,
check_array,
check_is_fitted,
)

from tclf.types import ArrayLike, MatrixLike

ALLOWED_FUNC_LITERALS = Literal[
"tick",
"rev_tick",
Expand Down Expand Up @@ -396,7 +397,8 @@ def _nan(self, subset: str) -> npt.NDArray:
Returns:
npt.NDArray: result of the trade size rule. Can be np.NaN.
"""
return np.full(shape=(self.X_.shape[0],), fill_value=np.nan)
n_samples = next(iter(self.X_.values())).shape[0]
return np.full(shape=(n_samples,), fill_value=np.nan)

def _validate_columns(self, missing_columns: list) -> None:
"""Validate if all required columns are present.
Expand All @@ -407,8 +409,9 @@ def _validate_columns(self, missing_columns: list) -> None:
Raises:
ValueError: columns missing in dataframe.
"""
columns = self.columns_ + missing_columns if self.columns_ else missing_columns
self.X_ = pd.DataFrame(np.zeros(shape=(1, len(columns))), columns=columns)
columns = self.feature_names_in_.tolist()
columns.extend(missing_columns)
self.X_ = {c: np.zeros(shape=(1, 1)) for c in columns}
try:
self._predict()
except KeyError as e:
Expand All @@ -427,15 +430,15 @@ def _validate_columns(self, missing_columns: list) -> None:

def fit(
self,
X: MatrixLike,
y: ArrayLike | None = None,
X,
y=None,
sample_weight: npt.NDArray | None = None,
) -> ClassicalClassifier:
"""Fit the classifier.

Args:
X (MatrixLike): features
y (ArrayLike | None, optional): ignored, present here for API consistency by convention.
X: features
y: ignored, present here for API consistency by convention.
sample_weight (npt.NDArray | None, optional): Sample weights. Defaults to None.

Raises:
Expand Down Expand Up @@ -465,30 +468,12 @@ def fit(

self.func_mapping_ = dict(zip(ALLOWED_FUNC_STR, funcs))

# create working copy to be altered and try to get columns from df
self.columns_ = self.features
if isinstance(X, pd.DataFrame):
self.columns_ = X.columns.tolist()

X = self._validate_data(
X,
y="no_validation",
dtype=[np.float64, np.float32],
accept_sparse=False,
force_all_finite=False,
)

# set feature names, if given
self._check_feature_names(X, reset=True)
X = _check_X(X)
self._check_n_features(X, reset=True)
self.classes_ = np.array([-1, 1])

# if no features are provided or inferred, use default
if self.columns_ is None:
self.columns_ = [str(i) for i in range(X.shape[1])]

if len(self.columns_) > 0 and X.shape[1] != len(self.columns_):
raise ValueError(
f"Expected {len(self.columns_)} columns, got {X.shape[1]}."
)

self._layers = self.layers if self.layers is not None else []
for func_str, _ in self._layers:
if func_str not in ALLOWED_FUNC_STR:
Expand All @@ -500,26 +485,26 @@ def fit(
self._validate_columns([])
return self

def predict(self, X: MatrixLike) -> npt.NDArray:
def predict(self, X) -> npt.NDArray:
"""Perform classification on test vectors `X`.

Args:
X (MatrixLike): feature matrix.
X: feature matrix.

Returns:
npt.NDArray: Predicted target values for X.
"""
check_is_fitted(self)
X = self._validate_data(
X,
dtype=[np.float64, np.float32],
accept_sparse=False,
force_all_finite=False,
)

rs = check_random_state(self.random_state)

self.X_ = pd.DataFrame(data=X, columns=self.columns_)
# adapted from:
# https://github.com/scikit-learn/scikit-learn/blob/f07e0138b/sklearn/compose/_column_transformer.py#L900
column_names = _get_feature_names(X)
self._check_n_features(X, reset=True)
X = _check_X(X)

self.X_ = {c: X[c] for c in column_names}

pred = self._predict()

# fill NaNs randomly with -1 and 1 or with constant zero
Expand All @@ -539,7 +524,9 @@ def _predict(self) -> npt.NDArray:
Returns:
npt.NDArray: prediction
"""
pred = np.full(shape=(self.X_.shape[0],), fill_value=np.nan)
n_samples = next(iter(self.X_.values())).shape[0]
pred = np.full(shape=(n_samples,), fill_value=np.nan)

for func_str, subset in self._layers:
func = self.func_mapping_[func_str]
pred = np.where(
Expand All @@ -549,15 +536,15 @@ def _predict(self) -> npt.NDArray:
)
return pred

def predict_proba(self, X: MatrixLike) -> npt.NDArray:
def predict_proba(self, X) -> npt.NDArray:
"""Predict class probabilities for X.

Probabilities are either 0 or 1 depending on the class.

For strategy 'constant' probabilities are (0.5,0.5) for unclassified classes.

Args:
X (MatrixLike): feature matrix
X: feature matrix

Returns:
npt.NDArray: probabilities
Expand All @@ -578,3 +565,10 @@ def predict_proba(self, X: MatrixLike) -> npt.NDArray:
# For strategy 'constant' probabilities are (0.5,0.5).
prob[mask] = np.identity(n_classes)[indices]
return prob


def _check_X(X):
"""Use check_array only when necessary, e.g. on lists and other non-array-likes."""
if hasattr(X, "__array__") or hasattr(X, "__dataframe__") or sparse.issparse(X):
return X
return check_array(X, force_all_finite="allow-nan", dtype=object)
9 changes: 0 additions & 9 deletions src/tclf/types.py

This file was deleted.

16 changes: 16 additions & 0 deletions tests/test_classical_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import numpy as np
import pandas as pd
import polars as pl
import pytest
from numpy.testing import assert_allclose
from sklearn.base import BaseEstimator
Expand Down Expand Up @@ -226,6 +227,21 @@ def test_override(self, x_train: pd.DataFrame) -> None:
)
assert (y_pred == y_test).all()

def test_polars(self) -> None:
    """Test that the classifier can be fitted and queried with polars data.

    A three-row polars frame with ``trade_price`` and ``price_ex_lag`` is
    passed to ``fit`` and ``predict`` using a single ``("tick", "ex")``
    layer, and the prediction is compared against one expected label per row.
    """
    x = pl.DataFrame({"trade_price": [1, 2, 0], "price_ex_lag": [2, 1, 3]})
    # One expected label per input row (the previous expectation listed only
    # two values for three rows).
    # NOTE(review): labels assume the tick rule yields 1 when trade_price is
    # above the lagged price and -1 when below -- confirm against `_tick`.
    y = pl.Series([-1, 1, -1])

    y_pred = (
        ClassicalClassifier(
            layers=[("tick", "ex")],
            random_state=7,
        )
        .fit(x)
        .predict(x)
    )
    # Compare against a NumPy array so the check is a plain element-wise
    # comparison between two equally long arrays.
    assert (y_pred == y.to_numpy()).all()

def test_np_array(self, x_train: pd.DataFrame) -> None:
"""Test, if classifier works, if only np.ndarrays are provided.

Expand Down