From 553389229f349c7110a3a4c46ec873d366deb1c5 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:58:03 -0400 Subject: [PATCH 1/2] attempt to replace pandas with polars, this is untested --- peppy/parsers.py | 23 ++++++++++++----------- peppy/project.py | 23 ++++++++++++----------- peppy/sample.py | 12 +++++++----- requirements/requirements-all.txt | 3 ++- 4 files changed, 33 insertions(+), 28 deletions(-) diff --git a/peppy/parsers.py b/peppy/parsers.py index 961d6e17..1622ae37 100644 --- a/peppy/parsers.py +++ b/peppy/parsers.py @@ -1,7 +1,8 @@ import os from typing import Any, Dict, List -import pandas as pd +#import pandas as pd +import polars as pl from .exceptions import InvalidSampleTableFileException @@ -18,7 +19,7 @@ class TableParser: def __init__(self, path: str, exts: List[str]) -> None: self._path = path self._exts = exts - self._table: pd.DataFrame = None + self._table: pl.DataFrame = None self._pandas_kwargs: Dict[str, Any] = { "dtype": str, "index_col": False, @@ -41,7 +42,7 @@ def extensions(self) -> List[str]: return self._exts @property - def table(self) -> pd.DataFrame: + def table(self) -> pl.DataFrame: """ The parsed table @@ -62,7 +63,7 @@ def validate_path(self) -> None: f"Sample table file format not supported: {self.path}" ) - def parse(self) -> pd.DataFrame: + def parse(self) -> pl.DataFrame: """ Parse the sample table """ @@ -83,13 +84,13 @@ class CSVTableParser(TableParser): def __init__(self, path: str) -> None: super().__init__(path, ["csv"]) - def parse(self) -> pd.DataFrame: + def parse(self) -> pl.DataFrame: """ Parse the sample table """ self.validate_path() - self._table = pd.read_csv(self.path, **self._pandas_kwargs) - self._table = self._table.where(pd.notnull(self._table), None) + self._table = pl.read_csv(self.path, **self._pandas_kwargs) + self._table = self._table.where(pl.notnull(self._table), None) return self.table @@ -101,12 +102,12 @@ class TSVTableParser(TableParser): def __init__(self, path: str) -> None: super().__init__(path, ["tsv"]) - def parse(self) -> pd.DataFrame: + def parse(self) -> pl.DataFrame: """ Parse the sample table """ self.validate_path() - self._table = pd.read_csv(self.path, sep="\t", **self._pandas_kwargs) + self._table = pl.read_csv(self.path, sep="\t", **self._pandas_kwargs) return self.table @@ -118,12 +119,12 @@ class XLSXTableParser(TableParser): def __init__(self, path: str) -> None: super().__init__(path, ["xlsx"]) - def parse(self) -> pd.DataFrame: + def parse(self) -> pl.DataFrame: """ Parse the sample table """ self.validate_path() - self._table = pd.read_excel(self.path, **self._pandas_kwargs) + self._table = pl.read_excel(self.path, **self._pandas_kwargs) return self.table diff --git a/peppy/project.py b/peppy/project.py index 8b562603..dfffc34d 100644 --- a/peppy/project.py +++ b/peppy/project.py @@ -10,9 +10,10 @@ from typing import Iterable, List, Tuple, Union, Literal import numpy as np -import pandas as pd +#import pandas as pd +import polars as pl import yaml -from pandas.core.common import flatten +#from pandas.core.common import flatten from rich.console import Console from rich.progress import track from ubiquerg import is_url @@ -173,8 +174,8 @@ def __eq__(self, other): @classmethod def from_pandas( cls, - samples_df: pd.DataFrame, - sub_samples_df: List[pd.DataFrame] = None, + samples_df: pl.DataFrame, + sub_samples_df: List[pl.DataFrame] = None, config: dict = None, ): """ @@ -224,7 +225,7 @@ def _from_dict(self, pep_dictionary) -> "Project": _samples: list | dict, _subsamples: list[list | dict]} """ - self[SAMPLE_DF_KEY] = pd.DataFrame(pep_dictionary[SAMPLE_RAW_DICT_KEY]).replace( + self[SAMPLE_DF_KEY] = pl.DataFrame(pep_dictionary[SAMPLE_RAW_DICT_KEY]).replace( np.nan, "" ) self[CONFIG_KEY] = pep_dictionary[CONFIG_KEY] @@ -232,7 +233,7 @@ def _from_dict(self, pep_dictionary) -> "Project": if SUBSAMPLE_RAW_LIST_KEY in pep_dictionary: if pep_dictionary[SUBSAMPLE_RAW_LIST_KEY]: self[SUBSAMPLE_DF_KEY] = [ - pd.DataFrame(sub_a).replace(np.nan, "") + pl.DataFrame(sub_a).replace(np.nan, "") for sub_a in pep_dictionary[SUBSAMPLE_RAW_LIST_KEY] ] if NAME_KEY in self[CONFIG_KEY]: @@ -291,7 +292,7 @@ def from_sample_yaml(cls, yaml_file: str): _LOGGER.info("Processing project from yaml...") with open(yaml_file, "r") as f: prj_dict = yaml.safe_load(f) - pd_df = pd.DataFrame.from_dict(prj_dict) + pd_df = pl.DataFrame.from_dict(prj_dict) return cls.from_pandas(pd_df) def to_dict( @@ -375,9 +376,9 @@ def _get_table_from_samples(self, index, initial=False): if SAMPLE_DF_KEY in self: df = self[SAMPLE_DF_KEY] else: - df = pd.DataFrame() + df = pl.DataFrame() else: - df = pd.DataFrame.from_dict([s.to_dict() for s in self.samples]) + df = pl.DataFrame.from_dict([s.to_dict() for s in self.samples]) index = [index] if isinstance(index, str) else index if not all([i in df.columns for i in index]): _LOGGER.debug( @@ -745,7 +746,7 @@ def _get_merged_attributes( attribute_value_for_sample = sample.get(attr, "") attribute_values.append(attribute_value_for_sample) - merged_attributes[attr] = list(flatten(attribute_values)) + merged_attributes[attr] = list(pl.Expr.flatten(attribute_values)) return merged_attributes @@ -1255,7 +1256,7 @@ def subsample_table(self): if not self[SUBSAMPLE_DF_KEY]: return None - subsample_dataframes_array = make_list(self[SUBSAMPLE_DF_KEY], pd.DataFrame) + subsample_dataframes_array = make_list(self[SUBSAMPLE_DF_KEY], pl.DataFrame) for subsample_table in subsample_dataframes_array: if not all([i in subsample_table.columns for i in self.sst_index]): _LOGGER.info( diff --git a/peppy/sample.py b/peppy/sample.py index d28fdaff..104bee80 100644 --- a/peppy/sample.py +++ b/peppy/sample.py @@ -5,9 +5,11 @@ from logging import getLogger from string import Formatter -import pandas as pd +#import pandas as pd +import polars as pl import yaml -from pandas import Series, isnull +#from pandas import Series, isnull +from polars import Series from .const import ( CONFIG_FILE_KEY, @@ -122,7 +124,7 @@ def _obj2dict(obj, name=None): elif hasattr(obj, "dtype"): # numpy data types # TODO: this fails with ValueError for multi-element array. return obj.item() - elif isnull(obj): + elif pl.is_null(obj): # Missing values as evaluated by pandas.isnull(). # This gets correctly written into yaml. return None @@ -317,7 +319,7 @@ def sample_name(self): def __reduce__(self): return ( self.__class__, - (pd.Series(self.to_dict()),), + (pl.Series(self.to_dict()),), # (None, {}), # iter([]), # iter({PRJ_REF: self[PRJ_REF]}.items()), @@ -374,7 +376,7 @@ def _excl_from_repr(self, k, cls): def _excl_classes_from_todict(self): """Exclude pandas.DataFrame from dict representation""" - return (pd.DataFrame,) + return (pl.DataFrame,) def _try_touch_samples(self): """ diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index a0036b9c..a9dccb88 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,5 +1,6 @@ -pandas>=0.24.2 +#pandas>=0.24.2 pyyaml rich>=10.3.0 ubiquerg>=0.6.2 numpy +polars From 4fd6012b197f71616d15fc7e46eef35793ffbba4 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 2 Jul 2024 13:12:06 -0400 Subject: [PATCH 2/2] replace pandas in tests with polars --- tests/conftest.py | 5 +++-- tests/test_Project.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index a689ffc1..89fc095b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,7 +2,8 @@ import os -import pandas as pd +# import pandas as pd +import polars as pl import pytest __author__ = "Michal Stolarczyk" @@ -70,6 +71,6 @@ def example_peps_cfg_paths(request): @pytest.fixture def config_with_pandas_obj(request): - return pd.read_csv( + return pl.read_csv( get_path_to_example_file(EPB, request.param, "sample_table.csv"), dtype=str ) diff --git a/tests/test_Project.py b/tests/test_Project.py index f165fb53..dd67c4e1 100644 --- a/tests/test_Project.py +++ b/tests/test_Project.py @@ -6,7 +6,8 @@ import numpy as np import pytest -from pandas import DataFrame +#from pandas import DataFrame +from polars import DataFrame from yaml import dump, safe_load import pickle