From 553389229f349c7110a3a4c46ec873d366deb1c5 Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Fri, 28 Jun 2024 15:58:03 -0400
Subject: [PATCH 1/2] attempt to replace pandas with polars, this is untested

---
 peppy/parsers.py                  | 23 ++++++++++++-----------
 peppy/project.py                  | 23 ++++++++++++-----------
 peppy/sample.py                   | 12 +++++++-----
 requirements/requirements-all.txt |  3 ++-
 4 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/peppy/parsers.py b/peppy/parsers.py
index 961d6e17..1622ae37 100644
--- a/peppy/parsers.py
+++ b/peppy/parsers.py
@@ -1,7 +1,8 @@
 import os
 from typing import Any, Dict, List
 
-import pandas as pd
+#import pandas as pd
+import polars as pl
 
 from .exceptions import InvalidSampleTableFileException
 
@@ -18,7 +19,7 @@ class TableParser:
     def __init__(self, path: str, exts: List[str]) -> None:
         self._path = path
         self._exts = exts
-        self._table: pd.DataFrame = None
+        self._table: pl.DataFrame = None
         self._pandas_kwargs: Dict[str, Any] = {
             "dtype": str,
             "index_col": False,
@@ -41,7 +42,7 @@ def extensions(self) -> List[str]:
         return self._exts
 
     @property
-    def table(self) -> pd.DataFrame:
+    def table(self) -> pl.DataFrame:
         """
         The parsed table
 
@@ -62,7 +63,7 @@ def validate_path(self) -> None:
                 f"Sample table file format not supported: {self.path}"
             )
 
-    def parse(self) -> pd.DataFrame:
+    def parse(self) -> pl.DataFrame:
         """
         Parse the sample table
         """
@@ -83,13 +84,13 @@ class CSVTableParser(TableParser):
     def __init__(self, path: str) -> None:
         super().__init__(path, ["csv"])
 
-    def parse(self) -> pd.DataFrame:
+    def parse(self) -> pl.DataFrame:
         """
-        Parse the sample table
+        Parse the CSV sample table; every column is read as a string
         """
         self.validate_path()
-        self._table = pd.read_csv(self.path, **self._pandas_kwargs)
-        self._table = self._table.where(pd.notnull(self._table), None)
+        # infer_schema_length=0 -> every column is a string; missing cells are null
+        self._table = pl.read_csv(self.path, infer_schema_length=0)
         return self.table
 
 
@@ -101,12 +102,12 @@ class TSVTableParser(TableParser):
     def __init__(self, path: str) -> None:
         super().__init__(path, ["tsv"])
 
-    def parse(self) -> pd.DataFrame:
+    def parse(self) -> pl.DataFrame:
         """
-        Parse the sample table
+        Parse the TSV sample table; every column is read as a string
         """
         self.validate_path()
-        self._table = pd.read_csv(self.path, sep="\t", **self._pandas_kwargs)
+        self._table = pl.read_csv(self.path, separator="\t", infer_schema_length=0)
         return self.table
 
 
@@ -118,12 +119,12 @@ class XLSXTableParser(TableParser):
     def __init__(self, path: str) -> None:
         super().__init__(path, ["xlsx"])
 
-    def parse(self) -> pd.DataFrame:
+    def parse(self) -> pl.DataFrame:
         """
-        Parse the sample table
+        Parse the XLSX sample table; every column is requested as a string
         """
         self.validate_path()
-        self._table = pd.read_excel(self.path, **self._pandas_kwargs)
+        self._table = pl.read_excel(self.path, infer_schema_length=0)
         return self.table
 
 
diff --git a/peppy/project.py b/peppy/project.py
index 8b562603..dfffc34d 100644
--- a/peppy/project.py
+++ b/peppy/project.py
@@ -10,9 +10,10 @@
 from typing import Iterable, List, Tuple, Union, Literal
 
 import numpy as np
-import pandas as pd
+#import pandas as pd
+import polars as pl
 import yaml
-from pandas.core.common import flatten
+#from pandas.core.common import flatten
 from rich.console import Console
 from rich.progress import track
 from ubiquerg import is_url
@@ -173,8 +174,8 @@ def __eq__(self, other):
     @classmethod
     def from_pandas(
         cls,
-        samples_df: pd.DataFrame,
-        sub_samples_df: List[pd.DataFrame] = None,
+        samples_df: pl.DataFrame,
+        sub_samples_df: List[pl.DataFrame] = None,
         config: dict = None,
     ):
         """
@@ -224,7 +225,7 @@ def _from_dict(self, pep_dictionary) -> "Project":
                                                                              _samples: list | dict,
                                                                              _subsamples: list[list | dict]}
         """
-        self[SAMPLE_DF_KEY] = pd.DataFrame(pep_dictionary[SAMPLE_RAW_DICT_KEY]).replace(
-            np.nan, ""
-        )
+        # polars marks missing cells as null rather than NaN, so fill nulls with ""
+        raw_samples = pep_dictionary[SAMPLE_RAW_DICT_KEY]
+        self[SAMPLE_DF_KEY] = pl.DataFrame(raw_samples).fill_null("")
         self[CONFIG_KEY] = pep_dictionary[CONFIG_KEY]
@@ -232,7 +233,7 @@ def _from_dict(self, pep_dictionary) -> "Project":
         if SUBSAMPLE_RAW_LIST_KEY in pep_dictionary:
             if pep_dictionary[SUBSAMPLE_RAW_LIST_KEY]:
                 self[SUBSAMPLE_DF_KEY] = [
-                    pd.DataFrame(sub_a).replace(np.nan, "")
+                    pl.DataFrame(sub_a).fill_null("")
                     for sub_a in pep_dictionary[SUBSAMPLE_RAW_LIST_KEY]
                 ]
         if NAME_KEY in self[CONFIG_KEY]:
@@ -291,7 +292,7 @@ def from_sample_yaml(cls, yaml_file: str):
         _LOGGER.info("Processing project from yaml...")
         with open(yaml_file, "r") as f:
             prj_dict = yaml.safe_load(f)
-        pd_df = pd.DataFrame.from_dict(prj_dict)
+        pd_df = pl.DataFrame(prj_dict)
         return cls.from_pandas(pd_df)
 
     def to_dict(
@@ -375,9 +376,9 @@ def _get_table_from_samples(self, index, initial=False):
             if SAMPLE_DF_KEY in self:
                 df = self[SAMPLE_DF_KEY]
             else:
-                df = pd.DataFrame()
+                df = pl.DataFrame()
         else:
-            df = pd.DataFrame.from_dict([s.to_dict() for s in self.samples])
+            df = pl.DataFrame([s.to_dict() for s in self.samples])
         index = [index] if isinstance(index, str) else index
         if not all([i in df.columns for i in index]):
             _LOGGER.debug(
@@ -745,7 +746,16 @@ def _get_merged_attributes(
                 attribute_value_for_sample = sample.get(attr, "")
                 attribute_values.append(attribute_value_for_sample)
 
-            merged_attributes[attr] = list(flatten(attribute_values))
+            # polars has no stand-in for pandas' flatten (Expr.flatten needs an
+            # expression), so unnest lists/tuples depth-first, keeping order.
+            flat_values, pending = [], attribute_values[::-1]
+            while pending:
+                value = pending.pop()
+                if isinstance(value, (list, tuple)):
+                    pending.extend(reversed(value))
+                else:
+                    flat_values.append(value)
+            merged_attributes[attr] = flat_values
 
         return merged_attributes
 
@@ -1255,7 +1256,7 @@ def subsample_table(self):
         if not self[SUBSAMPLE_DF_KEY]:
             return None
 
-        subsample_dataframes_array = make_list(self[SUBSAMPLE_DF_KEY], pd.DataFrame)
+        subsample_dataframes_array = make_list(self[SUBSAMPLE_DF_KEY], pl.DataFrame)
         for subsample_table in subsample_dataframes_array:
             if not all([i in subsample_table.columns for i in self.sst_index]):
                 _LOGGER.info(
diff --git a/peppy/sample.py b/peppy/sample.py
index d28fdaff..104bee80 100644
--- a/peppy/sample.py
+++ b/peppy/sample.py
@@ -5,9 +5,11 @@
 from logging import getLogger
 from string import Formatter
 
-import pandas as pd
+#import pandas as pd
+import polars as pl
 import yaml
-from pandas import Series, isnull
+#from pandas import Series, isnull
+from polars import Series
 
 from .const import (
     CONFIG_FILE_KEY,
@@ -122,7 +124,7 @@ def _obj2dict(obj, name=None):
             elif hasattr(obj, "dtype"):  # numpy data types
                 # TODO: this fails with ValueError for multi-element array.
                 return obj.item()
-            elif isnull(obj):
-                # Missing values as evaluated by pandas.isnull().
+            elif obj is None or (isinstance(obj, float) and obj != obj):
+                # Missing scalars: None, or float NaN (which never equals itself).
                 # This gets correctly written into yaml.
                 return None
@@ -317,7 +319,7 @@ def sample_name(self):
     def __reduce__(self):
         return (
             self.__class__,
-            (pd.Series(self.to_dict()),),
+            (pl.Series(self.to_dict()),),
             # (None, {}),
             # iter([]),
             # iter({PRJ_REF: self[PRJ_REF]}.items()),
@@ -374,7 +376,7 @@ def _excl_from_repr(self, k, cls):
 
     def _excl_classes_from_todict(self):
-        """Exclude pandas.DataFrame from dict representation"""
-        return (pd.DataFrame,)
+        """Exclude polars.DataFrame from dict representation"""
+        return (pl.DataFrame,)
 
     def _try_touch_samples(self):
         """
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index a0036b9c..a9dccb88 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -1,5 +1,6 @@
-pandas>=0.24.2
+#pandas>=0.24.2
 pyyaml
 rich>=10.3.0
 ubiquerg>=0.6.2
 numpy
+polars

From 4fd6012b197f71616d15fc7e46eef35793ffbba4 Mon Sep 17 00:00:00 2001
From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com>
Date: Tue, 2 Jul 2024 13:12:06 -0400
Subject: [PATCH 2/2] replace pandas in tests with polars

---
 tests/conftest.py     | 5 +++--
 tests/test_Project.py | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index a689ffc1..89fc095b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,7 +2,8 @@
 
 import os
 
-import pandas as pd
+# import pandas as pd
+import polars as pl
 import pytest
 
 __author__ = "Michal Stolarczyk"
@@ -70,6 +71,6 @@ def example_peps_cfg_paths(request):
 
 @pytest.fixture
 def config_with_pandas_obj(request):
-    return pd.read_csv(
-        get_path_to_example_file(EPB, request.param, "sample_table.csv"), dtype=str
-    )
+    # infer_schema_length=0 is the polars counterpart of pandas' dtype=str
+    sample_table = get_path_to_example_file(EPB, request.param, "sample_table.csv")
+    return pl.read_csv(sample_table, infer_schema_length=0)
diff --git a/tests/test_Project.py b/tests/test_Project.py
index f165fb53..dd67c4e1 100644
--- a/tests/test_Project.py
+++ b/tests/test_Project.py
@@ -6,7 +6,8 @@
 
 import numpy as np
 import pytest
-from pandas import DataFrame
+#from pandas import DataFrame
+from polars import DataFrame
 from yaml import dump, safe_load
 import pickle