diff --git a/CHANGES.rst b/CHANGES.rst
index cd2fedee..7ad241bf 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -17,6 +17,10 @@ New features and enhancements
 * `cdm_mapper.read_tables`
 * `cdm_mapper.write_tables`
+* introduce `ParquetStreamReader` to replace `pd.io.parsers.TextFileReader` (:issue:`8`, :pull:`348`)
+* ``cdm_reader.map_model`` now supports both `pd.DataFrame` and `ParquetStreamReader` as output (:pull:`348`)
+* ``common.replace_columns`` now supports both `pd.DataFrame` and `ParquetStreamReader` as output (:pull:`348`)
+
 Breaking changes
 ^^^^^^^^^^^^^^^^
 * ``DataBundle.stack_v`` and ``DataBundle.stack_h`` only support `pd.DataFrames` as input, otherwise raises an `ValueError` (:pull:`360`)
@@ -27,12 +31,18 @@ Breaking changes
 * set default for `extension` from ``csv` to specified `data_format` in `mdf_reader.write_data` (:pull:`363`)
 * `mdf_reader.read_data`: save `dtypes` in return DataBundle as `pd.Series` not `dict` (:pull:`363`)
+* remove ``common.pandas_TextParser_hdlr`` (:issue:`8`, :pull:`348`)
+* ``cdm_reader_mapper`` now raises errors instead of logging them (:pull:`348`)
+* ``DataBundle`` now converts all iterables of `pd.DataFrame`/`pd.Series` to `ParquetStreamReader` when initialized (:pull:`348`)
+* all main functions in `common.select` now return a tuple of 4 (selected values, rejected values, original indexes of selected values, original indexes of rejected values) (:pull:`348`)
+* move `ParquetStreamReader` and all corresponding methods to `common.iterators` to handle chunking outside of `mdf_reader`/`cdm_mapper`/`core`/`metmetpy` (:issue:`349`, :pull:`348`)
 
 Internal changes
 ^^^^^^^^^^^^^^^^
 * re-work internal structure for more readability and better performance (:pull:`360`)
 * use pre-defined `Literal` constants in `cdm_reader_mapper.properties` (:pull:`363`)
 * `mdf_reader.utils.utilities.read_csv`: parameter `columns` to `column_names` (:pull:`363`)
+* introduce post-processing decorator that handles both `pd.DataFrame` and `ParquetStreamReader` (:pull:`348`)
 
 2.2.1 (2026-01-23)
 ------------------
diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py
index 16f390a4..7d4b64bc 100755
--- a/cdm_reader_mapper/cdm_mapper/mapper.py
+++ b/cdm_reader_mapper/cdm_mapper/mapper.py
@@ -3,7 +3,7 @@
 
 Created on Thu Apr 11 13:45:38 2019
 
-Maps data contained in a pandas DataFrame (or pd.io.parsers.TextFileReader) to
+Maps data contained in a pandas DataFrame (or Iterable[pd.DataFrame]) to
 the C3S Climate Data Store Common Data Model (CDM) header and observational
 tables using the mapping information available in the tool's mapping library
 for the input data model.
@@ -15,14 +15,18 @@
 from copy import deepcopy
 from io import StringIO
-from typing import Any, get_args
+from typing import Any, Iterable, get_args
 
 import numpy as np
 import pandas as pd
-from pandas.io.parsers import TextFileReader
+from cdm_reader_mapper.common import logging_hdlr
 
-from cdm_reader_mapper.common import logging_hdlr, pandas_TextParser_hdlr
+from cdm_reader_mapper.common.iterators import (
+    ParquetStreamReader,
+    ProcessFunction,
+    process_function,
+)
 
 from . 
import properties from .codes.codes import get_code_table @@ -31,41 +35,6 @@ from .utils.mapping_functions import mapping_functions -def _check_input_data_type(data, logger): - """Check whether inpuit data type is valid.""" - - def _log_and_return_empty(msg): - logger.error(msg) - - if isinstance(data, pd.DataFrame): - logger.debug("Input data is a pd.DataFrame") - if data.empty: - return _log_and_return_empty("Input data is empty") - return [data] - - elif isinstance(data, TextFileReader): - logger.debug("Input is a pd.TextFileReader") - if not pandas_TextParser_hdlr.is_not_empty(data): - return _log_and_return_empty("Input data is empty") - - return data - - return _log_and_return_empty("Input data type " f"{type(data)}" " not supported") - - -def _normalize_input_data(data, logger): - """Return an iterator of DataFrames irrespective of input type.""" - data = _check_input_data_type(data, logger) - - if data is None: - return iter(()) - - if isinstance(data, list): - return iter(data) - - return data - - def _is_empty(value): """Check whether a value is considered empty.""" if value is None: @@ -369,7 +338,7 @@ def _prepare_cdm_tables(cdm_subset): return tables -def _process_chunk( +def _map_data_model( idata, imodel_maps, imodel_functions, @@ -380,9 +349,14 @@ def _process_chunk( drop_missing_obs, drop_duplicates, logger, - is_reader, ): """Process one chunk of input data.""" + if ":" in idata.columns[0]: + idata.columns = pd.MultiIndex.from_tuples( + col.split(":") for col in idata.columns + ) + + all_tables = [] for table, mapping in imodel_maps.items(): logger.debug(f"Table: {table}") @@ -400,118 +374,37 @@ def _process_chunk( ) table_df.columns = pd.MultiIndex.from_product([[table], table_df.columns]) + table_df = table_df.astype(object) + all_tables.append(table_df) - if is_reader: - table_df.to_csv( - cdm_tables[table]["buffer"], - header=False, - index=False, - mode="a", - ) - cdm_tables[table]["columns"] = table_df.columns - else: - cdm_tables[table]["df"] = table_df.astype(object) - - -def _finalize_output(cdm_tables, logger): - """Turn buffers into DataFrames and combine all tables.""" - final_tables = [] - - for table, meta in cdm_tables.items(): - logger.debug(f"\tParse datetime by reader; Table: {table}") - - if "df" not in meta: - meta["buffer"].seek(0) - df = pd.read_csv( - meta["buffer"], - names=meta["columns"], - na_values=[], - dtype="object", - keep_default_na=False, - ) - meta["buffer"].close() - else: - df = meta.get("df", pd.DataFrame()) - - final_tables.append(df) - - if not final_tables: - return pd.DataFrame() - - return pd.concat(final_tables, axis=1, join="outer").reset_index(drop=True) - - -def _map_and_convert( - data_model, - *sub_models, - data=None, - cdm_subset=None, - codes_subset=None, - cdm_complete=True, - drop_missing_obs=True, - drop_duplicates=True, - null_label="null", - logger=None, -) -> pd.DataFrame: - """Map and convert MDF data to CDM tables.""" - data_iter = _normalize_input_data(data, logger) - - if data_iter is None: - return pd.DataFrame() - - if not cdm_subset: - cdm_subset = properties.cdm_tables - - imodel_maps = get_imodel_maps(data_model, *sub_models, cdm_tables=cdm_subset) - imodel_functions = mapping_functions("_".join([data_model] + list(sub_models))) - - cdm_tables = _prepare_cdm_tables(imodel_maps.keys()) - - is_reader = isinstance(data_iter, TextFileReader) - - for idata in data_iter: - _process_chunk( - idata=idata, - imodel_maps=imodel_maps, - imodel_functions=imodel_functions, - cdm_tables=cdm_tables, - 
null_label=null_label, - codes_subset=codes_subset, - cdm_complete=cdm_complete, - drop_missing_obs=drop_missing_obs, - drop_duplicates=drop_duplicates, - logger=logger, - is_reader=is_reader, - ) - - return _finalize_output(cdm_tables, logger) + return pd.concat(all_tables, axis=1, join="outer").reset_index(drop=True) def map_model( - data, - imodel, - cdm_subset=None, - codes_subset=None, - null_label="null", - cdm_complete=True, - drop_missing_obs=True, - drop_duplicates=True, - log_level="INFO", -) -> pd.DataFrame: + data: pd.DataFrame | Iterable[pd.DataFrame], + imodel: str, + cdm_subset: str | list[str] | None = None, + codes_subset: str | list[str] | None = None, + null_label: str = "null", + cdm_complete: bool = True, + drop_missing_obs: bool = True, + drop_duplicates: bool = True, + log_level: str = "INFO", +) -> pd.DataFrame | ParquetStreamReader: """Map a pandas DataFrame to the CDM header and observational tables. Parameters ---------- - data: pandas.DataFrame, pd.parser.TextFileReader or io.String + data: pandas.DataFrame or Iterable[pd.DataFrame] input data to map. imodel: str A specific mapping from generic data model to CDM, like map a SID-DCK from IMMA1’s core and attachments to CDM in a specific way. e.g. ``icoads_r300_d704`` - cdm_subset: list, optional + cdm_subset: str or list, optional subset of CDM model tables to map. Defaults to the full set of CDM tables defined for the imodel. - codes_subset: list, optional + codes_subset: str or list, optional subset of code mapping tables to map. Default to the full set of code mapping tables defined for the imodel. null_label: str @@ -536,30 +429,53 @@ def map_model( cdm_tables: pandas.DataFrame DataFrame with MultiIndex columns (cdm_table, column_name). """ + + @process_function(data_only=True) + def _map_model(): + return ProcessFunction( + data=data, + func=_map_data_model, + func_kwargs={ + "imodel_maps": imodel_maps, + "imodel_functions": imodel_functions, + "cdm_tables": cdm_tables, + "null_label": null_label, + "codes_subset": codes_subset, + "cdm_complete": cdm_complete, + "drop_missing_obs": drop_missing_obs, + "drop_duplicates": drop_duplicates, + "logger": logger, + }, + makecopy=False, + ) + logger = logging_hdlr.init_logger(__name__, level=log_level) if imodel is None: - logger.error("Input data model 'imodel' is not defined.") - return + raise ValueError("Input data model 'imodel' is not defined.") if not isinstance(imodel, str): - logger.error(f"Input data model type is not supported: {type(imodel)}") - return + raise TypeError(f"Input data model type is not supported: {type(imodel)}") - imodel = imodel.split("_") - if imodel[0] not in get_args(properties.SupportedDataModels): - logger.error("Input data model " f"{imodel[0]}" " not supported") - return + data_model = imodel.split("_") + if data_model[0] not in get_args(properties.SupportedDataModels): + raise ValueError("Input data model " f"{data_model[0]}" " not supported") + + if not cdm_subset: + cdm_subset = properties.cdm_tables + + imodel_maps = get_imodel_maps(*data_model, cdm_tables=cdm_subset) + imodel_functions = mapping_functions(imodel) + + cdm_tables = _prepare_cdm_tables(imodel_maps.keys()) + + result = _map_model() + + if isinstance(result, pd.DataFrame): + return pd.DataFrame(result) + elif isinstance(result, ParquetStreamReader): + return result - return _map_and_convert( - imodel[0], - *imodel[1:], - data=data, - cdm_subset=cdm_subset, - codes_subset=codes_subset, - null_label=null_label, - cdm_complete=cdm_complete, - 
drop_missing_obs=drop_missing_obs,
-        drop_duplicates=drop_duplicates,
-        logger=logger,
+    raise ValueError(
+        f"result must be a pd.DataFrame or ParquetStreamReader, not {type(result)}."
     )
diff --git a/cdm_reader_mapper/cdm_mapper/writer.py b/cdm_reader_mapper/cdm_mapper/writer.py
index 8de4f3ed..4affd0ca 100755
--- a/cdm_reader_mapper/cdm_mapper/writer.py
+++ b/cdm_reader_mapper/cdm_mapper/writer.py
@@ -5,7 +5,7 @@
 Exports tables written in the C3S Climate Data Store Common Data Model (CDM)
 format to ascii files,
 The tables format is contained in a python dictionary, stored as an attribute in a pandas.DataFrame
-(or pd.io.parsers.TextFileReader).
+(or Iterable[pd.DataFrame]).
 
 This module uses a set of printer functions to "print" element values to a
 string object before exporting them to a final ascii file.
diff --git a/cdm_reader_mapper/common/inspect.py b/cdm_reader_mapper/common/inspect.py
index 148b13d3..960d56d2 100755
--- a/cdm_reader_mapper/common/inspect.py
+++ b/cdm_reader_mapper/common/inspect.py
@@ -8,31 +8,51 @@
 
 from __future__ import annotations
 
-from typing import Any
+from typing import Any, Iterable, Mapping
 
 import pandas as pd
 
-from .pandas_TextParser_hdlr import make_copy
-from .pandas_TextParser_hdlr import get_length as get_length_hdlr
+from .iterators import ProcessFunction, process_function
 
 
-def _count_by_cat(series) -> dict:
+def merge_sum_dicts(dicts):
+    """Recursively merge dictionaries, summing numeric values at the leaves."""
+    result = {}
+
+    for d in dicts:
+        for key, value in d.items():
+            if key not in result:
+                result[key] = value
+            else:
+                if isinstance(value, Mapping) and isinstance(result[key], Mapping):
+                    result[key] = merge_sum_dicts([result[key], value])
+                else:
+                    result[key] += value
+
+    return result
+
+
+def _count_by_cat(df, columns) -> dict:
     """Count unique values in a pandas Series, including NaNs."""
-    counts = series.value_counts(dropna=False)
-    counts.index = counts.index.where(~counts.index.isna(), "nan")
-    return counts.to_dict()
+    count_dict = {}
+    for column in columns:
+        counts = df[column].value_counts(dropna=False)
+        counts.index = counts.index.where(~counts.index.isna(), "nan")
+        count_dict[column] = counts.to_dict()
+    return count_dict
 
 
+@process_function()
 def count_by_cat(
-    data: pd.DataFrame | pd.io.parsers.TextFileReader,
+    data: pd.DataFrame | Iterable[pd.DataFrame],
     columns: str | list[str] | tuple | None = None,
 ) -> dict[str, dict[Any, int]]:
     """
-    Count unique values per column in a DataFrame or a TextFileReader.
+    Count unique values per column in a DataFrame or an Iterable of DataFrames.
 
     Parameters
     ----------
-    data : pandas.DataFrame or pd.io.parsers.TextFileReader
+    data : pandas.DataFrame or Iterable[pd.DataFrame]
         Input dataset.
     columns : str, list or tuple, optional
         Name(s) of the data column(s) to be selected. If None, all columns are used.
@@ -45,41 +65,36 @@
 
     Notes
     -----
-    - Works with large files via TextFileReader by iterating through chunks.
+    - Works with large files via ParquetStreamReader by iterating through chunks.
 
""" if columns is None: columns = data.columns if not isinstance(columns, list): columns = [columns] - counts = {col: {} for col in columns} - - if isinstance(data, pd.DataFrame): - for column in columns: - counts[column] = _count_by_cat(data[column]) - return counts - - data_cp = make_copy(data) - if data_cp is None: - return counts + return ProcessFunction( + data=data, + func=_count_by_cat, + func_kwargs={"columns": columns}, + non_data_output="acc", + makecopy=False, + non_data_proc=merge_sum_dicts, + ) - for chunk in data_cp: - for column in columns: - chunk_counts = _count_by_cat(chunk[column]) - for k, v in chunk_counts.items(): - counts[column][k] = counts[column].get(k, 0) + v - data_cp.close() - return counts +def _get_length(data: pd.DataFrame): + """Get length pd.DataFrame.""" + return len(data) -def get_length(data: pd.DataFrame | pd.io.parsers.TextFileReader) -> int: +@process_function() +def get_length(data: pd.DataFrame | Iterable[pd.DataFrame]) -> int: """ Get the total number of rows in a pandas object. Parameters ---------- - data : pandas.DataFrame or pandas.io.parsers.TextFileReader + data : pandas.DataFrame or Iterable[pd.DataFrame] Input dataset. Returns @@ -89,9 +104,16 @@ def get_length(data: pd.DataFrame | pd.io.parsers.TextFileReader) -> int: Notes ----- - - Works with large files via TextFileReader by using a specialized handler + - Works with large files via ParquetStreamReader by using a specialized handler to count rows without loading the entire file into memory. """ - if not isinstance(data, pd.io.parsers.TextFileReader): - return len(data) - return get_length_hdlr(data) + if hasattr(data, "_row_count"): + return data._row_count + + return ProcessFunction( + data=data, + func=_get_length, + non_data_output="acc", + makecopy=True, + non_data_proc=sum, + ) diff --git a/cdm_reader_mapper/common/iterators.py b/cdm_reader_mapper/common/iterators.py new file mode 100755 index 00000000..9f28e727 --- /dev/null +++ b/cdm_reader_mapper/common/iterators.py @@ -0,0 +1,557 @@ +"""Utilities for handling pandas TextParser objects safely.""" + +from __future__ import annotations + +import tempfile + +import inspect +import itertools + +import pandas as pd +import xarray as xr + +import pyarrow as pa +import pyarrow.parquet as pq + +from functools import wraps + +from pathlib import Path + +from typing import ( + Any, + Callable, + Generator, + Iterable, + Iterator, + Literal, + Sequence, +) + + +class ProcessFunction: + """Stores data and a callable function with optional arguments for processing.""" + + def __init__( + self, + data: pd.DataFrame | pd.Series | Iterable[pd.DataFrame] | Iterable[pd.Series], + func: Callable[..., Any], + func_args: Any | list[Any] | tuple[Any] | None = None, + func_kwargs: dict[str, Any] | None = None, + **kwargs, + ): + self.data = data + + if not isinstance(func, Callable): + raise ValueError(f"Function {func} is not callable.") + + self.func = func + + if func_args is None: + func_args = () + + if not isinstance(func_args, (list, tuple)): + func_args = (func_args,) + + self.func_args = func_args + + if func_kwargs is None: + func_kwargs = {} + + self.func_kwargs = func_kwargs + + self.kwargs = kwargs + + +class ParquetStreamReader: + """A wrapper that mimics pandas.io.parsers.TextFileReader.""" + + def __init__( + self, + source: ( + Iterator[pd.DataFrame | pd.Series] + | Callable[[], Iterator[pd.DataFrame | pd.Series]] + ), + ): + self._closed = False + self._buffer: list[pd.DataFrame | pd.Series] = [] + + if callable(source): + # factory 
that produces a fresh iterator + self._factory = source + elif isinstance(source, Iterator): + self._factory = lambda: source + else: + raise TypeError( + "ParquetStreamReader expects an iterator or a factory callable." + ) + + self._generator = self._factory() + + def __iter__(self): + """Allows: for df in reader: ...""" + return self + + def __next__(self): + """Allows: next(reader)""" + if self._closed: + raise ValueError("I/O operation on closed stream.") + if self._buffer: + return self._buffer.pop(0) + return next(self._generator) + + def prepend(self, chunk: pd.DataFrame | pd.Series): + """ + Push a chunk back onto the front of the stream. + Useful for peeking at the first chunk without losing it. + """ + # Insert at 0 ensures FIFO order (peeking logic) + self._buffer.insert(0, chunk) + + def get_chunk(self): + """ + Safe for Large Files. + Returns the next single chunk from disk. + (Note: 'size' is ignored here as chunks are pre-determined by the write step) + """ + return next(self) + + def read( + self, + ): + """ + WARNING: unsafe for Files > RAM. + Reads ALL remaining data into memory at once. + """ + # Consume the entire rest of the stream + chunks = list(self) + + if not chunks: + return pd.DataFrame() + + df = pd.concat(chunks) + return df + + def copy(self): + """Create an independent copy of the stream.""" + if self._closed: + raise ValueError("Cannot copy a closed stream.") + self._generator, new_gen = itertools.tee(self._generator) + return ParquetStreamReader(new_gen) + + def empty(self): + """Return True if stream is empty.""" + copy_stream = self.copy() + + try: + next(copy_stream) + return False + except StopIteration: + return True + + def reset_index(self, drop=False): + """Reset indexes continuously.""" + if self._closed: + raise ValueError("Cannot copy a closed stream.") + + offset = 0 + chunks = [] + + for df in self: + df = df.copy() + n = len(df) + + indexes = range(offset, offset + n) + df.index = indexes + + if drop is False: + df.insert(0, "index", indexes) + + offset += n + chunks.append(df) + + return ParquetStreamReader(lambda: iter(chunks)) + + def close(self): + """Close the stream and release resources.""" + self._closed = True + + def __enter__(self): + """Allows: with ParquetStreamReader(...) as reader: ...""" + return self + + def __exit__(self, *_): + """Allows: with ParquetStreamReader(...) as reader: ...""" + self.close() + + +def _sort_chunk_outputs( + outputs: tuple, capture_meta: bool, requested_types: tuple[type, ...] +) -> tuple[list[pd.DataFrame | pd.Series], list[Any]]: + """Separates DataFrames from metadata in the function output.""" + data, meta = [], [] + for out in outputs: + if isinstance(out, requested_types): + data.append(out) + elif isinstance(out, list) and out and isinstance(out[0], requested_types): + data.extend(out) + elif capture_meta: + # Only capture metadata from the first chunk + meta.append(out) + + return data, meta + + +def _initialize_storage( + first_batch: list[pd.DataFrame | pd.Series], +) -> tuple[list, list]: + """Creates temp directories and captures schemas from the first chunk.""" + temp_dirs = [] + schemas = [] + + for obj in first_batch: + if isinstance(obj, pd.DataFrame): + schemas.append((pd.DataFrame, obj.columns)) + elif isinstance(obj, pd.Series): + schemas.append((pd.Series, obj.name)) + else: + raise TypeError( + f"Unsupported data type: {type(obj)}." + "Use one of [pd.DataFrame, pd.Series]." 
+ ) + + temp_dirs.append(tempfile.TemporaryDirectory()) + + return temp_dirs, schemas + + +def _write_chunks_to_disk( + batch: list[pd.DataFrame | pd.Series], + temp_dirs: list[tempfile.TemporaryDirectory], + chunk_counter: int, +) -> None: + """Writes the current batch of DataFrames to their respective temp directories.""" + for i, data_out in enumerate(batch): + if isinstance(data_out, pd.Series): + data_out = data_out.to_frame() + + file_path = Path(temp_dirs[i].name) / f"part_{chunk_counter:05d}.parquet" + + table = pa.Table.from_pandas(data_out, preserve_index=True) + pq.write_table(table, file_path, compression="snappy") + + +def _parquet_generator( + temp_dir, data_type, schema +) -> Generator[pd.DataFrame | pd.Series]: + """Yields DataFrames from a temp directory, restoring schema.""" + try: + files = sorted(Path(temp_dir.name).glob("*.parquet")) + + for f in files: + df = pd.read_parquet(f) + + if data_type is pd.Series: + s = df.iloc[:, 0].copy() + s.name = schema + yield s + else: + yield df + + finally: + temp_dir.cleanup() + + +def _process_chunks( + readers: list[ParquetStreamReader], + func: Callable[..., Any], + requested_types: tuple[str], + static_args: list[Any], + static_kwargs: dict[str, Any], + non_data_output: str, + non_data_proc: Callable[..., Any] | None, + non_data_proc_args: tuple[Any] | None, + non_data_proc_kwargs: dict[str, Any] | None, +): + """Process chunks.""" + # State variables + temp_dirs = None + schemas = None + output_non_data: dict[int, list[Any]] = {} + chunk_counter: int = 0 + + for items in zip(*readers): + + if not isinstance(items[0], requested_types): + raise TypeError( + f"Unsupported data type in Iterable {items[0]}: {type(items[0])}" + f"Requested types are: {requested_types} " + ) + + result = func(*items, *static_args, **static_kwargs) + if not isinstance(result, tuple): + result = (result,) + + # Sort outputs + capture_meta = non_data_output == "acc" or chunk_counter == 0 + + data, meta = _sort_chunk_outputs(result, capture_meta, requested_types) + + for i, meta in enumerate(meta): + output_non_data.setdefault(i, []).append(meta) + + # Write DataFrames + if data: + if temp_dirs is None: + temp_dirs, schemas = _initialize_storage(data) + + _write_chunks_to_disk(data, temp_dirs, chunk_counter) + + chunk_counter += 1 + + if chunk_counter == 0: + raise ValueError("Iterable is empty.") + + keys = list(output_non_data.keys()) + if len(keys) == 1: + output_non_data = output_non_data[keys[0]] + + if isinstance(output_non_data, list) and len(output_non_data) == 1: + output_non_data = output_non_data[0] + + if isinstance(non_data_proc, Callable): + output_non_data = non_data_proc( + output_non_data, *non_data_proc_args, **non_data_proc_kwargs + ) + + # If no data outputs at all + if temp_dirs is None: + return output_non_data + + final_iterators = [ + ParquetStreamReader(lambda d=d, t=t, s=s: _parquet_generator(d, t, s)) + for d, (t, s) in zip(temp_dirs, schemas) + ] + + if isinstance(output_non_data, tuple): + output_non_data = list(output_non_data) + else: + output_non_data = [output_non_data] + + return tuple(final_iterators + output_non_data) + + +def _prepare_readers( + reader: Iterator[pd.DataFrame | pd.Series], + func_args: Sequence[Any], + func_kwargs: dict[str, Any], + makecopy: bool, +) -> tuple[list[ParquetStreamReader], list[Any], dict[str, Any]]: + """Prepare readers for chunking.""" + reader = ensure_parquet_reader(reader) + + args_reader = [] + args = [] + for arg in func_args: + converted = ensure_parquet_reader(arg) + if 
isinstance(converted, ParquetStreamReader): + args_reader.append(converted) + else: + args.append(converted) + + kwargs = {} + for k, v in func_kwargs.items(): + converted = ensure_parquet_reader(v) + if isinstance(converted, ParquetStreamReader): + args_reader.append(converted) + else: + kwargs[k] = converted + + readers = [reader] + args_reader + + if makecopy: + readers = [r.copy() for r in readers] + + return readers, args, kwargs + + +def parquet_stream_from_iterable( + iterable: Iterable[pd.DataFrame | pd.Series], +) -> ParquetStreamReader: + """ + Stream an iterable of DataFrame/Series to parquet + and return a disk-backed ParquetStreamReader. + + Memory usage remains constant. + """ + iterator = iter(iterable) + + try: + first = next(iterator) + except StopIteration: + raise ValueError("Iterable is empty.") + + if not isinstance(first, (pd.DataFrame, pd.Series)): + raise TypeError("Iterable must contain pd.DataFrame or pd.Series objects.") + + temp_dir = tempfile.TemporaryDirectory() + temp_dirs = [temp_dir] + + if isinstance(first, pd.DataFrame): + data_type = pd.DataFrame + schema = first.columns + else: + data_type = pd.Series + schema = first.name + _write_chunks_to_disk([first], temp_dirs, chunk_counter=0) + + for idx, chunk in enumerate(iterator, start=1): + if not isinstance(chunk, type(first)): + raise TypeError("All chunks must be of the same type.") + + _write_chunks_to_disk([chunk], temp_dirs, chunk_counter=idx) + + return ParquetStreamReader(lambda: _parquet_generator(temp_dir, data_type, schema)) + + +def is_valid_iterator(reader: Any) -> bool: + """Check if reader is a valid Iterable.""" + return isinstance(reader, Iterator) + + +def ensure_parquet_reader(obj: Any) -> Any: + """Ensure obj is a ParquetStreamReader.""" + if isinstance(obj, ParquetStreamReader): + return obj + + if is_valid_iterator(obj): + return parquet_stream_from_iterable(obj) + + return obj + + +def process_disk_backed( + reader: Iterator[pd.DataFrame | pd.Series], + func: Callable[..., Any], + func_args: Sequence[Any] | None = None, + func_kwargs: dict[str, Any] | None = None, + requested_types: type | tuple[type, ...] = (pd.DataFrame, pd.Series), + non_data_output: Literal["first", "acc"] = "first", + non_data_proc: Callable[..., Any] | None = None, + non_data_proc_args: tuple[Any] | None = None, + non_data_proc_kwargs: dict[str, Any] | None = None, + makecopy: bool = True, +) -> tuple[Any, ...]: + """ + Consumes a stream of DataFrames, processes them, and returns a tuple of + results. DataFrames are cached to disk (Parquet) and returned as generators. 
+ """ + if func_args is None: + func_args = () + if func_kwargs is None: + func_kwargs = {} + + if not isinstance(requested_types, (list, tuple)): + requested_types = (requested_types,) + + readers, static_args, static_kwargs = _prepare_readers( + reader, func_args, func_kwargs, makecopy + ) + + if non_data_proc is not None: + if not isinstance(non_data_proc, Callable): + raise ValueError(f"Function {non_data_proc} is not callable.") + + if non_data_proc_args is None: + non_data_proc_args = () + if non_data_proc_kwargs is None: + non_data_proc_kwargs = {} + + return _process_chunks( + readers, + func, + requested_types, + static_args, + static_kwargs, + non_data_output, + non_data_proc, + non_data_proc_args, + non_data_proc_kwargs, + ) + + +def _process_function(results, data_only=False): + if not isinstance(results, ProcessFunction): + return results + + data = results.data + func = results.func + args = results.func_args + kwargs = results.func_kwargs + + if isinstance(data, (pd.DataFrame, pd.Series, xr.Dataset, xr.DataArray)): + return func(data, *args, **kwargs) + + if is_valid_iterator(data) and not isinstance(data, ParquetStreamReader): + data = parquet_stream_from_iterable(data) + + if isinstance(data, (list, tuple)): + data = parquet_stream_from_iterable(data) + + if not isinstance(data, ParquetStreamReader): + raise TypeError(f"Unsupported data type: {type(data)}") + + result = process_disk_backed( + data, + func, + func_args=args, + func_kwargs=kwargs, + **results.kwargs, + ) + + if data_only is True: + result = result[0] + + return result + + +def process_function(data_only=False, postprocessing=None): + """Decorator to apply function to both pd.DataFrame and Iterable[pd.DataFrame].""" + + def decorator(func): + sig = inspect.signature(func) + + @wraps(func) + def wrapper(*args, **kwargs): + bound_args = sig.bind(*args, **kwargs) + bound_args.apply_defaults() + original_call = bound_args.arguments.copy() + + result_class = func(*args, **kwargs) + results = _process_function( + result_class, + data_only=data_only, + ) + + if postprocessing is None: + return results + + postproc_func = postprocessing.get("func") + if not isinstance(postproc_func, Callable): + raise ValueError(f"Function {postproc_func} is not callable.") + postproc_list = postprocessing.get("kwargs", {}) + if isinstance(postproc_list, str): + postproc_list = [postproc_list] + + postproc_kwargs = {k: original_call[k] for k in postproc_list} + + result_list = [] + for result in results: + if isinstance(result, (pd.DataFrame, pd.Series, ParquetStreamReader)): + result = postproc_func(result, **postproc_kwargs) + result_list.append(result) + + return tuple(result_list) + + return wrapper + + return decorator diff --git a/cdm_reader_mapper/common/pandas_TextParser_hdlr.py b/cdm_reader_mapper/common/pandas_TextParser_hdlr.py deleted file mode 100755 index 8a61d825..00000000 --- a/cdm_reader_mapper/common/pandas_TextParser_hdlr.py +++ /dev/null @@ -1,160 +0,0 @@ -"""Utilities for handling pandas TextParser objects safely.""" - -from __future__ import annotations - -import pandas as pd -from pandas.io.parsers import TextFileReader -from io import StringIO -import logging - -logger = logging.getLogger(__name__) - -READ_CSV_KWARGS = [ - "chunksize", - "names", - "dtype", - "parse_dates", - "date_parser", - "infer_datetime_format", - "delimiter", - "quotechar", - "escapechar", - "skip_blank_lines", -] - - -def _get_raw_buffer(parser: TextFileReader) -> str | None: - if hasattr(parser, "_raw_buffer"): - return 
parser._raw_buffer - - f = getattr(parser.handles, "handle", None) - if f is None: - raise ValueError("TextFileReader has no accessible handle for copying.") - - try: - f = parser.handles.handle - raw = f.getvalue() - parser._raw_buffer = raw - return raw - except Exception as e: - raise RuntimeError("Failed to read raw buffer") from e - - -def _new_reader_from_buffer(parser: TextFileReader) -> TextFileReader | None: - raw = _get_raw_buffer(parser) - if raw is None: - return None - - read_dict = read_dict = { - k: parser.orig_options.get(k) - for k in READ_CSV_KWARGS - if k in parser.orig_options - } - return pd.read_csv(StringIO(raw), **read_dict) - - -def make_copy(parser: TextFileReader) -> TextFileReader | None: - """ - Create a duplicate of a pandas TextFileReader object. - - Parameters - ---------- - Parser : pandas.io.parsers.TextFileReader - The TextFileReader whose state will be copied. - - Returns - ------- - pandas.io.parsers.TextFileReader or None - A new TextFileReader with identical content and read options, - or None if copying fails. - - Notes - ----- - - The source handle must support `.getvalue()`, meaning this works - only for in-memory file-like objects such as `StringIO`. - """ - try: - return _new_reader_from_buffer(parser) - except Exception as e: - raise RuntimeError(f"Failed to copy TextParser: {e}") from e - - -def restore(parser: TextFileReader) -> TextFileReader | None: - """ - Restore a TextFileReader to its initial read position and state. - - Parameters - ---------- - Parser : pandas.io.parsers.TextFileReader - The TextFileReader to restore. - - Returns - ------- - pandas.io.parsers.TextFileReader or None - Restored TextFileReader, or None if restoration fails. - """ - return make_copy(parser) - - -def is_not_empty(parser: TextFileReader) -> bool | None: - """ - Determine whether a TextFileReader contains at least one row. - - Parameters - ---------- - Parser : pandas.io.parsers.TextFileReader - The parser to inspect. - - Returns - ------- - bool or None - True if not empty. - False if empty. - None if an error occurs. - """ - if hasattr(parser, "_is_not_empty"): - return parser._is_not_empty - - reader = make_copy(parser) - if reader is None: - return None - - try: - chunk = next(reader) - result = not chunk.empty - parser._is_not_empty = result - return result - except StopIteration: - parser._is_not_empty = False - return False - - -def get_length(parser: TextFileReader) -> int | None: - """ - Count total rows in a TextFileReader (consuming a copied stream). - - Parameters - ---------- - Parser : pandas.io.parsers.TextFileReader - The parser to measure. - - Returns - ------- - int or None - Total number of rows, or None if processing fails. - """ - if hasattr(parser, "_row_count"): - return parser._row_count - - reader = make_copy(parser) - if reader is None: - return None - - total = 0 - try: - for chunk in reader: - total += len(chunk) - parser._row_count = total - return total - except Exception as e: - raise RuntimeError("Failed while counting rows") from e diff --git a/cdm_reader_mapper/common/replace.py b/cdm_reader_mapper/common/replace.py index 058bf121..24250ee5 100755 --- a/cdm_reader_mapper/common/replace.py +++ b/cdm_reader_mapper/common/replace.py @@ -19,12 +19,14 @@ from __future__ import annotations +from typing import Iterable + import pandas as pd -from . 
import logging_hdlr
+from .iterators import ParquetStreamReader, ProcessFunction, process_function
 
 
-def replace_columns(
+def _replace_columns(
     df_l: pd.DataFrame,
     df_r: pd.DataFrame,
     pivot_c: str | None = None,
@@ -32,73 +34,35 @@
     pivot_r: str | None = None,
     rep_c: str | list[str] | None = None,
     rep_map: dict[str, str] | None = None,
-    log_level: str = "INFO",
-) -> pd.DataFrame | None:
-    """
-    Replace columns in one DataFrame using row-matching from another.
-
-    Parameters
-    ----------
-    df_l : pandas.DataFrame
-        The left DataFrame whose columns will be replaced.
-    df_r : pandas.DataFrame
-        The right DataFrame providing replacement values.
-    pivot_c : str, optional
-        A single pivot column present in both DataFrames.
-        Overrides `pivot_l` and `pivot_r`.
-    pivot_l : str, optional
-        Pivot column in `df_l`. Used only when `pivot_c` is not supplied.
-    pivot_r : str, optional
-        Pivot column in `df_r`. Used only when `pivot_c` is not supplied.
-    rep_c : str or list of str, optional
-        One or more column names to replace in `df_l`.
-        Ignored if `rep_map` is supplied.
-    rep_map : dict, optional
-        Mapping between left and right column names as `{left_col: right_col}`.
-    log_level : str, optional
-        Logging level to use.
-
-    Returns
-    -------
-    pandas.DataFrame or None
-        Updated DataFrame with replacements applied, or `None` if validation fails.
-
-    Notes
-    -----
-    This function logs errors and returns `None` instead of raising exceptions.
-    """
-    logger = logging_hdlr.init_logger(__name__, level=log_level)
-
+):
+    """Helper function to replace columns in DataFrame."""
     # Check inargs
     if not isinstance(df_l, pd.DataFrame) or not isinstance(df_r, pd.DataFrame):
-        logger.error("Input left and right data must be pandas DataFrames.")
-        return None
+        raise TypeError("Input left and right data must be pandas DataFrames.")
 
     if pivot_c is not None:
         pivot_l = pivot_r = pivot_c
 
     if pivot_l is None or pivot_r is None:
-        logger.error(
+        raise ValueError(
             "Pivot columns must be declared using `pivot_c` or both `pivot_l` and `pivot_r`."
         )
-        return None
 
     if rep_map is None:
         if rep_c is None:
-            logger.error(
+            raise ValueError(
                 "Replacement columns must be declared using `rep_c` or `rep_map`."
             )
-            return None
+
         if isinstance(rep_c, str):
             rep_c = [rep_c]
         rep_map = {col: col for col in rep_c}
 
     missing_cols = [src for src in rep_map.values() if src not in df_r.columns]
     if missing_cols:
-        logger.error(
+        raise ValueError(
             f"Replacement source columns not found in right DataFrame: {missing_cols}."
         )
-        return None
 
     out = df_l.copy()
     right_lookup = (
@@ -115,3 +79,72 @@
         out[col] = aligned[col].values
 
     return out
+
+
+def replace_columns(
+    df_l: pd.DataFrame | Iterable[pd.DataFrame],
+    df_r: pd.DataFrame | Iterable[pd.DataFrame],
+    pivot_c: str | None = None,
+    pivot_l: str | None = None,
+    pivot_r: str | None = None,
+    rep_c: str | list[str] | None = None,
+    rep_map: dict[str, str] | None = None,
+) -> pd.DataFrame | ParquetStreamReader:
+    """
+    Replace columns in one DataFrame using row-matching from another.
+
+    Parameters
+    ----------
+    df_l : pandas.DataFrame or Iterable[pd.DataFrame]
+        The left DataFrame whose columns will be replaced.
+    df_r : pandas.DataFrame or Iterable[pd.DataFrame]
+        The right DataFrame providing replacement values.
+    pivot_c : str, optional
+        A single pivot column present in both DataFrames.
+        Overrides `pivot_l` and `pivot_r`.
+    pivot_l : str, optional
+        Pivot column in `df_l`. Used only when `pivot_c` is not supplied. 
+    pivot_r : str, optional
+        Pivot column in `df_r`. Used only when `pivot_c` is not supplied.
+    rep_c : str or list of str, optional
+        One or more column names to replace in `df_l`.
+        Ignored if `rep_map` is supplied.
+    rep_map : dict, optional
+        Mapping between left and right column names as `{left_col: right_col}`.
+
+    Returns
+    -------
+    pandas.DataFrame or ParquetStreamReader
+        Updated DataFrame with replacements applied.
+
+    Notes
+    -----
+    This function raises an error instead of returning `None` when validation fails.
+    """
+
+    @process_function(data_only=True)
+    def _replace_columns_hlp():
+        return ProcessFunction(
+            data=df_l,
+            func=_replace_columns,
+            func_args=(df_r,),
+            func_kwargs={
+                "pivot_c": pivot_c,
+                "pivot_l": pivot_l,
+                "pivot_r": pivot_r,
+                "rep_c": rep_c,
+                "rep_map": rep_map,
+            },
+            makecopy=False,
+        )
+
+    result = _replace_columns_hlp()
+
+    if isinstance(result, pd.DataFrame):
+        return pd.DataFrame(result)
+    elif isinstance(result, ParquetStreamReader):
+        return result
+
+    raise ValueError(
+        f"result must be a pd.DataFrame or ParquetStreamReader, not {type(result)}."
+    )
diff --git a/cdm_reader_mapper/common/select.py b/cdm_reader_mapper/common/select.py
index fb1332e3..0cd15b5c 100755
--- a/cdm_reader_mapper/common/select.py
+++ b/cdm_reader_mapper/common/select.py
@@ -8,20 +8,33 @@
 """
 from __future__ import annotations
 
-from io import StringIO
-from typing import Iterable, Callable
+from typing import Iterable
 
 import pandas as pd
 
+from .iterators import ParquetStreamReader, ProcessFunction, process_function
+
+
+def _concat_indexes(idx_dict):
+    selected_idx = pd.Index([]).append(idx_dict[0])
+    rejected_idx = pd.Index([]).append(idx_dict[1])
+    selected_idx = selected_idx.drop_duplicates()
+    rejected_idx = rejected_idx.drop_duplicates()
+    return selected_idx, rejected_idx
+
+
+def _reset_index(data, reset_index=False):
+    if reset_index is False:
+        return data
+    return data.reset_index(drop=True)
+
 
 def _split_df(
     df: pd.DataFrame,
     mask: pd.DataFrame,
-    reset_index: bool = False,
     inverse: bool = False,
     return_rejected: bool = False,
 ):
-
     if inverse:
         selected = df[~mask]
         rejected = df[mask] if return_rejected else df.iloc[0:0]
@@ -29,14 +42,9 @@
         selected = df[mask]
         rejected = df[~mask] if return_rejected else df.iloc[0:0]
 
-    selected.attrs["_prev_index"] = mask.index[mask]
-    rejected.attrs["_prev_index"] = mask.index[~mask]
-
-    if reset_index:
-        selected = selected.reset_index(drop=True)
-        rejected = rejected.reset_index(drop=True)
-
-    return selected, rejected
+    selected_idx = mask.index[mask]
+    rejected_idx = mask.index[~mask]
+    return selected, rejected, selected_idx, rejected_idx
 
 
 def _split_by_boolean_df(df: pd.DataFrame, mask: pd.DataFrame, boolean: bool, **kwargs):
@@ -55,6 +63,7 @@
     **kwargs,
 ):
     mask_sel = df[col].isin(values)
+    mask_sel.name = col
     return _split_df(df=df, mask=mask_sel, **kwargs)
 
 
@@ -66,135 +75,37 @@
 ):
     index = pd.Index(index if isinstance(index, Iterable) else [index])
     mask_sel = pd.Series(df.index.isin(index), index=df.index)
-
     return _split_df(df=df, mask=mask_sel, **kwargs)
 
 
-def _split_text_reader(
-    reader,
-    func: Callable,
-    *args,
-    reset_index=False,
-    inverse=False,
-    return_rejected=False,
-):
-    buffer_sel = StringIO()
-    buffer_rej = StringIO()
-
-    read_params = [
-        "chunksize",
-        "names",
-        "dtype",
-        "parse_dates",
-        "date_parser",
-        "infer_datetime_format",
-    ]
-
-    write_dict = {"header": None, "mode": "a", "index": not reset_index}
-    read_dict = {x: reader.orig_options.get(x) for x in 
read_params} - - new_args = [] - new_readers = [] - - prev_index_sel = None - prev_index_rej = None - - for d in args: - if isinstance(d, pd.io.parsers.TextFileReader): - new_readers.append(d) - else: - new_args.append(d) - - readers = [reader] + new_readers - - for zipped in zip(*readers): - - if not isinstance(zipped, tuple): - zipped = tuple(zipped) - - sel, rej = func( - *zipped, - *new_args, - reset_index=reset_index, - inverse=inverse, - return_rejected=return_rejected, - ) - - sel_prev_index = sel.attrs["_prev_index"] - - if prev_index_sel is None: - prev_index_sel = sel_prev_index - else: - prev_index_sel = prev_index_sel.union(sel_prev_index) - - rej_prev_index = rej.attrs["_prev_index"] - - if prev_index_rej is None: - prev_index_rej = rej_prev_index - else: - prev_index_rej = prev_index_rej.union(rej_prev_index) - - sel.to_csv(buffer_sel, **write_dict) - if return_rejected: - rej.to_csv(buffer_rej, **write_dict) - - dtypes = {} - for col, dtype in sel.dtypes.items(): - if dtype == "object": - dtype = "str" - dtypes[col] = dtype - - read_dict["dtype"] = dtypes - - buffer_sel.seek(0) - buffer_rej.seek(0) - - selected = pd.read_csv(buffer_sel, **read_dict) - rejected = pd.read_csv(buffer_rej, **read_dict) - - selected.attrs = {"_prev_index": prev_index_sel} - rejected.attrs = {"_prev_index": prev_index_rej} - - return selected, rejected - - -def _split_dispatch( - data, - func: Callable, - *args, - **kwargs, -): - - if isinstance(data, pd.DataFrame): - return func(data, *args, **kwargs) - - if isinstance(data, pd.io.parsers.TextFileReader): - return _split_text_reader( - data, - func, - *args, - **kwargs, - ) - - raise TypeError("Unsupported input type for split operation.") +PSR_KWARGS = { + "makecopy": False, + "non_data_output": "acc", + "non_data_proc": _concat_indexes, +} def split_by_boolean( - data: pd.DataFrame, - mask: pd.DataFrame, + data: pd.DataFrame | Iterable[pd.DataFrame], + mask: pd.DataFrame | Iterable[pd.DataFrame], boolean: bool, reset_index: bool = False, inverse: bool = False, return_rejected: bool = False, -) -> tuple[pd.DataFrame, pd.DataFrame]: +) -> tuple[ + pd.DataFrame | ParquetStreamReader, + pd.DataFrame | ParquetStreamReader, + pd.Index | pd.MultiIndex, + pd.Index | pd.MultiIndex, +]: """ Split a DataFrame using a boolean mask via ``split_dataframe_by_boolean``. Parameters ---------- - data : pandas.DataFrame + data : pandas.DataFrame or Iterable[pd.DataFrame] DataFrame to be split. - mask : pandas.DataFrame + mask : pandas.DataFrame or Iterable[pd.DataFrame] Boolean mask with the same length as ``data``. boolean : bool Determines mask interpretation: @@ -211,19 +122,23 @@ def split_by_boolean( Returns ------- - (pandas.DataFrame, pandas.DataFrame) - Tuple ``(selected, rejected)`` returned by the underlying - ``split_dataframe_by_boolean`` implementation. + (pandas.DataFrame or ParquetStreamReader, pandas.DataFrame or ParquetStreamReader, pd.Index or pd.MultiIndex, pd.Index or pd.MultiIndex) + Selected rows (all mask columns True), rejected rows, original indexes of selection and + original indexes of rejection. 
""" - return _split_dispatch( - data, - _split_by_boolean_df, - mask, - boolean, - reset_index=reset_index, - inverse=inverse, - return_rejected=return_rejected, - ) + + @process_function(postprocessing={"func": _reset_index, "kwargs": "reset_index"}) + def _split_by_boolean(reset_index=reset_index): + return ProcessFunction( + data=data, + func=_split_by_boolean_df, + func_args=(mask, boolean), + func_kwargs={"inverse": inverse, "return_rejected": return_rejected}, + **PSR_KWARGS, + ) + + result = _split_by_boolean() + return tuple(result) def split_by_boolean_true( @@ -232,7 +147,12 @@ def split_by_boolean_true( reset_index: bool = False, inverse: bool = False, return_rejected: bool = False, -) -> tuple[pd.DataFrame, pd.DataFrame]: +) -> tuple[ + pd.DataFrame | ParquetStreamReader, + pd.DataFrame | ParquetStreamReader, + pd.Index | pd.MultiIndex, + pd.Index | pd.MultiIndex, +]: """ Split rows where all mask columns are ``True``. @@ -247,12 +167,14 @@ def split_by_boolean_true( inverse : bool, optional If ``True``, invert the selection. return_rejected : bool, optional - If ``True``, also return rejected rows. + If ``True``, return rejected rows as the second output. + If ``False``, the rejected output is empty but dtype-preserving. Returns ------- - (pandas.DataFrame, pandas.DataFrame) - Selected rows (all mask columns True) and rejected rows. + (pandas.DataFrame or ParquetStreamReader, pandas.DataFrame or ParquetStreamReader, pd.Index or pd.MultiIndex, pd.Index or pd.MultiIndex) + Selected rows (all mask columns True), rejected rows, original indexes of selection and + original indexes of rejection. """ return split_by_boolean( data, @@ -270,7 +192,12 @@ def split_by_boolean_false( reset_index: bool = False, inverse: bool = False, return_rejected: bool = False, -) -> tuple[pd.DataFrame, pd.DataFrame]: +) -> tuple[ + pd.DataFrame | ParquetStreamReader, + pd.DataFrame | ParquetStreamReader, + pd.Index | pd.MultiIndex, + pd.Index | pd.MultiIndex, +]: """ Split rows where at least one mask column is ``False``. @@ -285,12 +212,14 @@ def split_by_boolean_false( inverse : bool, optional If ``True``, invert the selection. return_rejected : bool, optional - If ``True``, return rejected rows as well. + If ``True``, return rejected rows as the second output. + If ``False``, the rejected output is empty but dtype-preserving. Returns ------- - (pandas.DataFrame, pandas.DataFrame) - Selected rows (any mask column False) and rejected rows. + (pandas.DataFrame or ParquetStreamReader, pandas.DataFrame or ParquetStreamReader, pd.Index or pd.MultiIndex, pd.Index or pd.MultiIndex) + Selected rows (all mask columns True), rejected rows, original indexes of selection and + original indexes of rejection. """ return split_by_boolean( data, @@ -308,7 +237,12 @@ def split_by_column_entries( reset_index: bool = False, inverse: bool = False, return_rejected: bool = False, -) -> tuple[pd.DataFrame, pd.DataFrame]: +) -> tuple[ + pd.DataFrame | ParquetStreamReader, + pd.DataFrame | ParquetStreamReader, + pd.Index | pd.MultiIndex, + pd.Index | pd.MultiIndex, +]: """ Split a DataFrame based on matching values in a given column. @@ -324,23 +258,29 @@ def split_by_column_entries( inverse : bool, optional If ``True``, invert the selection. return_rejected : bool, optional - If ``True``, return rejected rows as the second DataFrame. + If ``True``, return rejected rows as the second output. + If ``False``, the rejected output is empty but dtype-preserving. 
Returns ------- - (pandas.DataFrame, pandas.DataFrame) - Selected rows (column value in provided list) and rejected rows. + (pandas.DataFrame or ParquetStreamReader, pandas.DataFrame or ParquetStreamReader, pd.Index or pd.MultiIndex, pd.Index or pd.MultiIndex) + Selected rows (all mask columns True), rejected rows, original indexes of selection and + original indexes of rejection. """ + + @process_function(postprocessing={"func": _reset_index, "kwargs": "reset_index"}) + def _split_by_column_entries(reset_index=reset_index): + return ProcessFunction( + data=data, + func=_split_by_column_df, + func_args=(col, values), + func_kwargs={"inverse": inverse, "return_rejected": return_rejected}, + **PSR_KWARGS, + ) + col, values = next(iter(selection.items())) - return _split_dispatch( - data, - _split_by_column_df, - col, - values, - reset_index=reset_index, - inverse=inverse, - return_rejected=return_rejected, - ) + result = _split_by_column_entries() + return tuple(result) def split_by_index( @@ -349,7 +289,12 @@ def split_by_index( reset_index: bool = False, inverse: bool = False, return_rejected: bool = False, -) -> tuple[pd.DataFrame, pd.DataFrame]: +) -> tuple[ + pd.DataFrame | ParquetStreamReader, + pd.DataFrame | ParquetStreamReader, + pd.Index | pd.MultiIndex, + pd.Index | pd.MultiIndex, +]: """ Split a DataFrame by selecting specific index labels. @@ -364,18 +309,25 @@ def split_by_index( inverse : bool, optional If ``True``, select rows **not** in ``index``. return_rejected : bool, optional - If ``True``, return rejected rows as well. + If ``True``, return rejected rows as the second output. + If ``False``, the rejected output is empty but dtype-preserving. Returns ------- - (pandas.DataFrame, pandas.DataFrame) - Selected rows (index in given list) and rejected rows. + (pandas.DataFrame or ParquetStreamReader, pandas.DataFrame or ParquetStreamReader, pd.Index or pd.MultiIndex, pd.Index or pd.MultiIndex) + Selected rows (all mask columns True), rejected rows, original indexes of selection and + original indexes of rejection. 
""" - return _split_dispatch( - data, - _split_by_index_df, - index, - reset_index=reset_index, - inverse=inverse, - return_rejected=return_rejected, - ) + + @process_function(postprocessing={"func": _reset_index, "kwargs": "reset_index"}) + def _split_by_index(reset_index=reset_index): + return ProcessFunction( + data=data, + func=_split_by_index_df, + func_args=(index,), + func_kwargs={"inverse": inverse, "return_rejected": return_rejected}, + **PSR_KWARGS, + ) + + result = _split_by_index() + return tuple(result) diff --git a/cdm_reader_mapper/core/_utilities.py b/cdm_reader_mapper/core/_utilities.py index 29d603df..266d5d2b 100755 --- a/cdm_reader_mapper/core/_utilities.py +++ b/cdm_reader_mapper/core/_utilities.py @@ -2,28 +2,35 @@ from __future__ import annotations +from typing import Iterable, Literal + from copy import deepcopy import numpy as np import pandas as pd -from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy - from cdm_reader_mapper.common import ( get_length, ) -from io import StringIO as StringIO +from cdm_reader_mapper.common.iterators import ( + ParquetStreamReader, + process_disk_backed, + is_valid_iterator, + parquet_stream_from_iterable, +) def _copy(value): """Make copy of value""" if isinstance(value, dict): return deepcopy(value) - elif isinstance(value, pd.DataFrame): + elif isinstance(value, (pd.DataFrame, pd.Series)): + return value.copy() + elif isinstance(value, ParquetStreamReader): + return value.copy() + elif hasattr(value, "copy"): return value.copy() - elif isinstance(value, pd.io.parsers.TextFileReader): - return make_copy(value) return value @@ -38,58 +45,82 @@ def method(attr_func, *args, **kwargs): def reader_method(DataBundle, data, attr, *args, **kwargs): - """Handles operations on chunked DataFrame (TextFileReader).""" - data_buffer = StringIO() - TextParser = make_copy(data) - read_params = [ - "chunksize", - "parse_dates", - "date_parser", - "infer_datetime_format", - ] - write_dict = {"header": None, "mode": "a", "index": True} - read_dict = {x: TextParser.orig_options.get(x) for x in read_params} - inplace = kwargs.get("inplace", False) - for df_ in TextParser: - attr_func = getattr(df_, attr) - result_df = method(attr_func, *args, **kwargs) - if result_df is None: - result_df = df_ - result_df.to_csv(data_buffer, **write_dict) - dtypes = {} - for k, v in result_df.dtypes.items(): - if v == "object": - v = "str" - dtypes[k] = v - read_dict["dtype"] = dtypes - read_dict["names"] = result_df.columns - data_buffer.seek(0) - TextParser = pd.read_csv(data_buffer, **read_dict) + """ + Handles operations on chunked data (ParquetStreamReader). + Uses process_disk_backed to stream processing without loading into RAM. + """ + inplace = kwargs.pop("inplace", False) + + # Define the transformation function to apply per chunk + def apply_operation(df): + # Fetch the attribute (method or property) from the chunk + attr_obj = getattr(df, attr) + + # Use the 'method' helper to execute it (call or subscript) + result = method(attr_obj, *args, **kwargs) + + # If the operation was inplace on the DataFrame (returns None), yield the modified DataFrame itself. 
+ if result is None: + return df + return result + + # Process stream using Disk-Backed Parquet Engine + result_tuple = process_disk_backed( + data, + apply_operation, + makecopy=True, + ) + + # The result is a tuple: (ParquetStreamReader, [extra_outputs]) + new_reader = result_tuple[0] + + # Handle inplace logic if inplace: - DataBundle._data = TextParser - return - return TextParser + DataBundle._data = new_reader + return None + + return new_reader -def combine_attribute_values(attr_func, TextParser, attr): - """Collect values of the attribute across all chunks and combine them.""" - combined_values = [attr_func] - for chunk in TextParser: +def combine_attribute_values(first_value, iterator, attr): + """ + Collect values of an attribute across all chunks and combine them. + + Parameters + ---------- + first_value : Any + The value from the first chunk (already read). + iterator : Iterator/ParquetStreamReader + The stream positioned at the second chunk. + attr : str + The attribute name to fetch from remaining chunks. + """ + combined_values = [first_value] + + # Iterate through the rest of the stream + for chunk in iterator: combined_values.append(getattr(chunk, attr)) - if isinstance(attr_func, pd.Index): - combined_index = combined_values[0] + # Logic to merge results based on type + if isinstance(first_value, pd.Index): + combined_index = first_value for idx in combined_values[1:]: combined_index = combined_index.union(idx) return combined_index - if isinstance(attr_func, (int, float)): + + if isinstance(first_value, (int, float)): return sum(combined_values) - if isinstance(attr_func, tuple) and len(attr_func) == 2: + + if isinstance(first_value, tuple) and len(first_value) == 2: + # Tuple usually implies shape (rows, cols) + # Sum rows (0), keep cols (1) constant first_ = sum(value[0] for value in combined_values) - second_ = attr_func[1] + second_ = first_value[1] return (first_, second_) - if isinstance(attr_func, (list, np.ndarray)): + + if isinstance(first_value, (list, np.ndarray)): return np.concatenate(combined_values) + return combined_values @@ -117,15 +148,35 @@ class _DataBundle: def __init__( self, - data=pd.DataFrame(), - columns=None, - dtypes=None, - parse_dates=None, - encoding=None, - mask=pd.DataFrame(), - imodel=None, - mode="data", + data: pd.DataFrame | Iterable[pd.DataFrame] | None = None, + columns: pd.Index | pd.MultiIndex | list | None = None, + dtypes: pd.Series | dict | None = None, + parse_dates: list | bool | None = None, + encoding: str | None = None, + mask: pd.DataFrame | Iterable[pd.DataFrame] | None = None, + imodel: str | None = None, + mode: Literal["data", "tables"] = "data", ): + if data is None: + data = pd.DataFrame(columns=columns, dtype=dtypes) + if mask is None: + mask = mask or pd.DataFrame(columns=data.columns, dtype=bool) + + if mode not in ["data", "tables"]: + raise ValueError( + f"'mode' {mode} is not valid, use one of ['data', 'tables']." 
+ ) + + if ( + is_valid_iterator(data) and not isinstance(data, ParquetStreamReader) + ) or isinstance(data, (list, tuple)): + data = parquet_stream_from_iterable(data) + + if ( + is_valid_iterator(mask) and not isinstance(mask, ParquetStreamReader) + ) or isinstance(mask, (list, tuple)): + mask = parquet_stream_from_iterable(mask) + self._data = data self._columns = columns self._dtypes = dtypes @@ -151,19 +202,45 @@ def __getattr__(self, attr): if not callable(attr_func): return attr_func return SubscriptableMethod(attr_func) - elif isinstance(data, pd.io.parsers.TextFileReader): - def wrapped_reader_method(*args, **kwargs): - return reader_method(self, data, attr, *args, **kwargs) + if isinstance(data, ParquetStreamReader): + # This allows db.read(), db.close(), db.get_chunk() to work + if hasattr(data, attr): + return getattr(data, attr) + + data = data.copy() + + try: + first_chunk = data.get_chunk() + except ValueError: + raise ValueError("Cannot access attribute on empty data stream.") + + if not hasattr(first_chunk, attr): + # Restore state before raising error + data.prepend(first_chunk) + raise AttributeError(f"DataFrame chunk has no attribute '{attr}'.") + + attr_value = getattr(first_chunk, attr) + + if callable(attr_value): + # METHOD CALL (e.g., .dropna(), .fillna()) + # Put the chunk BACK so the reader_method sees the full stream. + data.prepend(first_chunk) + + def wrapped_reader_method(*args, **kwargs): + return reader_method(self, data, attr, *args, **kwargs) - TextParser = make_copy(data) - first_chunk = next(TextParser) - attr_func = getattr(first_chunk, attr) - if callable(attr_func): return SubscriptableMethod(wrapped_reader_method) - return combine_attribute_values(attr_func, TextParser, attr) + else: + # PROPERTY ACCESS (e.g., .shape, .dtypes) + # DO NOT put the chunk back yet. Pass the 'first_value' + # and the 'data' iterator (which is now at chunk 2) to the combiner. + # The combiner will consume the rest. + return combine_attribute_values(attr_value, data, attr) - raise TypeError("'data' is neither a DataFrame nor a TextFileReader object.") + raise TypeError( + f"'data' is {type(data)}, expected DataFrame or ParquetStreamReader." + ) def __repr__(self) -> str: """Return a string representation for :py:attr:`data`.""" @@ -274,22 +351,26 @@ def _stack(self, other, datasets, inplace, **kwargs): other = [other] if not isinstance(datasets, list): datasets = [datasets] + for data in datasets: - _data = f"_{data}" - _df = getattr(db_, _data) if hasattr(db_, _data) else pd.DataFrame() + data_ = f"_{data}" + df_ = getattr(db_, data_) if hasattr(db_, data_) else pd.DataFrame() - if isinstance(_df, pd.io.parsers.TextFileReader): - raise ValueError("Data must be a DataFrame not a TextFileReader.") + if is_valid_iterator(df_): + raise ValueError( + "Data must be a pd.DataFrame not a iterable of pd.DataFrames." 
+ ) to_concat = [ - getattr(concat, _data) for concat in other if hasattr(concat, _data) + getattr(concat, data_) for concat in other if hasattr(concat, data_) ] if not to_concat: continue - if not _df.empty: - to_concat = [_df] + to_concat - _df = pd.concat(to_concat, **kwargs) - _df = _df.reset_index(drop=True) - setattr(self, f"_{data}", _df) + if not df_.empty: + to_concat = [df_] + to_concat + + concatenated = pd.concat(to_concat, **kwargs) + concatenated = concatenated.reset_index(drop=True) + setattr(self, data_, concatenated) return self._return_db(db_, inplace) diff --git a/cdm_reader_mapper/core/databundle.py b/cdm_reader_mapper/core/databundle.py index c6336dc0..2ec16fa0 100755 --- a/cdm_reader_mapper/core/databundle.py +++ b/cdm_reader_mapper/core/databundle.py @@ -33,7 +33,7 @@ class DataBundle(_DataBundle): Parameters ---------- - data: pandas.DataFrame, optional + data: pd.DataFrame or Iterable[pd.DataFrame], optional MDF DataFrame. columns: pd.Index, pd.MultiIndex or list, optional Column labels of ``data`` @@ -140,7 +140,7 @@ def stack_v( Note ---- - * This is only working with DataFrames, not with TextFileReaders! + * This is only working with pd.DataFrames, not with iterables of pd.DataFrames! * The DataFrames in the :py:class:`~DataBundle` have to have the same data columns! Returns @@ -177,7 +177,7 @@ def stack_h( Note ---- - * This is only working with DataFrames, not with TextFileReaders! + * This is only working with pd.DataFrames, not with iterables of pd.DataFrames! * The DataFrames in the :py:class:`~DataBundle` may have different data columns! Examples @@ -237,10 +237,11 @@ def select_where_all_true( """ db_ = self._get_db(inplace) _mask = _copy(db_._mask) - db_._data = split_by_boolean_true(db_._data, _mask, **kwargs)[0] + db_._data, _, selected_idx, _ = split_by_boolean_true( + db_._data, _mask, **kwargs + ) if do_mask is True: - _prev_index = db_._data.attrs["_prev_index"] - db_._mask = split_by_index(db_._mask, _prev_index, **kwargs)[0] + db_._mask, _, _, _ = split_by_index(db_._mask, selected_idx, **kwargs) return self._return_db(db_, inplace) def select_where_all_false( @@ -285,10 +286,11 @@ def select_where_all_false( """ db_ = self._get_db(inplace) _mask = _copy(db_._mask) - db_._data = split_by_boolean_false(db_._data, _mask, **kwargs)[0] + db_._data, _, selected_idx, _ = split_by_boolean_false( + db_._data, _mask, **kwargs + ) if do_mask is True: - _prev_index = db_._data.attrs["_prev_index"] - db_._mask = split_by_index(db_._mask, _prev_index, **kwargs)[0] + db_._mask, _, _, _ = split_by_index(db_._mask, selected_idx, **kwargs) return self._return_db(db_, inplace) def select_where_entry_isin( @@ -337,10 +339,11 @@ def select_where_entry_isin( For more information see :py:func:`split_by_column_entries` """ db_ = self._get_db(inplace) - db_._data = split_by_column_entries(db_._data, selection, **kwargs)[0] + db_._data, _, selected_idx, _ = split_by_column_entries( + db_._data, selection, **kwargs + ) if do_mask is True: - _prev_index = db_._data.attrs["_prev_index"] - db_._mask = split_by_index(db_._mask, _prev_index, **kwargs)[0] + db_._mask, _, _, _ = split_by_index(db_._mask, selected_idx, **kwargs) return self._return_db(db_, inplace) def select_where_index_isin( @@ -386,10 +389,9 @@ def select_where_index_isin( For more information see :py:func:`split_by_index` """ db_ = self._get_db(inplace) - db_._data = split_by_index(db_._data, index, **kwargs)[0] + db_._data, _, selected_idx, _ = split_by_index(db_._data, index, **kwargs) if do_mask is True: - 
_prev_index = db_._data.attrs["_prev_index"] - db_._mask = split_by_index(db_._mask, _prev_index, **kwargs)[0] + db_._mask, _, _, _ = split_by_index(db_._mask, selected_idx, **kwargs) return self._return_db(db_, inplace) def split_by_boolean_true( @@ -427,14 +429,12 @@ def split_by_boolean_true( db1_ = self.copy() db2_ = self.copy() _mask = _copy(db1_._mask) - db1_._data, db2_._data = split_by_boolean_true( + db1_._data, db2_._data, selected_idx, _ = split_by_boolean_true( db1_._data, _mask, return_rejected=True, **kwargs ) if do_mask is True: - _prev_index = db1_._data.attrs["_prev_index"] - - db1_._mask, db2_._mask = split_by_index( - db1_._mask, _prev_index, return_rejected=True, **kwargs + db1_._mask, db2_._mask, _, _ = split_by_index( + db1_._mask, selected_idx, return_rejected=True, **kwargs ) return db1_, db2_ @@ -473,13 +473,12 @@ def split_by_boolean_false( db1_ = self.copy() db2_ = self.copy() _mask = _copy(db1_._mask) - db1_._data, db2_._data = split_by_boolean_false( + db1_._data, db2_._data, selected_idx, _ = split_by_boolean_false( db1_._data, _mask, return_rejected=True, **kwargs ) if do_mask is True: - _prev_index = db1_._data.attrs["_prev_index"] - db1_._mask, db2_._mask = split_by_index( - db1_._mask, _prev_index, return_rejected=True, **kwargs + db1_._mask, db2_._mask, _, _ = split_by_index( + db1_._mask, selected_idx, return_rejected=True, **kwargs ) return db1_, db2_ @@ -522,13 +521,12 @@ def split_by_column_entries( """ db1_ = self.copy() db2_ = self.copy() - db1_._data, db2_._data = split_by_column_entries( + db1_._data, db2_._data, selected_idx, _ = split_by_column_entries( db1_._data, selection, return_rejected=True, **kwargs ) if do_mask is True: - _prev_index = db1_._data.attrs["_prev_index"] - db1_._mask, db2_._mask = split_by_index( - db1_._mask, _prev_index, return_rejected=True, **kwargs + db1_._mask, db2_._mask, _, _ = split_by_index( + db1_._mask, selected_idx, return_rejected=True, **kwargs ) return db1_, db2_ @@ -570,11 +568,11 @@ def split_by_index( """ db1_ = self.copy() db2_ = self.copy() - db1_._data, db2_._data = split_by_index( + db1_._data, db2_._data, _, _ = split_by_index( db1_._data, index, return_rejected=True, **kwargs ) if do_mask is True: - db1_._mask, db2_._mask = split_by_index( + db1_._mask, db2_._mask, _, _ = split_by_index( db1_._mask, index, return_rejected=True, **kwargs ) return db1_, db2_ diff --git a/cdm_reader_mapper/core/writer.py b/cdm_reader_mapper/core/writer.py index ae3899ae..e9ae0eeb 100755 --- a/cdm_reader_mapper/core/writer.py +++ b/cdm_reader_mapper/core/writer.py @@ -2,10 +2,9 @@ from __future__ import annotations -from typing import get_args +from typing import Iterable, get_args import pandas as pd -from pandas.io.parsers import TextFileReader from cdm_reader_mapper.cdm_mapper.writer import write_tables from cdm_reader_mapper.mdf_reader.writer import write_data @@ -21,7 +20,7 @@ def write( - data: pd.DataFrame | TextFileReader, + data: pd.DataFrame | Iterable[pd.DataFrame], mode: SupportedWriteModes = "data", **kwargs, ) -> None: @@ -29,7 +28,7 @@ def write( Parameters ---------- - data: pandas.DataFrame or TextFileReader + data: pandas.DataFrame or Iterable[pd.DataFrame] Data to export. 
mode: str, {data, tables} Write data mode: diff --git a/cdm_reader_mapper/mdf_reader/reader.py b/cdm_reader_mapper/mdf_reader/reader.py index ced76b50..62fcf90a 100755 --- a/cdm_reader_mapper/mdf_reader/reader.py +++ b/cdm_reader_mapper/mdf_reader/reader.py @@ -337,7 +337,6 @@ def read_data( data_kwargs=data_kwargs, mask_kwargs=mask_kwargs, ) - return DataBundle( data=data, columns=info["columns"], diff --git a/cdm_reader_mapper/mdf_reader/utils/filereader.py b/cdm_reader_mapper/mdf_reader/utils/filereader.py index 9b556cf6..b484f0d0 100755 --- a/cdm_reader_mapper/mdf_reader/utils/filereader.py +++ b/cdm_reader_mapper/mdf_reader/utils/filereader.py @@ -4,19 +4,15 @@ import logging -from typing import Callable, Any, Sequence, Mapping +from typing import Any, Mapping, Sequence, Iterable import pandas as pd import xarray as xr from dataclasses import replace -from pandas.io.parsers import TextFileReader from .. import properties -from .utilities import ( - process_textfilereader, - remove_boolean_values, -) +from .utilities import remove_boolean_values from .convert_and_decode import convert_and_decode from .validators import validate @@ -30,27 +26,7 @@ ) from cdm_reader_mapper.core.databundle import DataBundle - - -def _apply_or_chunk( - data: pd.DataFrame | TextFileReader, - func: Callable[..., Any], - func_args: Sequence[Any] | None = None, - func_kwargs: Mapping[str, Any] | None = None, - **kwargs: Mapping[str, Any], -): - """Apply a function directly or chunk-wise depending on input type.""" - func_args = func_args or [] - func_kwargs = func_kwargs or {} - if not isinstance(data, TextFileReader): - return func(data, *func_args, **func_kwargs) - return process_textfilereader( - data, - func, - func_args, - func_kwargs, - **kwargs, - ) +from cdm_reader_mapper.common.iterators import ProcessFunction, process_function def _merge_kwargs(*dicts: Mapping[str, Any]) -> dict[str, Any]: @@ -133,7 +109,7 @@ def __init__( def _process_data( self, - data: pd.DataFrame | TextFileReader, + data: pd.DataFrame | Iterable[pd.DataFrame], convert_flag: bool = False, decode_flag: bool = False, converter_dict: dict | None = None, @@ -153,7 +129,7 @@ def _process_data( Parameters ---------- - data : pandas.DataFrame or TextFileReader + data : pandas.DataFrame or Iterable[pd.DataFrame] Input data. convert_flag : bool Whether to apply converters. @@ -234,6 +210,7 @@ def _process_data( return data, mask, config + @process_function() def open_data( self, source: str, @@ -246,7 +223,7 @@ def open_data( select_kwargs: dict | None = None, ) -> ( tuple[pd.DataFrame, pd.DataFrame, ParserConfig] - | tuple[TextFileReader, TextFileReader, ParserConfig] + | tuple[Iterable[pd.DataFrame], Iterable[pd.DataFrame], ParserConfig] ): """ Open and parse source data according to parser configuration. @@ -273,8 +250,18 @@ def open_data( Returns ------- tuple - (data, mask, config) or chunked equivalents if using TextFileReader. + (data, mask, config) or chunked equivalents if using Iterable[pd.DataFrame]. 
""" + + @process_function() + def _open_data(): + return ProcessFunction( + data=to_parse, + func=self._process_data, + func_kwargs=func_kwargs, + makecopy=False, + ) + pd_kwargs = dict(pd_kwargs or {}) xr_kwargs = dict(xr_kwargs or {}) convert_kwargs = convert_kwargs or {} @@ -293,7 +280,6 @@ def open_data( if open_with == "netcdf": to_parse = xr.open_mfdataset(source, **xr_kwargs).squeeze() config = update_xr_config(to_parse, self.config) - write_kwargs, read_kwargs = {}, {} elif open_with == "pandas": config = update_pd_config(pd_kwargs, self.config) pd_kwargs["encoding"] = config.encoding @@ -303,28 +289,14 @@ def open_data( pd_kwargs.setdefault("escapechar", "\0") pd_kwargs.setdefault("dtype", object) pd_kwargs.setdefault("skip_blank_lines", False) - - write_kwargs = {"encoding": pd_kwargs["encoding"]} - chunksize = pd_kwargs.get("chunksize") - read_kwargs = ( - {"chunksize": chunksize, "dtype": config.dtypes}, - {"chunksize": chunksize, "dtype": "boolean"}, - ) - to_parse = pd.read_fwf(source, **pd_kwargs) else: raise ValueError("open_with must be 'pandas' or 'netcdf'") func_kwargs["config"] = config - return _apply_or_chunk( - to_parse, - self._process_data, - func_kwargs=func_kwargs, - makecopy=False, - write_kwargs=write_kwargs, - read_kwargs=read_kwargs, - ) + result = _open_data() + return tuple(result) def read( self, diff --git a/cdm_reader_mapper/mdf_reader/utils/parser.py b/cdm_reader_mapper/mdf_reader/utils/parser.py index 3ba2e9ae..3ff633e6 100755 --- a/cdm_reader_mapper/mdf_reader/utils/parser.py +++ b/cdm_reader_mapper/mdf_reader/utils/parser.py @@ -286,7 +286,7 @@ def parse_pandas( records = df[col].map( lambda line: _parse_line(line, order_specs, sections, excludes) ) - return pd.DataFrame.from_records(records.to_list()) + return pd.DataFrame.from_records(records.to_list(), index=records.keys()) def parse_netcdf( diff --git a/cdm_reader_mapper/mdf_reader/utils/utilities.py b/cdm_reader_mapper/mdf_reader/utils/utilities.py index 51cd5105..0041cbff 100755 --- a/cdm_reader_mapper/mdf_reader/utils/utilities.py +++ b/cdm_reader_mapper/mdf_reader/utils/utilities.py @@ -3,18 +3,13 @@ from __future__ import annotations import ast -import csv -import os - -from io import StringIO -from pathlib import Path -from typing import Any, Iterable, Callable +import os import pandas as pd +from pathlib import Path +from typing import Any, Callable, Iterable -from .. 
import properties - -from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy +from cdm_reader_mapper.common.iterators import ProcessFunction, process_function def as_list(x: str | Iterable[Any] | None) -> list[Any] | None: @@ -210,13 +205,13 @@ def update_and_select( return df, {"columns": df.columns, "dtypes": df.dtypes} +@process_function() def _read_data_from_file( filepath: Path, reader: Callable[..., Any], col_subset: str | list | None = None, column_names: pd.Index | pd.MultiIndex | None = None, reader_kwargs: dict | None = None, - iterator: bool = False, ) -> tuple[pd.DataFrame | Iterable[pd.DataFrame], dict[str, Any]]: """Helper file reader.""" if filepath is None or not Path(filepath).is_file(): @@ -226,27 +221,12 @@ def _read_data_from_file( data = reader(filepath, **reader_kwargs) - if isinstance(data, pd.DataFrame): - return update_and_select(data, subset=col_subset, column_names=column_names) - - if iterator is True: - writer_kwargs = {} - if "encoding" in reader_kwargs: - writer_kwargs["encoding"] = reader_kwargs["encoding"] - - return process_textfilereader( - data, - func=update_and_select, - func_kwargs={ - "subset": col_subset, - "column_names": column_names, - }, - read_kwargs=reader_kwargs, - write_kwargs=writer_kwargs, - makecopy=False, - ) - - raise ValueError(f"Unsupported reader return type: {type(data)}") + return ProcessFunction( + data=data, + func=update_and_select, + func_kwargs={"subset": col_subset, "column_names": column_names}, + makecopy=False, + ) def read_csv( @@ -276,14 +256,14 @@ def read_csv( - The CSV as a DataFrame. Empty if file does not exist. - dictionary containing data column labels and data types """ - return _read_data_from_file( + result = _read_data_from_file( filepath, reader=pd.read_csv, col_subset=col_subset, column_names=column_names, reader_kwargs={"delimiter": delimiter, **kwargs}, - iterator=True, ) + return tuple(result) def read_parquet( @@ -309,16 +289,17 @@ def read_parquet( Returns ------- tuple[pd.DataFrame, dict] - - The CSV as a DataFrame. Empty if file does not exist. + - The PARQUET as a DataFrame. Empty if file does not exist. - dictionary containing data column labels and data types """ - return _read_data_from_file( + result = _read_data_from_file( filepath, - reader=pd.read_parquet, - col_subset=col_subset, - column_names=column_names, + pd.read_parquet, + col_subset, + column_names, reader_kwargs=kwargs, ) + return tuple(result) def read_feather( @@ -347,13 +328,14 @@ def read_feather( - The CSV as a DataFrame. Empty if file does not exist. - dictionary containing data column labels and data types """ - return _read_data_from_file( + result = _read_data_from_file( filepath, - reader=pd.read_feather, - col_subset=col_subset, - column_names=column_names, + pd.read_feather, + col_subset, + column_names, reader_kwargs=kwargs, ) + return tuple(result) def convert_dtypes(dtypes) -> tuple[str]: @@ -465,106 +447,3 @@ def remove_boolean_values(data, dtypes) -> pd.DataFrame: data = data.map(_remove_boolean_values) dtype = _adjust_dtype(dtypes, data) return data.astype(dtype) - - -def process_textfilereader( - reader: Iterable[pd.DataFrame], - func: Callable, - func_args: tuple = (), - func_kwargs: dict[str, Any] | None = None, - read_kwargs: dict[str, Any] | tuple[dict[str, Any], ...] | None = None, - write_kwargs: dict[str, Any] | None = None, - makecopy: bool = True, -) -> tuple[Iterable[pd.DataFrame], ...]: - """ - Process a stream of DataFrames using a function and return processed results. 
- - Each DataFrame from `reader` is passed to `func`, which can return one or more - DataFrames or other outputs. DataFrame outputs are concatenated in memory and - returned as a tuple along with any additional non-DataFrame outputs. - - Parameters - ---------- - reader : Iterable[pd.DataFrame] - An iterable of DataFrames (e.g., a CSV reader returning chunks). - func : Callable - Function to apply to each DataFrame. - func_args : tuple, optional - Positional arguments passed to `func`. - func_kwargs : dict, optional - Keyword arguments passed to `func`. - read_kwargs : dict or tuple of dict, optional - Arguments to pass to `pd.read_csv` when reconstructing output DataFrames. - write_kwargs : dict, optional - Arguments to pass to `DataFrame.to_csv` when buffering output. - makecopy : bool, default True - If True, makes a copy of each input DataFrame before processing. - - Returns - ------- - tuple - A tuple containing: - - One or more processed DataFrames (in the same order as returned by `func`) - - Any additional outputs from `func` that are not DataFrames - """ - func_kwargs = func_kwargs or {} - read_kwargs = read_kwargs or {} - write_kwargs = write_kwargs or {} - - buffers = [] - columns = [] - - if makecopy is True: - reader = make_copy(reader) - - output_add = [] - - for df in reader: - outputs = func(df, *func_args, **func_kwargs) - if not isinstance(outputs, tuple): - outputs = (outputs,) - - output_dfs = [] - first_chunk = not buffers - - for out in outputs: - if isinstance(out, pd.DataFrame): - output_dfs.append(out) - elif first_chunk: - output_add.append(out) - - if not buffers: - buffers = [StringIO() for _ in output_dfs] - columns = [out.columns for out in output_dfs] - - for buffer, out_df in zip(buffers, output_dfs): - out_df.to_csv( - buffer, - header=False, - mode="a", - index=False, - quoting=csv.QUOTE_NONE, - sep=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - **write_kwargs, - ) - - if isinstance(read_kwargs, dict): - read_kwargs = tuple(read_kwargs for _ in range(len(buffers))) - - result_dfs = [] - for buffer, cols, rk in zip(buffers, columns, read_kwargs): - buffer.seek(0) - rk = {k: v for k, v in rk.items() if k != "delimiter"} - result_dfs.append( - pd.read_csv( - buffer, - names=cols, - delimiter=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - **rk, - ) - ) - return tuple(result_dfs + output_add) diff --git a/cdm_reader_mapper/mdf_reader/writer.py b/cdm_reader_mapper/mdf_reader/writer.py index 6722ecae..c0e9046c 100755 --- a/cdm_reader_mapper/mdf_reader/writer.py +++ b/cdm_reader_mapper/mdf_reader/writer.py @@ -6,15 +6,18 @@ import logging from io import StringIO as StringIO from pathlib import Path -from typing import Any, get_args +from typing import Iterable, get_args import pandas as pd -from pandas.io.parsers import TextFileReader from .utils.utilities import join, update_column_names, update_dtypes from ..common import get_filename -from ..common.pandas_TextParser_hdlr import make_copy +from ..common.iterators import ( + ParquetStreamReader, + is_valid_iterator, + parquet_stream_from_iterable, +) from ..properties import SupportedFileTypes @@ -26,35 +29,25 @@ def _normalize_data_chunks( - data: pd.DataFrame | TextFileReader | None, -) -> list | TextFileReader: + data: pd.DataFrame | Iterable[pd.DataFrame] | None, +) -> list | ParquetStreamReader: """Helper function to normalize data chunks.""" if data is None: data = pd.DataFrame() if isinstance(data, pd.DataFrame): return [data] - if isinstance(data, 
TextFileReader): - return make_copy(data) + if is_valid_iterator(data): + if not isinstance(data, ParquetStreamReader): + data = parquet_stream_from_iterable(data) + return data.copy() + if isinstance(data, (list, tuple)): + return parquet_stream_from_iterable(data) raise TypeError(f"Unsupported data type found: {type(data)}.") -def _write_data( - data_df: pd.DataFrame, - mask_df: pd.DataFrame, - data_fn: str, - mask_fn: str, - writer: str, - write_kwargs: dict[str, Any], -) -> None: - """Helper function to write data on disk.""" - getattr(data_df, writer)(data_fn, **write_kwargs) - if not mask_df.empty: - getattr(mask_df, writer)(mask_fn, **write_kwargs) - - def write_data( - data: pd.DataFrame | TextFileReader, - mask: pd.DataFrame | TextFileReader | None = None, + data: pd.DataFrame | Iterable[pd.DataFrame], + mask: pd.DataFrame | Iterable[pd.DataFrame] | None = None, data_format: SupportedFileTypes = "csv", dtypes: pd.Series | dict | None = None, parse_dates: list | bool = False, @@ -64,7 +57,7 @@ def write_data( suffix: str | None = None, extension: str = None, filename: str | dict | None = None, - col_subset: str | list | tuple | None = None, + col_subset: str | list[str] | tuple[str] | None = None, delimiter: str = ",", **kwargs, ) -> None: @@ -72,10 +65,10 @@ def write_data( Parameters ---------- - data: pandas.DataFrame - pandas.DataFrame to export. - mask: pandas.DataFrame, optional - validation mask to export. + data: pandas.DataFrame or Iterable[pd.DataFrame] + Data to export. + mask: pandas.DataFrame or Iterable[pd.DataFrame], optional + Validation mask to export. data_format: {"csv", "parquet", "feather"}, default: "csv" Format of output data file(s). dtypes: dict, optional @@ -132,6 +125,9 @@ def write_data( f"data_format must be one of {supported_file_types}, not {data_format}." ) + if mask is not None and not isinstance(data, type(mask)): + raise ValueError("type of 'data' and type of 'mask' do not match.") + extension = extension or data_format if not isinstance(dtypes, (dict, pd.Series)): @@ -194,14 +190,10 @@ def write_data( **kwargs, ) - _write_data( - data_df=data_df, - mask_df=mask_df, - data_fn=filename_data, - mask_fn=filename_mask, - writer=WRITERS[data_format], - write_kwargs=write_kwargs, - ) + writer = WRITERS[data_format] + getattr(data_df, writer)(filename_data, **write_kwargs) + if not mask_df.empty: + getattr(mask_df, writer)(filename_mask, **write_kwargs) with open(filename_info, "w") as fileObj: json.dump(info, fileObj, indent=4) diff --git a/cdm_reader_mapper/metmetpy/correct.py b/cdm_reader_mapper/metmetpy/correct.py index 85f85603..ee504a3c 100755 --- a/cdm_reader_mapper/metmetpy/correct.py +++ b/cdm_reader_mapper/metmetpy/correct.py @@ -59,12 +59,12 @@ from __future__ import annotations -from io import StringIO -from typing import Any +from typing import Any, Iterable import pandas as pd -from ..common import logging_hdlr, pandas_TextParser_hdlr +from ..common import logging_hdlr +from ..common.iterators import ProcessFunction, process_function from ..common.json_dict import collect_json_files, combine_dicts from . import properties @@ -84,6 +84,9 @@ def _correct_dt( """Apply deck-specific datetime corrections to a dataset.""" logger = logging_hdlr.init_logger(__name__, level=log_level) + if isinstance(data, pd.Series): + raise TypeError("pd.Series is not supported now.") + # 1. 
Optional deck specific corrections
     datetime_correction = correction_method.get(dck, {}).get("function")
     if not datetime_correction:
@@ -96,15 +99,13 @@ def _correct_dt(
         logger.info(f'Applying "{datetime_correction}" datetime correction')
         try:
             trans = getattr(corr_f_dt, datetime_correction)
-        except AttributeError as e:
-            logger.error(f"Correction function '{datetime_correction}' not found.")
-            raise e
+        except AttributeError:
+            raise AttributeError(f"Correction function '{datetime_correction}' not found.")

     try:
         return trans(data)
     except Exception as e:
-        logger.error("Error applying datetime correction", exc_info=True)
-        raise e
+        raise RuntimeError(f"func '{trans.__name__}' could not be executed") from e


 def _correct_pt(
@@ -118,6 +119,9 @@ def _correct_pt(
     """Apply platform-type corrections for a given deck."""
     logger = logging_hdlr.init_logger(__name__, level=log_level)

+    if isinstance(data, pd.Series):
+        raise TypeError("pd.Series is not supported now.")
+
     deck_fix = fix_methods.get(dck)
     if not deck_fix:
         logger.info(
@@ -165,17 +169,18 @@ def _correct_pt(
     )


+@process_function(data_only=True)
 def correct_datetime(
-    data: pd.DataFrame | pd.io.parsers.TextFileReader,
+    data: pd.DataFrame | Iterable[pd.DataFrame],
     imodel: str,
     log_level: str = "INFO",
     _base=_base,
-) -> pd.DataFrame | pd.io.parsers.TextFileReader:
+) -> pd.DataFrame | Iterable[pd.DataFrame]:
     """Apply ICOADS deck specific datetime corrections.

     Parameters
     ----------
-    data: pandas.DataFrame or pandas.io.parsers.TextFileReader
+    data: pandas.DataFrame or Iterable[pd.DataFrame]
         Input dataset.
     imodel: str
         Name of internally available data model.
@@ -188,14 +193,16 @@ def correct_datetime(

     Returns
     -------
-    pandas.DataFrame or pandas.io.parsers.TextFileReader
-        a pandas.DataFrame or pandas.io.parsers.TextFileReader
-        with the adjusted data
+    pandas.DataFrame or Iterable[pd.DataFrame]
+        A pandas.DataFrame or Iterable[pd.DataFrame] with the adjusted data.

     Raises
     ------
     ValueError
         If `_correct_dt` raises an error during correction.
+    TypeError
+        If `data` is not a pd.DataFrame or an Iterable[pd.DataFrame].
+        If `data` is a pd.Series.
""" logger = logging_hdlr.init_logger(__name__, level=log_level) _base = f"{_base}.datetime" @@ -213,44 +220,36 @@ def correct_datetime( logger.warning("Module will proceed with no attempt to apply id replacements") return data - correction_method = combine_dicts(replacements_method_files, base=_base) - - if isinstance(data, pd.DataFrame): - return _correct_dt(data, imodel, dck, correction_method, log_level=log_level) - elif isinstance(data, pd.io.parsers.TextFileReader): - read_params = [ - "chunksize", - "names", - "dtype", - "parse_dates", - "date_parser", - "infer_datetime_format", - ] - read_dict = {x: data.orig_options.get(x) for x in read_params} - - buffer = StringIO() - data_ = pandas_TextParser_hdlr.make_copy(data) - for df in data_: - df = _correct_dt(df, imodel, dck, correction_method, log_level=log_level) - df.to_csv(buffer, header=False, index=False, mode="a") + if isinstance(data, pd.Series): + raise TypeError("pd.Series is not supported now.") - buffer.seek(0) - return pd.read_csv(buffer, **read_dict) + correction_method = combine_dicts(replacements_method_files, base=_base) - raise TypeError(f"Unsupported data type: {type(data)}") + return ProcessFunction( + data=data, + func=_correct_dt, + func_kwargs={ + "data_model": imodel, + "dck": dck, + "correction_method": correction_method, + "log_level": log_level, + }, + makecopy=False, + ) +@process_function(data_only=True) def correct_pt( - data: pd.DataFrame | pd.io.parsers.TextFileReader, + data: pd.DataFrame | Iterable[pd.DataFrame], imodel: str, log_level="INFO", _base=_base, -) -> pd.DataFrame | pd.io.parsers.TextFileReader: +) -> pd.DataFrame | Iterable[pd.DataFrame]: """Apply ICOADS deck specific platform ID corrections. Parameters ---------- - data: pandas.DataFrame or pandas.io.parsers.TextFileReader + data: pandas.DataFrame or Iterable[pd.DataFrame] Input dataset. imodel: str Name of internally available data model. @@ -261,21 +260,24 @@ def correct_pt( Returns ------- - pandas.DataFrame or pandas.io.parsers.TextFileReader - a pandas.DataFrame or pandas.io.parsers.TextFileReader - with the adjusted data + pandas.DataFrame or Iterable[pd.DataFrame] + A pandas.DataFrame or Iterable[pd.DataFrame] with the adjusted data. Raises ------ ValueError If `_correct_pt` raises an error during correction. + If platform column is not defined in properties file. + TypeError + If `data` is not a pd.DataFrame or an Iterable[pd.DataFrame]. + If `data` is a pd.Series. """ logger = logging_hdlr.init_logger(__name__, level=log_level) _base = f"{_base}.platform_type" mrd = imodel.split("_") if len(mrd) < 3: - logger.warning(f"Dataset {imodel} has to deck information.") + logger.warning(f"Dataset {imodel} has no deck information.") return data dck = mrd[2] @@ -286,33 +288,26 @@ def correct_pt( logger.warning(f"Dataset {imodel} not included in platform library") return data + if isinstance(data, pd.Series): + raise TypeError("pd.Series is not supported now.") + fix_methods = combine_dicts(fix_files, base=_base) pt_col = properties.metadata_datamodels["platform"].get(mrd[0]) if not pt_col: - logger.error( - f"Data model {imodel} platform column not defined in properties file" + raise ValueError( + f"Data model {imodel} platform column not defined in properties file." 
) - return data - if isinstance(data, pd.DataFrame): - return _correct_pt(data, imodel, dck, pt_col, fix_methods, log_level="INFO") - elif isinstance(data, pd.io.parsers.TextFileReader): - read_params = [ - "chunksize", - "names", - "dtype", - "parse_dates", - "date_parser", - "infer_datetime_format", - ] - read_dict = {x: data.orig_options.get(x) for x in read_params} - buffer = StringIO() - for df in data: - df = _correct_pt(df, imodel, dck, pt_col, fix_methods, log_level="INFO") - df.to_csv(buffer, header=False, index=False, mode="a") - - buffer.seek(0) - return pd.read_csv(buffer, **read_dict) - - raise TypeError(f"Unsupported data type: {type(data)}") + return ProcessFunction( + data=data, + func=_correct_pt, + func_kwargs={ + "imodel": imodel, + "dck": dck, + "pt_col": pt_col, + "fix_methods": fix_methods, + "log_level": log_level, + }, + makecopy=False, + ) diff --git a/cdm_reader_mapper/metmetpy/datetime/correction_functions.py b/cdm_reader_mapper/metmetpy/datetime/correction_functions.py index 9c662073..d00626be 100755 --- a/cdm_reader_mapper/metmetpy/datetime/correction_functions.py +++ b/cdm_reader_mapper/metmetpy/datetime/correction_functions.py @@ -44,4 +44,6 @@ def dck_201_icoads(data: pd.DataFrame) -> pd.DataFrame: datetime_.loc[loc] = datetime_.loc[loc] - pd.Timedelta(days=1) data[datetime_cols] = model_datetimes.from_datetime(datetime_, "icoads") + data[datetime_cols] = data[datetime_cols].astype("int") + return data diff --git a/cdm_reader_mapper/metmetpy/validate.py b/cdm_reader_mapper/metmetpy/validate.py index f8180a02..68892e56 100755 --- a/cdm_reader_mapper/metmetpy/validate.py +++ b/cdm_reader_mapper/metmetpy/validate.py @@ -59,9 +59,12 @@ import logging import re +from typing import Iterable + import pandas as pd -from ..common import logging_hdlr, pandas_TextParser_hdlr +from ..common import logging_hdlr +from ..common.iterators import ProcessFunction, process_function from ..common.json_dict import collect_json_files, combine_dicts from . import properties @@ -71,21 +74,24 @@ def _get_id_col( - data: pd.DataFrame, imodel: str, logger: logging.logger + data: pd.DataFrame, + imodel: str, ) -> int | list[int] | None: """Retrieve the ID column(s) for a given data model from the metadata.""" id_col = properties.metadata_datamodels["id"].get(imodel) if not id_col: - logger.error(f"Data model {imodel} ID column not defined in properties file.") - return + raise ValueError( + f"Data model {imodel} ID column not defined in properties file." + ) if not isinstance(id_col, list): id_col = [id_col] id_col = [col for col in id_col if col in data.columns] if not id_col: - logger.error(f"No ID columns found. Selected columns are {list(data.columns)}") - return + raise ValueError( + f"No ID columns found. Selected columns are {list(data.columns)}" + ) if len(id_col) == 1: id_col = id_col[0] @@ -120,8 +126,31 @@ def _get_patterns( return patterns +def _validate_id(data, mrd, combined_compiled, na_values): + """Helper function to validate ID.""" + id_col = _get_id_col(data, mrd[0]) + if id_col is None: + raise ValueError("No ID conversion columns found.") + + id_series = data[id_col] + + return id_series.str.match(combined_compiled, na=na_values) + + +def _validate_datetime(data: pd.DataFrame | pd.Series, model: str): + """Helper function to validate datetime.""" + data_model_datetime = model_datetimes.to_datetime(data, model) + + if len(data_model_datetime) == 0: + raise ValueError( + f"No columns found for datetime conversion. Selected columns are {list(data.columns)}." 
+ ) + return data_model_datetime.notna() + + +@process_function(data_only=True) def validate_id( - data: pd.DataFrame | pd.Series | pd.io.parsers.TextFileReader, + data: pd.DataFrame | pd.Series | Iterable[pd.DataFrame, pd.Series], imodel: str, blank: bool = False, log_level: str = "INFO", @@ -131,7 +160,7 @@ def validate_id( Parameters ---------- - data : pd.DataFrame, pd.Series, or pd.io.parsers.TextFileReader + data : pd.DataFrame, pd.Series, or Iterable[pd.DataFrame, pd.Series] Input dataset or series containing ID values. imodel : str Name of internally available data model, e.g., "icoads_r300_d201". @@ -149,71 +178,69 @@ def validate_id( Raises ------ - None explicitly; errors are logged and function returns None on failure. + TypeError + If `data` is not a pd.DataFrame or a pd.Series or an Iterable[pd.DataFrame | pd.Series]. + Value Error + If dataset `imodel` has no deck information. + If no ID conversion columns found. + If input deck is not defined in ID library files. + FilenotFounderror + If dataset `imodel` has no ID deck library. Notes ----- - - If `data` is a TextFileReader, it is fully read into a DataFrame. - Uses `_get_id_col` to determine which column(s) contain IDs. - Uses `_get_patterns` to get regex patterns for the deck. - Empty values match "^$" pattern if `blank=True`. """ logger = logging_hdlr.init_logger(__name__, level=log_level) - if isinstance(data, pd.io.parsers.TextFileReader): - data = pandas_TextParser_hdlr.make_copy(data).read() - elif not isinstance(data, (pd.DataFrame, pd.Series)): - logger.error( - f"Input data must be a pd.DataFrame or pd.Series.\ - Input data type is {type(data)}" - ) - return - mrd = imodel.split("_") if len(mrd) < 3: - logger.error(f"Dataset {imodel} has no deck information.") - return + raise ValueError(f"Dataset {imodel} has no deck information.") dck = mrd[2] - id_col = _get_id_col(data, mrd[0], logger) - if id_col is None: - return - - id_series = data[id_col] - data_model_files = collect_json_files(*mrd, base=_base) if len(data_model_files) == 0: - logger.error(f'Input dataset "{imodel}" has no ID deck library') - return + raise FileNotFoundError(f'Input dataset "{imodel}" has no ID deck library') id_models = combine_dicts(data_model_files, base=_base) dck_id_model = id_models.get(dck) if not dck_id_model: - logger.error(f'Input dck "{dck}" not defined in file {data_model_files}') - return + raise ValueError(f'Input dck "{dck}" not defined in file {data_model_files}') patterns = _get_patterns(dck_id_model, blank, dck, data_model_files, logger) na_values = True if "^$" in patterns else False combined_compiled = re.compile("|".join(patterns)) - return id_series.str.match(combined_compiled, na=na_values) + return ProcessFunction( + data=data, + func=_validate_id, + func_kwargs={ + "mrd": mrd, + "combined_compiled": combined_compiled, + "na_values": na_values, + }, + makecopy=False, + ) +@process_function(data_only=True) def validate_datetime( - data: pd.DataFrame | pd.Series | pd.io.parsers.TextFileReader, + data: pd.DataFrame | pd.Series | Iterable[pd.DataFrame, pd.Series], imodel: str, blank: bool = False, log_level: str = "INFO", -) -> pd.Series | None: +) -> pd.Series: """Validate datetime columns in a dataset according to the specified model. Parameters ---------- - data : pd.DataFrame, pd.Series, or pd.io.parsers.TextFileReader + data : pd.DataFrame, pd.Series, or Iterable[pd.DataFrame, pd.Series] Input dataset or series containing ID values. imodel : str Name of internally available data model, e.g., "icoads_r300_d201". 
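# ---------------------------------------------------------------------------
# Editor's note -- illustrative sketch, not part of the patch above. The
# pattern in this file is that the public function only assembles a
# ProcessFunction (data + per-chunk helper + keyword arguments), while the
# @process_function(data_only=True) decorator is assumed to execute it either
# directly on a pd.DataFrame or chunk-wise via the disk-backed Parquet engine
# when `data` is an iterable of DataFrames. A minimal stand-alone analogue,
# using the hypothetical names `_is_positive`, `demo_validate` and "obs.csv":

import pandas as pd

from cdm_reader_mapper.common.iterators import ProcessFunction, process_function


def _is_positive(df: pd.DataFrame, column: str) -> pd.Series:
    # Per-chunk helper returning a boolean Series, mirroring _validate_id above.
    return df[column] > 0


@process_function(data_only=True)
def demo_validate(data, column: str = "A"):
    # The body never touches individual chunks; it only defers the call.
    return ProcessFunction(
        data=data,
        func=_is_positive,
        func_kwargs={"column": column},
        makecopy=False,
    )


# demo_validate(pd.DataFrame({"A": [1, -2, 3]}))      # expected: boolean pd.Series
# demo_validate(pd.read_csv("obs.csv", chunksize=2))  # expected: chunked equivalent
# ---------------------------------------------------------------------------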
@@ -231,34 +258,16 @@ def validate_datetime( Raises ------ - None explicitly; errors are logged and function returns None on failure. - - Notes - ----- - - If `data` is a TextFileReader, it is fully read into a DataFrame. + TypeError + If `data` is not a pd.DataFrame or a pd.Series or an Iterable[pd.DataFrame | pd.Series]. + ValueError + If no columns found for datetime conversion. """ - logger = logging_hdlr.init_logger(__name__, level=log_level) model = imodel.split("_")[0] - if isinstance(data, pd.io.parsers.TextFileReader): - data = pandas_TextParser_hdlr.make_copy(data).read() - elif not isinstance(data, (pd.DataFrame, pd.Series)): - logger.error( - f"Input data must be a pd.DataFrame or pd.Series.Input data type is {type(data)}." - ) - return - - data_model_datetime = model_datetimes.to_datetime(data, model) - - if not isinstance(data_model_datetime, pd.Series): - logger.error( - f'Data model "{model}" datetime conversor not defined in model_datetimes module"' - ) - return - elif len(data_model_datetime) == 0: - data_columns = list(data.columns) - logger.info( - f"No columns found for datetime conversion. Selected columns are {data_columns}" - ) - return - return data_model_datetime.notna() + return ProcessFunction( + data=data, + func=_validate_datetime, + func_kwargs={"model": model}, + makecopy=False, + ) diff --git a/tests/test_cdm_mapper.py b/tests/test_cdm_mapper.py index e831ed7d..a2af09f0 100755 --- a/tests/test_cdm_mapper.py +++ b/tests/test_cdm_mapper.py @@ -6,7 +6,6 @@ from io import StringIO from cdm_reader_mapper.cdm_mapper.mapper import ( - _check_input_data_type, _is_empty, _drop_duplicated_rows, _get_nested_value, @@ -19,7 +18,6 @@ _column_mapping, _convert_dtype, _table_mapping, - _map_and_convert, _prepare_cdm_tables, map_model, ) @@ -100,15 +98,20 @@ def data_header_expected(): ) -def _map_model_test_data(data_model, encoding="utf-8", select=None, **kwargs): +def _map_model_test_data( + data_model, encoding="utf-8", select=None, chunksize=None, **kwargs +): source = test_data[f"test_{data_model}"]["mdf_data"] info = open_json_file(test_data[f"test_{data_model}"]["mdf_info"]) - df = pd.read_csv(source, dtype=info["dtypes"], encoding=encoding) - if ":" in df.columns[0]: - df.columns = pd.MultiIndex.from_tuples(col.split(":") for col in df.columns) + df = pd.read_csv( + source, dtype=info["dtypes"], encoding=encoding, chunksize=chunksize + ) result = map_model(df, data_model, **kwargs) if not select: select = cdm_tables + if chunksize: + result = result.read() + for cdm_table in select: expected = pd.read_csv( test_data[f"test_{data_model}"][f"cdm_{cdm_table}"], @@ -119,6 +122,7 @@ def _map_model_test_data(data_model, encoding="utf-8", select=None, **kwargs): ) result_table = result[cdm_table].copy() result_table = result_table.dropna() + result_table = result_table.reset_index(drop=True) if "record_timestamp" in expected.columns: expected = expected.drop("record_timestamp", axis=1) @@ -130,41 +134,6 @@ def _map_model_test_data(data_model, encoding="utf-8", select=None, **kwargs): pd.testing.assert_frame_equal(result_table, expected) -def test_check_input_data_type_df_non_empty(sample_df): - logger = logging_hdlr.init_logger(__name__, level="INFO") - result = _check_input_data_type(sample_df, logger) - - assert result == [sample_df] - - -def test_check_input_data_type_df_empty(sample_df_empty): - logger = logging_hdlr.init_logger(__name__, level="INFO") - result = _check_input_data_type(sample_df_empty, logger) - - assert result is None - - -def 
test_check_input_data_type_textfilereader_non_empty(sample_tfr): - logger = logging_hdlr.init_logger(__name__, level="INFO") - result = _check_input_data_type(sample_tfr, logger) - - assert result is sample_tfr - - -def test_check_input_data_type_textfilereader_empty(sample_tfr_empty): - logger = logging_hdlr.init_logger(__name__, level="INFO") - result = _check_input_data_type(sample_tfr_empty, logger) - - assert result is None - - -def test_check_input_data_type_invalid_type(sample_string): - logger = logging_hdlr.init_logger(__name__, level="INFO") - result = _check_input_data_type(sample_string, logger) - - assert result is None - - @pytest.mark.parametrize( "value, expected", [ @@ -435,21 +404,6 @@ def test_table_mapping( pd.testing.assert_frame_equal(result[expected.columns], expected) -def test_map_and_convert(data_header, data_header_expected): - logger = logging_hdlr.init_logger(__name__, level="INFO") - result = _map_and_convert( - "icoads", - "r300", - "d720", - data=data_header, - cdm_subset=["header"], - logger=logger, - ) - pd.testing.assert_frame_equal( - result[data_header_expected.columns], data_header_expected - ) - - def test_map_model_icoads(data_header, data_header_expected): result = map_model( data_header, @@ -513,6 +467,9 @@ def test_map_model_pub47(): ("observations-ws", "observation_height_above_station_surface"), ("observations-ws", "sensor_id"), ] + # print(result) + # print(type(result)) + # exit() result = result[columns] exp = np.array( @@ -604,3 +561,10 @@ def test_map_model_test_data_select(): select=["header", "observations-sst"], cdm_subset=["header", "observations-sst"], ) + + +def test_map_model_test_data_chunksize(): + _map_model_test_data( + "icoads_r300_d714", + chunksize=2, + ) diff --git a/tests/test_common.py b/tests/test_common.py index dea0cec7..962fa0bc 100755 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -12,9 +12,11 @@ import numpy as np import pandas as pd +import xarray as xr from io import StringIO from pathlib import Path +import pyarrow.parquet as pq from urllib.parse import urlparse @@ -26,7 +28,6 @@ _split_by_index_df, _split_by_boolean_df, _split_by_column_df, - _split_dispatch, split_by_boolean, split_by_boolean_true, split_by_boolean_false, @@ -34,14 +35,7 @@ split_by_index, ) from cdm_reader_mapper.common.replace import replace_columns -from cdm_reader_mapper.common.pandas_TextParser_hdlr import ( - make_copy, - restore, - is_not_empty, -) -from cdm_reader_mapper.common.pandas_TextParser_hdlr import ( - get_length as get_length_hdlr, -) + from cdm_reader_mapper.common.logging_hdlr import init_logger from cdm_reader_mapper.common.json_dict import ( open_json_file, @@ -60,6 +54,22 @@ load_file, get_path, ) +from cdm_reader_mapper.common.iterators import ( + ProcessFunction, + ParquetStreamReader, + _sort_chunk_outputs, + _initialize_storage, + _write_chunks_to_disk, + _parquet_generator, + _process_chunks, + _prepare_readers, + parquet_stream_from_iterable, + is_valid_iterator, + ensure_parquet_reader, + process_disk_backed, + _process_function, + process_function, +) def make_parser(text, **kwargs): @@ -68,13 +78,6 @@ def make_parser(text, **kwargs): return pd.read_csv(buffer, chunksize=2, **kwargs) -def make_broken_parser(text: str): - """Return a pandas TextFileReader that will fail in make_copy.""" - parser = pd.read_csv(StringIO(text), chunksize=2) - parser.handles.handle = None - return parser - - def compute_md5(content: bytes) -> str: """Helper to get MD5 of bytes.""" return hashlib.md5(content, 
usedforsecurity=False).hexdigest() # noqa: S324 @@ -118,6 +121,10 @@ def create_temp_file(suffix: str) -> tuple[Path, str, Path]: return tmp_path, suffix, md5_path +def dummy_func(x): + return 2 * x + + @pytest.fixture def sample_df(): return pd.DataFrame( @@ -198,14 +205,14 @@ def tmp_json_file(tmp_path): def test_split_df(sample_df): mask = pd.Series([True, False, False, True, False], index=sample_df.index) - selected, rejected = _split_df(sample_df, mask, return_rejected=True) + selected, rejected, _, _ = _split_df(sample_df, mask, return_rejected=True) assert list(selected.index) == [10, 13] assert list(rejected.index) == [11, 12, 14] def _test_split_df_false_mask(sample_df): mask = pd.Series([False, False, False, False, False], index=sample_df.index) - selected, rejected = _split_df(sample_df, mask, return_rejected=True) + selected, rejected, _, _ = _split_df(sample_df, mask, return_rejected=True) assert list(selected.index) == [10, 13] assert list(rejected.index) == [11, 12, 14] @@ -222,7 +229,7 @@ def test_split_df_multiindex(sample_df): ("C", "c"), ] ) - selected, rejected = _split_df(sample_df, mask, return_rejected=True) + selected, rejected, _, _ = _split_df(sample_df, mask, return_rejected=True) assert list(selected.index) == [10, 13] assert list(rejected.index) == [11, 12, 14] @@ -238,7 +245,7 @@ def test_split_by_boolean_df( sample_df, column, boolean, expected_selected, expected_rejected ): mask = sample_df[[column]] - selected, rejected = _split_by_boolean_df( + selected, rejected, _, _ = _split_by_boolean_df( sample_df, mask, boolean=boolean, return_rejected=True ) assert list(selected.index) == expected_selected @@ -247,7 +254,7 @@ def test_split_by_boolean_df( def test_split_by_boolean_df_empty_mask(sample_df): mask = pd.DataFrame(columns=sample_df.columns) - selected, rejected = _split_by_boolean_df( + selected, rejected, _, _ = _split_by_boolean_df( sample_df, mask, boolean=True, return_rejected=True ) assert list(selected.index) == list(sample_df.index) @@ -265,7 +272,7 @@ def test_split_by_boolean_df_empty_mask(sample_df): def test_split_by_column_df( sample_df, col, values, return_rejected, expected_selected, expected_rejected ): - selected, rejected = _split_by_column_df( + selected, rejected, _, _ = _split_by_column_df( sample_df, col, values, return_rejected=return_rejected ) assert list(selected.index) == expected_selected @@ -288,81 +295,20 @@ def test_split_by_index_df( expected_selected, expected_rejected, ): - selected, rejected = _split_by_index_df( + selected, rejected, _, _ = _split_by_index_df( sample_df, index_list, inverse=inverse, return_rejected=return_rejected ) assert list(selected.index) == expected_selected assert list(rejected.index) == expected_rejected -@pytest.mark.parametrize("TextFileReader", [False, True]) -def test_split_wrapper_index(sample_df, sample_reader, TextFileReader): - if TextFileReader: - data = sample_reader - else: - data = sample_df - - selected, rejected = _split_dispatch( - data, _split_by_index_df, [11, 13], return_rejected=True - ) - - if TextFileReader: - selected = selected.read() - rejected = rejected.read() - - assert list(selected.index) == [11, 13] - assert list(rejected.index) == [10, 12, 14] - - -@pytest.mark.parametrize("TextFileReader", [False, True]) -def test_split_wrapper_column(sample_df, sample_reader, TextFileReader): - if TextFileReader: - data = sample_reader - else: - data = sample_df - - selected, rejected = _split_dispatch( - data, _split_by_column_df, "B", ["y"], return_rejected=True - ) - - if 
TextFileReader: - selected = selected.read() - rejected = rejected.read() - - assert list(selected.index) == [11, 14] - assert list(rejected.index) == [10, 12, 13] - - -@pytest.mark.parametrize("TextFileReader", [False, True]) -def test_split_wrapper_boolean(sample_df, sample_reader, boolean_mask, TextFileReader): - if TextFileReader: - data = sample_reader - else: - data = sample_df - - selected, rejected = _split_dispatch( - data, - _split_by_boolean_df, - boolean_mask[["mask1"]], - True, - return_rejected=True, - ) - - if TextFileReader: - selected = selected.read() - rejected = rejected.read() - - assert list(selected.index) == [11, 13] - assert list(rejected.index) == [10, 12, 14] - - @pytest.mark.parametrize("TextFileReader", [False, True]) def test_split_by_index_basic(sample_df, sample_reader, TextFileReader): if TextFileReader: data = sample_reader else: data = sample_df - selected, rejected = split_by_index(data, [11, 13], return_rejected=True) + selected, rejected, _, _ = split_by_index(data, [11, 13], return_rejected=True) if TextFileReader: selected = selected.read() @@ -373,7 +319,7 @@ def test_split_by_index_basic(sample_df, sample_reader, TextFileReader): def test_split_by_index_multiindex(sample_reader_multi): - selected, rejected = split_by_index( + selected, rejected, _, _ = split_by_index( sample_reader_multi, [11, 13], return_rejected=True ) @@ -391,7 +337,7 @@ def test_split_by_column_entries_basic(sample_df, sample_reader, TextFileReader) else: data = sample_df - selected, rejected = split_by_column_entries( + selected, rejected, _, _ = split_by_column_entries( data, {"B": ["y"]}, return_rejected=True ) @@ -403,46 +349,102 @@ def test_split_by_column_entries_basic(sample_df, sample_reader, TextFileReader) assert list(rejected.index) == [10, 12, 13] -@pytest.mark.parametrize("TextFileReader", [False, True]) -def test_split_by_boolean_basic_false( - sample_df, sample_reader, boolean_mask, TextFileReader +@pytest.mark.parametrize( + "inverse, reset_index, exp_selected_idx, exp_rejected_idx", + [ + (False, False, [], [10, 11, 12, 13, 14]), + (False, True, [], [10, 11, 12, 13, 14]), + (True, False, [10, 11, 12, 13, 14], []), + (True, True, [10, 11, 12, 13, 14], []), + ], +) +@pytest.mark.parametrize("chunked", [False, True]) +def test_split_by_boolean_basic_true( + sample_df, + sample_reader, + boolean_mask, + inverse, + reset_index, + exp_selected_idx, + exp_rejected_idx, + chunked, ): - if TextFileReader: + if chunked: data = sample_reader else: data = sample_df - selected, rejected = split_by_boolean( - data, boolean_mask, boolean=False, return_rejected=True + selected, rejected, _, _ = split_by_boolean( + data, + boolean_mask, + boolean=True, + inverse=inverse, + reset_index=reset_index, + return_rejected=True, ) - if TextFileReader: + exp_selected = sample_df.loc[exp_selected_idx] + exp_rejected = sample_df.loc[exp_rejected_idx] + + if reset_index is True: + exp_selected = exp_selected.reset_index(drop=True) + exp_rejected = exp_rejected.reset_index(drop=True) + + if chunked: selected = selected.read() rejected = rejected.read() - assert selected.empty - assert list(rejected.index) == [10, 11, 12, 13, 14] + pd.testing.assert_frame_equal(selected, exp_selected) + pd.testing.assert_frame_equal(rejected, exp_rejected) -@pytest.mark.parametrize("TextFileReader", [False, True]) -def test_split_by_boolean_basic_true( - sample_df, sample_reader, boolean_mask, TextFileReader +@pytest.mark.parametrize( + "inverse, reset_index, exp_selected_idx, exp_rejected_idx", + [ + 
(False, False, [], [10, 11, 12, 13, 14]), + (False, True, [], [10, 11, 12, 13, 14]), + (True, False, [10, 11, 12, 13, 14], []), + (True, True, [10, 11, 12, 13, 14], []), + ], +) +@pytest.mark.parametrize("chunked", [False, True]) +def test_split_by_boolean_basic_false( + sample_df, + sample_reader, + boolean_mask, + inverse, + reset_index, + exp_selected_idx, + exp_rejected_idx, + chunked, ): - if TextFileReader: + if chunked: data = sample_reader else: data = sample_df - selected, rejected = split_by_boolean( - data, boolean_mask, boolean=True, return_rejected=True + selected, rejected, _, _ = split_by_boolean( + data, + boolean_mask, + boolean=False, + inverse=inverse, + reset_index=reset_index, + return_rejected=True, ) - if TextFileReader: + exp_selected = sample_df.loc[exp_selected_idx] + exp_rejected = sample_df.loc[exp_rejected_idx] + + if reset_index is True: + exp_selected = exp_selected.reset_index(drop=True) + exp_rejected = exp_rejected.reset_index(drop=True) + + if chunked: selected = selected.read() rejected = rejected.read() - assert selected.empty - assert list(rejected.index) == [10, 11, 12, 13, 14] + pd.testing.assert_frame_equal(selected, exp_selected) + pd.testing.assert_frame_equal(rejected, exp_rejected) @pytest.mark.parametrize("TextFileReader", [False, True]) @@ -454,7 +456,7 @@ def test_split_by_boolean_true_basic( else: data = sample_df - selected, rejected = split_by_boolean_true( + selected, rejected, _, _ = split_by_boolean_true( data, boolean_mask_true, return_rejected=True ) @@ -475,7 +477,7 @@ def test_split_by_boolean_false_basic( else: data = sample_df - selected, rejected = split_by_boolean_false( + selected, rejected, _, _ = split_by_boolean_false( data, boolean_mask, return_rejected=True ) @@ -494,7 +496,7 @@ def test_split_by_index_empty(empty_df, empty_reader, TextFileReader): else: data = empty_df - selected, rejected = split_by_index(data, [0, 1], return_rejected=True) + selected, rejected, _, _ = split_by_index(data, [0, 1], return_rejected=True) if TextFileReader: selected = selected.read() @@ -511,7 +513,9 @@ def test_split_by_column_empty(empty_df, empty_reader, TextFileReader): else: data = empty_df - selected, rejected = split_by_column_entries(data, {"A": [1]}, return_rejected=True) + selected, rejected, _, _ = split_by_column_entries( + data, {"A": [1]}, return_rejected=True + ) if TextFileReader: selected = selected.read() @@ -529,7 +533,7 @@ def test_split_by_boolean_empty(empty_df, empty_reader, TextFileReader): data = empty_df mask = empty_df.astype(bool) - selected, rejected = split_by_boolean( + selected, rejected, _, _ = split_by_boolean( data, mask, boolean=True, return_rejected=True ) @@ -541,7 +545,7 @@ def test_split_by_boolean_empty(empty_df, empty_reader, TextFileReader): assert rejected.empty -def test_basic_replacement(): +def test_basic_replacement_df(): df_l = pd.DataFrame({"id": [1, 2], "x": [10, 20]}) df_r = pd.DataFrame({"id": [1, 2], "x": [100, 200]}) @@ -549,6 +553,15 @@ def test_basic_replacement(): assert out["x"].tolist() == [100, 200] +def test_basic_replacement_textfilereader(): + parser_l = make_parser("id,x\n1,10\n2,20") + parser_r = make_parser("id,x\n1,100\n2,200") + + out = replace_columns(parser_l, parser_r, pivot_c="id", rep_c="x") + out = out.read() + assert out["x"].tolist() == [100, 200] + + def test_rep_map_different_names(): df_l = pd.DataFrame({"id": [1, 2], "a": [1, 2]}) df_r = pd.DataFrame({"id": [1, 2], "b": [10, 20]}) @@ -557,25 +570,28 @@ def test_rep_map_different_names(): assert 
out["a"].tolist() == [10, 20] -def test_missing_pivot_returns_none(): +def test_missing_pivot_raises(): df_l = pd.DataFrame({"id": [1]}) df_r = pd.DataFrame({"id": [1]}) - assert replace_columns(df_l, df_r, rep_c="x") is None + with pytest.raises(ValueError): + replace_columns(df_l, df_r, rep_c="x") -def test_missing_replacement_returns_none(): +def test_missing_replacement_raises(): df_l = pd.DataFrame({"id": [1]}) df_r = pd.DataFrame({"id": [1]}) - assert replace_columns(df_l, df_r, pivot_c="id") is None + with pytest.raises(ValueError): + replace_columns(df_l, df_r, pivot_c="id") -def test_missing_source_col_returns_none(): +def test_missing_source_col_raises(): df_l = pd.DataFrame({"id": [1], "a": [10]}) df_r = pd.DataFrame({"id": [1]}) - assert replace_columns(df_l, df_r, pivot_c="id", rep_map={"a": "missing"}) is None + with pytest.raises(ValueError): + replace_columns(df_l, df_r, pivot_c="id", rep_map={"a": "missing"}) def test_index_reset(): @@ -586,79 +602,6 @@ def test_index_reset(): assert list(out.index) == [0, 1] -def test_make_copy_basic(): - parser = make_parser("a,b\n1,2\n3,4\n") - cp = make_copy(parser) - - assert cp is not None - - expected = pd.DataFrame({"a": [1, 3], "b": [2, 4]}) - - assert cp.get_chunk().equals(expected) - assert parser.get_chunk().equals(expected) - - -def test_make_copy_failure_memory(): - parser = make_broken_parser("a,b\n1,2\n") - with pytest.raises(RuntimeError): - make_copy(parser) - - -def test_restore_basic(): - parser = make_parser("a,b\n1,2\n3,4\n") - parser.get_chunk() - - restored = restore(parser) - assert restored is not None - - expected = pd.DataFrame({"a": [1, 3], "b": [2, 4]}) - assert restored.get_chunk().equals(expected) - - -def test_restore_failure_memory(): - parser = make_broken_parser("a,b\n1,2\n") - with pytest.raises(RuntimeError): - restore(parser) - - -def test_is_not_empty_true(): - parser = make_parser("a,b\n1,2\n") - assert is_not_empty(parser) is True - - -def test_is_not_empty_false(): - parser = make_parser("a,b\n") - assert is_not_empty(parser) is False - - -def test_is_not_empty_failure_make_copy_memory(): - parser = make_broken_parser("a,b\n1,2\n") - with pytest.raises(RuntimeError): - is_not_empty(parser) - - -def test_get_length_basic(): - parser = make_parser("a,b\n1,2\n3,4\n5,6\n") - assert get_length_hdlr(parser) == 3 - - -def test_get_length_empty(): - parser = make_parser("a,b\n") - assert get_length_hdlr(parser) == 0 - - -def test_get_length_failure_due_to_bad_line(): - parser = make_parser("a,b\n1,2\n1,2,3\n") - with pytest.raises(RuntimeError): - get_length_hdlr(parser) - - -def test_get_length_failure_make_copy_memory(): - parser = make_broken_parser("a,b\n1,2\n") - with pytest.raises(RuntimeError): - get_length_hdlr(parser) - - def test_init_logger_returns_logger(): logger = init_logger("test_module") assert isinstance(logger, logging.Logger) @@ -886,8 +829,8 @@ def test_get_filename_name_part(pattern, expected_name): ], ) def test_count_by_cat_i(data, expected): - series = pd.Series(data) - assert _count_by_cat(series) == expected + series = pd.DataFrame(data, columns=["test"]) + assert _count_by_cat(series, ["test"])["test"] == expected @pytest.mark.parametrize( @@ -919,12 +862,7 @@ def test_count_by_cat_single_column_string(): def test_count_by_cat_textfilereader(): - text = """A,B -1,x -2,y -2,x -nan,z -""" + text = "A,B\n1,x\n2,y\n2,x\nnan,z" parser = make_parser(text) result = count_by_cat(parser, ["A", "B"]) @@ -935,16 +873,6 @@ def test_count_by_cat_textfilereader(): assert result == expected 
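# ---------------------------------------------------------------------------
# Editor's note -- illustrative sketch, not part of the patch above. The
# removed pandas_TextParser_hdlr helpers (make_copy, restore, ...) are covered
# by ParquetStreamReader instead: any iterable of DataFrames can be wrapped,
# then copied, peeked at and re-read, as exercised by the tests in this file.
# The CSV content below is made up for illustration:

from io import StringIO

import pandas as pd

from cdm_reader_mapper.common.iterators import parquet_stream_from_iterable

chunks = pd.read_csv(StringIO("id,x\n1,10\n2,20\n3,30"), chunksize=2)
stream = parquet_stream_from_iterable(chunks)  # -> ParquetStreamReader
backup = stream.copy()                         # independent, re-readable stream
first = stream.get_chunk()                     # first chunk as a pd.DataFrame
full_df = backup.read()                        # all chunks concatenated
# ---------------------------------------------------------------------------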
-def test_count_by_cat_broken_parser(): - text = """A,B -1,x -2,y -""" - parser = make_broken_parser(text) - with pytest.raises(RuntimeError): - count_by_cat(parser, ["A", "B"]) - - @pytest.mark.parametrize( "data, expected_len", [ @@ -952,7 +880,7 @@ def test_count_by_cat_broken_parser(): (make_parser("A,B\n1,x\n2,y\n3,z"), 3), ], ) -def test_get_length(data, expected_len): +def test_get_length_inspect(data, expected_len): assert get_length(data) == expected_len @@ -1133,3 +1061,1241 @@ def test_get_path_missing_file(tmp_path, caplog): assert any( "No module named" in msg or "Cannot treat" in msg for msg in caplog.messages ) + + +def test_class_process_function_basic(): + df = pd.DataFrame({"a": [1, 2, 3]}) + + pf = ProcessFunction(data=df, func=dummy_func) + + assert isinstance(pf, ProcessFunction) + pd.testing.assert_frame_equal(pf.data, df) + assert pf.func is dummy_func + assert pf.func_args == () + assert pf.func_kwargs == {} + + +def test_class_process_function_raises(): + df = pd.DataFrame({"a": [1, 2, 3]}) + + with pytest.raises(ValueError, match="not callable"): + ProcessFunction(data=df, func="invalid_function") + + +def test_class_process_function_tuple(): + df = pd.DataFrame({"a": [1, 2, 3]}) + + pf = ProcessFunction(data=df, func=dummy_func, func_args=10) + + assert pf.func_args == (10,) + + +def test_class_process_function_extra(): + df = pd.DataFrame({"a": [1, 2, 3]}) + + pf = ProcessFunction(df, dummy_func, extra=123, flag=True) + + assert pf.kwargs == {"extra": 123, "flag": True} + + +def make_chunks(): + return [ + pd.DataFrame({"a": [1, 2]}), + pd.DataFrame({"a": [3, 4]}), + ] + + +def chunk_generator(): + yield from make_chunks() + + +def test_init_with_iterator(): + reader = ParquetStreamReader(iter(make_chunks())) + assert isinstance(reader, ParquetStreamReader) + + +def test_init_with_factory(): + reader = ParquetStreamReader(lambda: iter(make_chunks())) + assert isinstance(reader, ParquetStreamReader) + + +def test_init_invalid_source(): + with pytest.raises(TypeError): + ParquetStreamReader(source=123) + + +def test_iteration_over_chunks(): + reader = ParquetStreamReader(lambda: iter(make_chunks())) + + chunks = list(reader) + + assert len(chunks) == 2 + assert chunks[0]["a"].iloc[0] == 1 + assert chunks[1]["a"].iloc[-1] == 4 + + +def test_next_raises_stop_iteration(): + reader = ParquetStreamReader(lambda: iter([])) + + with pytest.raises(StopIteration): + next(reader) + + +def test_prepend_pushes_chunk_to_front(): + chunks = make_chunks() + reader = ParquetStreamReader(lambda: iter(chunks)) + + first = next(reader) + reader.prepend(first) + + again = next(reader) + + pd.testing.assert_frame_equal(first, again) + + +def test_get_chunk_returns_next_chunk(): + reader = ParquetStreamReader(lambda: iter(make_chunks())) + + chunk = reader.get_chunk() + + assert isinstance(chunk, pd.DataFrame) + assert len(chunk) == 2 + + +def test_read_concatenates_all_chunks(): + reader = ParquetStreamReader(lambda: iter(make_chunks())) + + df = reader.read() + + assert len(df) == 4 + assert df["a"].tolist() == [1, 2, 3, 4] + + +def test_read_empty_stream_returns_empty_dataframe(): + reader = ParquetStreamReader(lambda: iter([])) + + df = reader.read() + + assert isinstance(df, pd.DataFrame) + assert df.empty + + +def test_copy_creates_independent_stream(): + reader = ParquetStreamReader(lambda: iter(make_chunks())) + + reader_copy = reader.copy() + + original_first = next(reader) + copy_first = next(reader_copy) + + pd.testing.assert_frame_equal(original_first, copy_first) + + +def 
test_copy_closed_stream_raises(): + reader = ParquetStreamReader(lambda: iter(make_chunks())) + reader.close() + + with pytest.raises(ValueError): + reader.copy() + + +def test_empty_returns_true_if_empty(): + reader = ParquetStreamReader(lambda: iter([])) + assert reader.empty() is True + + +def test_empty_returns_false_if_not_empty(): + reader = ParquetStreamReader(lambda: iter(make_chunks())) + assert reader.empty() is False + + +def test_reset_index_continuous_index(): + reader = ParquetStreamReader(lambda: iter(make_chunks())) + + new_reader = reader.reset_index(drop=True) + + df = new_reader.read() + + assert df.index.tolist() == [0, 1, 2, 3] + + +def test_reset_index_keeps_old_index_column(): + reader = ParquetStreamReader(lambda: iter(make_chunks())) + + new_reader = reader.reset_index(drop=False) + df = new_reader.read() + + assert "index" in df.columns + assert df.index.tolist() == [0, 1, 2, 3] + + +def test_reset_index_closed_stream_raises(): + reader = ParquetStreamReader(lambda: iter(make_chunks())) + reader.close() + + with pytest.raises(ValueError): + reader.reset_index() + + +def test_next_on_closed_stream_raises(): + reader = ParquetStreamReader(lambda: iter(make_chunks())) + reader.close() + + with pytest.raises(ValueError): + next(reader) + + +def test_context_manager_closes_stream(): + reader = ParquetStreamReader(lambda: iter(make_chunks())) + + with reader as r: + chunk = next(r) + assert len(chunk) == 2 + + with pytest.raises(ValueError): + next(reader) + + +@pytest.mark.parametrize( + "outputs,capture_meta,expected_data_len,expected_meta_len", + [ + ((pd.DataFrame({"a": [1]}),), False, 1, 0), + ((pd.DataFrame({"a": [1]}), "meta"), True, 1, 1), + (([pd.DataFrame({"a": [1]}), pd.DataFrame({"a": [2]})],), False, 2, 0), + (("meta1", "meta2"), True, 0, 2), + ], +) +def test_sort_chunk_outputs_parametrized( + outputs, capture_meta, expected_data_len, expected_meta_len +): + data, meta = _sort_chunk_outputs( + outputs, + capture_meta=capture_meta, + requested_types=(pd.DataFrame,), + ) + + assert len(data) == expected_data_len + assert len(meta) == expected_meta_len + + +def make_df_0(): + return pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + + +def make_series_0(): + return pd.Series([1, 2, 3], name="my_series") + + +@pytest.mark.parametrize( + "inputs,expected_schema_types", + [ + ([make_df_0()], [(pd.DataFrame, make_df_0().columns)]), + ([make_series_0()], [(pd.Series, "my_series")]), + ( + [make_df_0(), make_series_0()], + [ + (pd.DataFrame, make_df_0().columns), + (pd.Series, "my_series"), + ], + ), + ( + [make_df_0(), make_df_0()], + [ + (pd.DataFrame, make_df_0().columns), + (pd.DataFrame, make_df_0().columns), + ], + ), + ], +) +def test_initialize_storage_valid(inputs, expected_schema_types): + temp_dirs, schemas = _initialize_storage(inputs) + + try: + # Correct number of temp dirs created + assert len(temp_dirs) == len(inputs) + + # Ensure they are TemporaryDirectory instances + assert all(isinstance(td, tempfile.TemporaryDirectory) for td in temp_dirs) + + # Check schemas + assert len(schemas) == len(expected_schema_types) + + for (actual_type, actual_meta), (exp_type, exp_meta) in zip( + schemas, expected_schema_types + ): + assert actual_type is exp_type + + if exp_type is pd.DataFrame: + assert list(actual_meta) == list(exp_meta) + else: + assert actual_meta == exp_meta + + finally: + # Clean up temp dirs to avoid ResourceWarning + for td in temp_dirs: + td.cleanup() + + +def test_initialize_storage_empty(): + temp_dirs, schemas = _initialize_storage([]) + + 
assert temp_dirs == [] + assert schemas == [] + + +@pytest.mark.parametrize( + "invalid_input", + [ + [123], + ["string"], + [object()], + [make_df_0(), 42], + ], +) +def test_initialize_storage_invalid_type_raises(invalid_input): + with pytest.raises(TypeError, match="Unsupported data type"): + _initialize_storage(invalid_input) + + +def make_df_1(): + return pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + + +def make_series_1(): + return pd.Series([10, 20], name="s") + + +def read_parquet(path: Path) -> pd.DataFrame: + return pq.read_table(path).to_pandas() + + +@pytest.mark.parametrize( + "batch", + [ + [make_df_1()], + [make_series_1()], + [make_df_1(), make_df_1()], + [make_df_1(), make_series_1()], + ], +) +def test_write_chunks_creates_files(batch): + temp_dirs = [tempfile.TemporaryDirectory() for _ in batch] + + try: + _write_chunks_to_disk(batch, temp_dirs, chunk_counter=0) + + for i, _ in enumerate(batch): + expected_file = Path(temp_dirs[i].name) / "part_00000.parquet" + assert expected_file.exists() + + finally: + for td in temp_dirs: + td.cleanup() + + +@pytest.mark.parametrize( + "counter,expected_name", + [ + (0, "part_00000.parquet"), + (1, "part_00001.parquet"), + (42, "part_00042.parquet"), + (1234, "part_01234.parquet"), + ], +) +def test_chunk_counter_format(counter, expected_name): + batch = [make_df_1()] + temp_dirs = [tempfile.TemporaryDirectory()] + + try: + _write_chunks_to_disk(batch, temp_dirs, chunk_counter=counter) + + expected_file = Path(temp_dirs[0].name) / expected_name + assert expected_file.exists() + + finally: + temp_dirs[0].cleanup() + + +def test_series_written_as_dataframe(): + s = make_series_1() + temp_dirs = [tempfile.TemporaryDirectory()] + + try: + _write_chunks_to_disk([s], temp_dirs, chunk_counter=0) + + file_path = Path(temp_dirs[0].name) / "part_00000.parquet" + df = read_parquet(file_path) + + # Series becomes single-column dataframe + assert list(df.columns) == ["s"] + assert df["s"].tolist() == [10, 20] + + finally: + temp_dirs[0].cleanup() + + +def test_index_is_preserved(): + df = make_df_1() + df.index = ["x", "y"] + + temp_dirs = [tempfile.TemporaryDirectory()] + + try: + _write_chunks_to_disk([df], temp_dirs, chunk_counter=0) + + file_path = Path(temp_dirs[0].name) / "part_00000.parquet" + result = read_parquet(file_path) + + assert list(result.index) == ["x", "y"] + + finally: + temp_dirs[0].cleanup() + + +def test_multiple_chunk_writes(): + batch = [make_df_1()] + temp_dirs = [tempfile.TemporaryDirectory()] + + try: + _write_chunks_to_disk(batch, temp_dirs, chunk_counter=0) + _write_chunks_to_disk(batch, temp_dirs, chunk_counter=1) + + file0 = Path(temp_dirs[0].name) / "part_00000.parquet" + file1 = Path(temp_dirs[0].name) / "part_00001.parquet" + + assert file0.exists() + assert file1.exists() + + finally: + temp_dirs[0].cleanup() + + +def test_mismatched_temp_dirs_raises_index_error(): + batch = [make_df_1(), make_df_1()] + temp_dirs = [tempfile.TemporaryDirectory()] # only one dir + + try: + with pytest.raises(IndexError): + _write_chunks_to_disk(batch, temp_dirs, chunk_counter=0) + finally: + temp_dirs[0].cleanup() + + +def write_parquet(path: Path, df: pd.DataFrame): + df.to_parquet(path, index=True) + + +def make_df(values, columns=("a",)): + return pd.DataFrame(values, columns=columns) + + +def test_parquet_generator_dataframe(): + temp_dir = tempfile.TemporaryDirectory() + + try: + df1 = make_df([[1], [2]]) + df2 = make_df([[3], [4]]) + + write_parquet(Path(temp_dir.name) / "part_00000.parquet", df1) + 
write_parquet(Path(temp_dir.name) / "part_00001.parquet", df2) + + gen = _parquet_generator( + temp_dir=temp_dir, + data_type=pd.DataFrame, + schema=df1.columns, + ) + + outputs = list(gen) + + assert len(outputs) == 2 + pd.testing.assert_frame_equal(outputs[0], df1) + pd.testing.assert_frame_equal(outputs[1], df2) + + finally: + # Generator should already cleanup, but ensure no crash + if Path(temp_dir.name).exists(): + temp_dir.cleanup() + + +def test_parquet_generator_series(): + temp_dir = tempfile.TemporaryDirectory() + + try: + df1 = make_df([[10], [20]]) + df2 = make_df([[30], [40]]) + + write_parquet(Path(temp_dir.name) / "part_00000.parquet", df1) + write_parquet(Path(temp_dir.name) / "part_00001.parquet", df2) + + gen = _parquet_generator( + temp_dir=temp_dir, + data_type=pd.Series, + schema="my_series", + ) + + outputs = list(gen) + + assert len(outputs) == 2 + assert isinstance(outputs[0], pd.Series) + assert outputs[0].name == "my_series" + assert outputs[0].tolist() == [10, 20] + assert outputs[1].tolist() == [30, 40] + + finally: + if Path(temp_dir.name).exists(): + temp_dir.cleanup() + + +def test_files_are_read_sorted(): + temp_dir = tempfile.TemporaryDirectory() + + try: + df1 = make_df([[1]]) + df2 = make_df([[2]]) + + # Intentionally reversed names + write_parquet(Path(temp_dir.name) / "part_00001.parquet", df2) + write_parquet(Path(temp_dir.name) / "part_00000.parquet", df1) + + gen = _parquet_generator( + temp_dir=temp_dir, + data_type=pd.DataFrame, + schema=df1.columns, + ) + + outputs = list(gen) + + # Should be sorted lexicographically + assert outputs[0]["a"].iloc[0] == 1 + assert outputs[1]["a"].iloc[0] == 2 + + finally: + if Path(temp_dir.name).exists(): + temp_dir.cleanup() + + +def test_empty_directory_yields_nothing(): + temp_dir = tempfile.TemporaryDirectory() + + gen = _parquet_generator( + temp_dir=temp_dir, + data_type=pd.DataFrame, + schema=None, + ) + + outputs = list(gen) + assert outputs == [] + + +def test_cleanup_after_full_iteration(): + temp_dir = tempfile.TemporaryDirectory() + + df = make_df([[1]]) + write_parquet(Path(temp_dir.name) / "part_00000.parquet", df) + + gen = _parquet_generator( + temp_dir=temp_dir, + data_type=pd.DataFrame, + schema=df.columns, + ) + + list(gen) + + # Directory should be removed after generator finishes + assert not Path(temp_dir.name).exists() + + +def test_cleanup_on_partial_iteration(): + temp_dir = tempfile.TemporaryDirectory() + + df1 = make_df([[1]]) + df2 = make_df([[2]]) + + write_parquet(Path(temp_dir.name) / "part_00000.parquet", df1) + write_parquet(Path(temp_dir.name) / "part_00001.parquet", df2) + + gen = _parquet_generator( + temp_dir=temp_dir, + data_type=pd.DataFrame, + schema=df1.columns, + ) + + next(gen) # consume one element + gen.close() # trigger generator finalization + + assert not Path(temp_dir.name).exists() + + +def make_reader(chunks): + return ParquetStreamReader(lambda: iter(chunks)) + + +def df(val): + return pd.DataFrame({"a": [val]}) + + +def test_process_chunks_data_only(): + readers = [make_reader([df(1), df(2)])] + + def func(x): + return x * 2 + + result = _process_chunks( + readers=readers, + func=func, + requested_types=(pd.DataFrame,), + static_args=[], + static_kwargs={}, + non_data_output="first", + non_data_proc=None, + non_data_proc_args=(), + non_data_proc_kwargs={}, + ) + + data_reader = result[0] + output = data_reader.read() + + assert output["a"].tolist() == [2, 4] + + +def test_metadata_only_first_chunk(): + readers = [make_reader([df(1), df(2)])] + + def func(x): + 
return x, f"meta_{x['a'].iloc[0]}" + + result = _process_chunks( + readers=readers, + func=func, + requested_types=(pd.DataFrame,), + static_args=[], + static_kwargs={}, + non_data_output="first", + non_data_proc=None, + non_data_proc_args=(), + non_data_proc_kwargs={}, + ) + + data_reader, meta = result + + assert data_reader.read()["a"].tolist() == [1, 2] + assert meta == "meta_1" # only first chunk captured + + +def test_metadata_accumulation(): + readers = [make_reader([df(1), df(2)])] + + def func(x): + return x, x["a"].iloc[0] + + result = _process_chunks( + readers=readers, + func=func, + requested_types=(pd.DataFrame,), + static_args=[], + static_kwargs={}, + non_data_output="acc", + non_data_proc=None, + non_data_proc_args=(), + non_data_proc_kwargs={}, + ) + + _, meta = result + + assert meta == [1, 2] + + +def test_non_data_proc_applied_helper(): + readers = [make_reader([df(1), df(2)])] + + def func(x): + return x, x["a"].iloc[0] + + def processor(meta): + return sum(meta) + + result = _process_chunks( + readers=readers, + func=func, + requested_types=(pd.DataFrame,), + static_args=[], + static_kwargs={}, + non_data_output="acc", + non_data_proc=processor, + non_data_proc_args=(), + non_data_proc_kwargs={}, + ) + + _, meta = result + + assert meta == 3 + + +def test_only_metadata_output(): + readers = [make_reader([df(1), df(2)])] + + def func(x): + return x["a"].iloc[0] + + result = _process_chunks( + readers=readers, + func=func, + requested_types=(pd.DataFrame,), + static_args=[], + static_kwargs={}, + non_data_output="acc", + non_data_proc=None, + non_data_proc_args=(), + non_data_proc_kwargs={}, + ) + + # Should return metadata only + assert result == [1, 2] + + +def test_empty_iterable_raises(): + readers = [make_reader([])] + + def func(x): + return x + + with pytest.raises(ValueError, match="Iterable is empty"): + _process_chunks( + readers=readers, + func=func, + requested_types=(pd.DataFrame,), + static_args=[], + static_kwargs={}, + non_data_output="first", + non_data_proc=None, + non_data_proc_args=(), + non_data_proc_kwargs={}, + ) + + +def test_invalid_type_raises(): + readers = [make_reader(["not_df"])] + + def func(x): + return x + + with pytest.raises(TypeError): + _process_chunks( + readers=readers, + func=func, + requested_types=(pd.DataFrame,), + static_args=[], + static_kwargs={}, + non_data_output="first", + non_data_proc=None, + non_data_proc_args=(), + non_data_proc_kwargs={}, + ) + + +def test_multiple_readers(): + r1 = make_reader([df(1), df(2)]) + r2 = make_reader([df(10), df(20)]) + + def func(x, y): + return x + y + + result = _process_chunks( + readers=[r1, r2], + func=func, + requested_types=(pd.DataFrame,), + static_args=[], + static_kwargs={}, + non_data_output="first", + non_data_proc=None, + non_data_proc_args=(), + non_data_proc_kwargs={}, + ) + + data_reader = result[0] + output = data_reader.read() + + assert output["a"].tolist() == [11, 22] + + +def make_reader_2(values=None): + if values is None: + values = [] + return ParquetStreamReader(lambda: iter(values)) + + +def make_df_2(val): + return pd.DataFrame({"a": [val]}) + + +def test_base_reader_only(): + base = make_reader_2([make_df_2(1)]) + + readers, args, kwargs = _prepare_readers( + reader=base, + func_args=[], + func_kwargs={}, + makecopy=False, + ) + + assert readers == [base] + assert args == [] + assert kwargs == {} + + +@pytest.mark.parametrize( + "func_args,expected_reader_count,expected_static_len", + [ + ([], 1, 0), + ([123], 1, 1), + ([make_reader_2()], 2, 0), + 
([make_reader_2(), 999], 2, 1), + ], +) +def test_func_args_separation(func_args, expected_reader_count, expected_static_len): + base = make_reader_2([make_df_2(1)]) + + readers, args, kwargs = _prepare_readers( + reader=base, + func_args=func_args, + func_kwargs={}, + makecopy=False, + ) + + assert len(readers) == expected_reader_count + assert len(args) == expected_static_len + assert kwargs == {} + + +def test_func_kwargs_separation(): + base = make_reader_2([make_df_2(1)]) + reader_kw = make_reader_2([make_df_2(2)]) + + readers, args, kwargs = _prepare_readers( + reader=base, + func_args=[], + func_kwargs={"r": reader_kw, "x": 42}, + makecopy=False, + ) + + assert len(readers) == 2 + assert args == [] + assert kwargs == {"x": 42} + + +def test_reader_ordering(): + base = make_reader_2() + r1 = make_reader_2() + r2 = make_reader_2() + + readers, _, _ = _prepare_readers( + reader=base, + func_args=[r1], + func_kwargs={"k": r2}, + makecopy=False, + ) + + assert readers[0] is base + assert readers[1] is r1 + assert readers[2] is r2 + + +def test_makecopy_false_preserves_identity(): + base = make_reader_2() + r1 = make_reader_2() + + readers, _, _ = _prepare_readers( + reader=base, + func_args=[r1], + func_kwargs={}, + makecopy=False, + ) + + assert readers[0] is base + assert readers[1] is r1 + + +def test_makecopy_true_creates_copies(): + base = make_reader_2([make_df_2(1)]) + r1 = make_reader_2([make_df_2(2)]) + + readers, _, _ = _prepare_readers( + reader=base, + func_args=[r1], + func_kwargs={}, + makecopy=True, + ) + + # Copies should not be the same object + assert readers[0] is not base + assert readers[1] is not r1 + + # But should behave identically + assert readers[0].read()["a"].tolist() == [1] + assert readers[1].read()["a"].tolist() == [2] + + +def test_empty_args_and_kwargs(): + base = make_reader_2() + + readers, args, kwargs = _prepare_readers( + reader=base, + func_args=[], + func_kwargs={}, + makecopy=False, + ) + + assert readers == [base] + assert args == [] + assert kwargs == {} + + +def make_df_3(val): + return pd.DataFrame({"a": [val]}) + + +def make_series_3(val, name="s"): + return pd.Series([val], name=name) + + +def reader_from_list(items): + return iter(items) + + +@pytest.mark.parametrize( + "input_data,requested_types", + [ + ([make_df_3(1), make_df_3(2)], (pd.DataFrame,)), + ([make_series_3(10), make_series_3(20)], (pd.Series,)), + ], +) +def test_basic_processing(input_data, requested_types): + def func(x): + return x + + result = process_disk_backed( + reader=reader_from_list(input_data), + func=func, + requested_types=requested_types, + ) + + # First element is a generator + gen = result[0] + + output = list(gen) + assert all(isinstance(o, requested_types) for o in output) + + if isinstance(output[0], pd.DataFrame): + assert [row["a"].iloc[0] for row in output] == [ + df["a"].iloc[0] for df in input_data if isinstance(df, pd.DataFrame) + ] + else: + assert [o.iloc[0] for o in output] == [ + s.iloc[0] for s in input_data if isinstance(s, pd.Series) + ] + + +def test_non_data_first_mode(): + def func(df): + return df, df["a"].iloc[0] + + result = process_disk_backed( + reader=reader_from_list([make_df_3(1), make_df_3(2)]), + func=func, + non_data_output="first", + ) + + gen, meta = result + + # Only first chunk captured + assert meta == 1 + output = list(gen) + assert [row["a"].iloc[0] for row in output] == [1, 2] + + +def test_non_data_acc_mode(): + def func(df): + return df, df["a"].iloc[0] + + result = process_disk_backed( + 
reader=reader_from_list([make_df_3(1), make_df_3(2)]), + func=func, + non_data_output="acc", + ) + + gen, meta = result + assert meta == [1, 2] + + output = list(gen) + assert [row["a"].iloc[0] for row in output] == [1, 2] + + +def test_non_data_proc_applied_function(): + def func(df): + return df, df["a"].iloc[0] + + def processor(meta, factor): + return [x * factor for x in meta] + + result = process_disk_backed( + reader=reader_from_list([make_df_3(1), make_df_3(2)]), + func=func, + non_data_output="acc", + non_data_proc=processor, + non_data_proc_args=(10,), + non_data_proc_kwargs={}, + ) + + gen, meta = result + assert meta == [10, 20] + + output = list(gen) + assert [row["a"].iloc[0] for row in output] == [1, 2] + + +def test_func_args_kwargs(): + def func(df, val, extra=0): + return df * val + extra + + result = process_disk_backed( + reader=reader_from_list([make_df_3(1), make_df_3(2)]), + func=func, + func_args=[2], + func_kwargs={"extra": 5}, + ) + + gen = result[0] + output = list(gen) + assert [row["a"].iloc[0] for row in output] == [1 * 2 + 5, 2 * 2 + 5] + + +def test_empty_iterator_raises(): + def func(x): + return x + + with pytest.raises(ValueError, match="Iterable is empty"): + process_disk_backed( + reader=reader_from_list([]), + func=func, + ) + + +def test_requested_types_single_type(): + def func(x): + return x + + input_data = [make_df_3(1)] + # requested_types as single type + result = process_disk_backed( + reader=reader_from_list(input_data), + func=func, + requested_types=pd.DataFrame, + ) + + gen = result[0] + output = list(gen) + assert all(isinstance(o, pd.DataFrame) for o in output) + + +def test_parquet_stream_from_iterable_dataframe(): + dfs = [make_df_3(1), make_df_3(2)] + reader = parquet_stream_from_iterable(dfs) + + assert isinstance(reader, ParquetStreamReader) + output = list(reader) + assert all(isinstance(df, pd.DataFrame) for df in output) + assert [df["a"].iloc[0] for df in output] == [1, 2] + + +def test_parquet_stream_from_iterable_series(): + series_list = [make_series_3(10), make_series_3(20)] + reader = parquet_stream_from_iterable(series_list) + + assert isinstance(reader, ParquetStreamReader) + output = list(reader) + assert all(isinstance(s, pd.Series) for s in output) + assert [s.iloc[0] for s in output] == [10, 20] + + +def test_parquet_stream_from_iterable_empty_raises(): + with pytest.raises(ValueError, match="Iterable is empty"): + parquet_stream_from_iterable([]) + + +def test_parquet_stream_from_iterable_mixed_types_raises(): + dfs = [make_df_3(1), make_series_3(2)] + with pytest.raises(TypeError, match="All chunks must be of the same type"): + parquet_stream_from_iterable(dfs) + + +def test_parquet_stream_from_iterable_wrong_type_first_raises(): + with pytest.raises( + TypeError, match="Iterable must contain pd.DataFrame or pd.Series" + ): + parquet_stream_from_iterable([123, 456]) + + +def test_ensure_parquet_reader_returns_existing_reader(): + reader = parquet_stream_from_iterable([make_df_3(1)]) + result = ensure_parquet_reader(reader) + assert result is reader + + +def test_ensure_parquet_reader_converts_iterator(): + dfs = [make_df_3(1), make_df_3(2)] + iterator = iter(dfs) + result = ensure_parquet_reader(iterator) + assert isinstance(result, ParquetStreamReader) + output = list(result) + assert [df["a"].iloc[0] for df in output] == [1, 2] + + +def test_ensure_parquet_reader_returns_non_iterator_unchanged(): + obj = 123 + result = ensure_parquet_reader(obj) + assert result == 123 + + +@pytest.mark.parametrize( + 
"value,expected", + [ + (iter([1, 2, 3]), True), # iterator + ((x for x in range(5)), True), # generator expression + ([1, 2, 3], False), # list + ((1, 2, 3), False), # tuple + (123, False), # int + ("abc", False), # string + (None, False), # None + ], +) +def test_is_valid_iterator(value, expected): + assert is_valid_iterator(value) is expected + + +def test_non_process_function_returns(): + val = 123 + assert _process_function(val) == val + + +def test_dataframe_calls_func_directly(): + df = make_df_3(5) + + called = {} + + def func(d): + called["data"] = d + return d["a"].iloc[0] * 2 + + pf = ProcessFunction(df, func) + result = _process_function(pf) + + assert result == 10 + assert called["data"] is df + + +def test_series_calls_func_directly(): + s = make_series_3(7) + + def func(x): + return x.iloc[0] + 3 + + pf = ProcessFunction(s, func) + result = _process_function(pf) + assert result == 10 + + +def test_xarray_dataset_direct_call(): + ds = xr.Dataset({"a": ("x", [1, 2])}) + + def func(x): + return x["a"].sum().item() + + pf = ProcessFunction(ds, func) + result = _process_function(pf) + assert result == 3 + + +def test_iterator_of_dataframes_disk_backed(): + dfs = [make_df_3(1), make_df_3(2)] + it = iter(dfs) + + def func(df): + return df["a"].iloc[0] * 10 + + pf = ProcessFunction(it, func, non_data_output="acc") + result = _process_function(pf) + assert result == [10, 20] + + +def test_list_of_dataframes_disk_backed(): + dfs = [make_df_3(3), make_df_3(4)] + + def func(df): + return df["a"].iloc[0] * 2 + + pf = ProcessFunction(dfs, func, non_data_output="acc") + result = _process_function(pf) + assert result == [6, 8] + + +def test_data_only_returns_first(): + dfs = [make_df_3(1)] + pf = ProcessFunction(dfs, lambda df: df) + result = _process_function(pf, data_only=True) + assert isinstance(result, ParquetStreamReader) + + +def test_unsupported_type_raises(): + pf = ProcessFunction(12345, lambda x: x) + with pytest.raises(TypeError, match="Unsupported data type"): + _process_function(pf) + + +def test_basic_dataframe_decorator(): + @process_function() + def func(df): + return df * 2 + + df = make_df_3(3) + result = func(df) + assert isinstance(result, pd.DataFrame) + assert result["a"].iloc[0] == 6 + + +def test_iterable_returns_disk_backed(): + @process_function() + def func(dfs): + return dfs + + dfs = [make_df_3(1), make_df_3(2)] + result = func(dfs) + + assert isinstance(result, list) + assert len(result) == 2 + + pd.testing.assert_frame_equal(result[0], pd.DataFrame({"a": [1]})) + pd.testing.assert_frame_equal(result[1], pd.DataFrame({"a": [2]})) + + +def test_data_only_returns_generator_only(): + @process_function(data_only=True) + def func(dfs): + return dfs + + dfs = [make_df_3(1)] + result = func(dfs) + + assert isinstance(result, list) + assert len(result) == 1 + + pd.testing.assert_frame_equal(result[0], pd.DataFrame({"a": [1]})) + + +def test_postprocessing_not_callable_raises(): + @process_function(postprocessing={"func": 123, "kwargs": []}) + def func(df): + return df + + df = make_df_3(1) + with pytest.raises(ValueError, match="is not callable"): + func(df) diff --git a/tests/test_databundle.py b/tests/test_databundle.py index 57f4adf7..175f006a 100755 --- a/tests/test_databundle.py +++ b/tests/test_databundle.py @@ -119,7 +119,6 @@ def test_copy_df(sample_db_df): def test_copy_reader(sample_db_reader): db_cp = sample_db_reader.copy() - pd.testing.assert_frame_equal(sample_db_reader.data.read(), db_cp.data.read()) 
pd.testing.assert_frame_equal(sample_db_reader.mask.read(), db_cp.mask.read()) @@ -315,7 +314,7 @@ def test_select_operators_reader( @pytest.mark.parametrize( "func, args, idx_exp", [ - # ("select_where_all_true", [], [0, 1, 2], [3, 4]), + # ("select_where_all_true", [[0, 1, 2]], [3, 4]), # ("select_where_all_false", [], [3], [0, 1, 2, 4]), ("select_where_index_isin", [[0, 2, 4]], [0, 2, 4]), # ("select_where_entry_isin", [{("core", "ID"): [25629, 26558]}], [1, 3]), diff --git a/tests/test_mdf_reader.py b/tests/test_mdf_reader.py index 2cee982c..18b03c7d 100755 --- a/tests/test_mdf_reader.py +++ b/tests/test_mdf_reader.py @@ -15,6 +15,7 @@ validate_read_mdf_args, ) from cdm_reader_mapper.mdf_reader.utils.filereader import _apply_multiindex +from cdm_reader_mapper.common.iterators import ParquetStreamReader from cdm_reader_mapper.mdf_reader.utils.utilities import ( read_csv, read_parquet, @@ -404,8 +405,8 @@ def test_read_data_textfilereader(): ]: assert hasattr(db, attr) - assert isinstance(db.data, pd.io.parsers.TextFileReader) - assert isinstance(db.mask, pd.io.parsers.TextFileReader) + assert isinstance(db.data, ParquetStreamReader) + assert isinstance(db.mask, ParquetStreamReader) assert isinstance(db.columns, pd.MultiIndex) assert isinstance(db.dtypes, pd.Series) assert db.parse_dates == [] diff --git a/tests/test_metmetpy.py b/tests/test_metmetpy.py index b8e9ae8c..1d7f13f5 100755 --- a/tests/test_metmetpy.py +++ b/tests/test_metmetpy.py @@ -7,6 +7,8 @@ from io import StringIO +from cdm_reader_mapper.common.iterators import ParquetStreamReader + from cdm_reader_mapper.metmetpy import properties from cdm_reader_mapper.metmetpy.datetime.correction_functions import dck_201_icoads from cdm_reader_mapper.metmetpy.datetime.model_datetimes import ( @@ -645,9 +647,43 @@ def test_correct_datetime_textfilereader(): result = correct_datetime(parser, "icoads_r300_d201").read() - pd.testing.assert_frame_equal( - result.reset_index(drop=True), expected.reset_index(drop=True) - ) + pd.testing.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data", ["invalid_data", 1, 1.0, True, {"1": 2}, {1, 2}]) +def test_correct_datetime_invalid_data(data): + with pytest.raises(TypeError, match="Unsupported data type"): + correct_datetime(data, "icoads_r300_d201") + + +def test_correct_datetime_series(): + with pytest.raises(TypeError, match="pd.Series is not supported now."): + correct_datetime(pd.Series([1, 2, 3]), "icoads_r300_d201") + + +@pytest.mark.parametrize("data", [[1, 2], (1, 2)]) +def test_correct_datetime_invalid_iterable_entries(data): + with pytest.raises( + TypeError, match="Iterable must contain pd.DataFrame or pd.Series objects." 
+ ): + correct_datetime(data, "icoads_r300_d201") + + +@pytest.mark.parametrize( + "data", [ParquetStreamReader(iter([])), ParquetStreamReader(iter(()))] +) +def test_correct_datetime_empty_iterable(data): + with pytest.raises(ValueError, match="Iterable is empty."): + correct_datetime(data, "icoads_r300_d201") + + +def test_correct_datetime_valid_iterable(): + df1 = pd.DataFrame({YR: [1899], MO: [1], DY: [1], HR: [0]}, index=[0]) + df2 = pd.DataFrame({YR: [1900], MO: [1], DY: [1], HR: [12]}, index=[1]) + result = correct_datetime(ParquetStreamReader(iter([df1, df2])), "icoads_r300_d201") + + exp = pd.DataFrame({YR: [1898, 1900], MO: [12, 1], DY: [31, 1], HR: [0, 12]}) + pd.testing.assert_frame_equal(result.read(), exp) @pytest.mark.parametrize( @@ -716,30 +752,61 @@ def test_correct_pt_textfilereader(csv_text, names, imodel, expected): dtype=object, skip_blank_lines=False, ) - result = ( - correct_pt(parser, imodel, log_level="CRITICAL").read().reset_index(drop=True) - ) - pd.testing.assert_frame_equal(result, expected, check_dtype=False) + result = correct_pt(parser, imodel, log_level="CRITICAL") + pd.testing.assert_frame_equal(result.read(), expected, check_dtype=False) + + +@pytest.mark.parametrize("data", ["invalid_data", 1, 1.0, True, {"1": 2}, {1, 2}]) +def test_correct_pt_invalid_data(data): + with pytest.raises(TypeError, match="Unsupported data type"): + correct_pt(data, "icoads_r300_d993") + + +def test_correct_pt_series(): + with pytest.raises(TypeError, match="pd.Series is not supported now."): + correct_pt(pd.Series([1, 2, 3]), "icoads_r300_d993") + + +@pytest.mark.parametrize("data", [[1, 2], (1, 2)]) +def test_correct_pt_invalid_iterable_entries(data): + with pytest.raises( + TypeError, match="Iterable must contain pd.DataFrame or pd.Series objects." 
+ ): + correct_pt(data, "icoads_r300_d993") + + +@pytest.mark.parametrize( + "data", [ParquetStreamReader(iter([])), ParquetStreamReader(iter(()))] +) +def test_correct_pt_empty_iterable(data): + with pytest.raises(ValueError, match="Iterable is empty."): + correct_pt(data, "icoads_r300_d993") + + +def test_correct_pt_valid_iterable(): + df1 = pd.DataFrame({PT: [None, "7", None]}, index=[0, 1, 2]) + df2 = pd.DataFrame({PT: ["6", "7", None]}, index=[3, 4, 5]) + result = correct_pt(ParquetStreamReader(iter([df1, df2])), "icoads_r300_d993") + + exp = pd.DataFrame({PT: ["5", "7", "5", "6", "7", "5"]}) + pd.testing.assert_frame_equal(result.read(), exp) def test_get_id_col_not_defined(): - logger = logging.getLogger("test_logger") df = pd.DataFrame({"X": [1, 2, 3]}) - result = _get_id_col(df, "unknown_model", logger) - assert result is None + with pytest.raises(ValueError, match="ID column not defined in properties file"): + _get_id_col(df, "unknown_model") def test_get_id_col_missing_in_data(): - logger = logging.getLogger("test_logger") df = pd.DataFrame({"X": [1, 2, 3]}) - result = _get_id_col(df, "icoads", logger) - assert result is None + with pytest.raises(ValueError, match="No ID columns found."): + _get_id_col(df, "icoads") def test_get_id_col_single_column_present(): - logger = logging.getLogger("test_logger") df = pd.DataFrame({("core", "ID"): [1, 2, 3], ("other", "ID"): [4, 5, 6]}) - result = _get_id_col(df, "icoads", logger) + result = _get_id_col(df, "icoads") assert result == ("core", "ID") @@ -831,9 +898,8 @@ def test_validate_id_textfilereader(): ) result = validate_id(parser, "icoads_r300_d201", blank=False, log_level="CRITICAL") expected = pd.Series([True, False, True], name=ID) - pd.testing.assert_series_equal( - result.reset_index(drop=True), expected, check_dtype=False - ) + + pd.testing.assert_series_equal(result.read(), expected) @pytest.mark.parametrize( @@ -857,9 +923,7 @@ def test_validate_id_textfilereader(): ) def test_validate_datetime_dataframe(data_input, expected): result = validate_datetime(data_input.copy(), "icoads", log_level="CRITICAL") - pd.testing.assert_series_equal( - result.reset_index(drop=True), expected, check_dtype=False - ) + pd.testing.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -879,6 +943,4 @@ def test_validate_datetime_textfilereader(csv_text, expected): skip_blank_lines=False, ) result = validate_datetime(parser, "icoads", log_level="CRITICAL") - pd.testing.assert_series_equal( - result.reset_index(drop=True), expected, check_dtype=False - ) + pd.testing.assert_series_equal(result.read(), expected) diff --git a/tests/test_pandas.py b/tests/test_pandas.py index e11adbf6..f4a2a2e5 100755 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -2,7 +2,6 @@ import pandas as pd import pytest -from io import StringIO from cdm_reader_mapper import DataBundle @@ -14,33 +13,25 @@ def sample_df(): return DataBundle(data=data) -@pytest.fixture -def sample_text_reader(): - """Fixture that returns a TextFileReader.""" - csv_data = "A,B\n1,x\n2,y\n, z" - data = pd.read_csv(StringIO(csv_data), chunksize=1) - return DataBundle(data=data) - - -@pytest.mark.parametrize("fixture_name", ["sample_df", "sample_text_reader"]) +@pytest.mark.parametrize("fixture_name", ["sample_df"]) def test_index(request, fixture_name): obj = request.getfixturevalue(fixture_name) assert list(obj.index) == [0, 1, 2] -@pytest.mark.parametrize("fixture_name", ["sample_df", "sample_text_reader"]) +@pytest.mark.parametrize("fixture_name", ["sample_df"]) def 
test_size(request, fixture_name): obj = request.getfixturevalue(fixture_name) assert obj.size == 6 -@pytest.mark.parametrize("fixture_name", ["sample_df", "sample_text_reader"]) +@pytest.mark.parametrize("fixture_name", ["sample_df"]) def test_shape(request, fixture_name): obj = request.getfixturevalue(fixture_name) assert obj.shape == (3, 2) -@pytest.mark.parametrize("fixture_name", ["sample_df", "sample_text_reader"]) +@pytest.mark.parametrize("fixture_name", ["sample_df"]) def test_dropna(request, fixture_name): obj = request.getfixturevalue(fixture_name) @@ -52,7 +43,7 @@ def test_dropna(request, fixture_name): assert dropped["A"].isna().sum() == 0 -@pytest.mark.parametrize("fixture_name", ["sample_df", "sample_text_reader"]) +@pytest.mark.parametrize("fixture_name", ["sample_df"]) def test_rename(request, fixture_name): obj = request.getfixturevalue(fixture_name) @@ -64,7 +55,7 @@ def test_rename(request, fixture_name): assert "A" not in renamed.columns -@pytest.mark.parametrize("fixture_name", ["sample_df", "sample_text_reader"]) +@pytest.mark.parametrize("fixture_name", ["sample_df"]) def test_rename_inplace(request, fixture_name): obj = request.getfixturevalue(fixture_name) @@ -73,7 +64,7 @@ def test_rename_inplace(request, fixture_name): assert "A_new" in obj.columns -@pytest.mark.parametrize("fixture_name", ["sample_df", "sample_text_reader"]) +@pytest.mark.parametrize("fixture_name", ["sample_df"]) def test_iloc(request, fixture_name): obj = request.getfixturevalue(fixture_name) diff --git a/tests/test_reader_filereader.py b/tests/test_reader_filereader.py index 89badf3a..09222535 100755 --- a/tests/test_reader_filereader.py +++ b/tests/test_reader_filereader.py @@ -5,9 +5,6 @@ import pandas as pd import xarray as xr -from io import StringIO - -from pandas.io.parsers import TextFileReader from pandas.testing import assert_frame_equal, assert_index_equal from cdm_reader_mapper import DataBundle @@ -15,7 +12,6 @@ from cdm_reader_mapper.mdf_reader.utils.parser import OrderSpec, ParserConfig from cdm_reader_mapper.mdf_reader.utils.filereader import ( - _apply_or_chunk, _merge_kwargs, _apply_multiindex, _select_years, @@ -68,22 +64,6 @@ def test_select_years_handles_non_numeric(): assert out["YR"].tolist() == ["2000", "2001"] -def test_apply_or_chunk_dataframe(): - df = pd.DataFrame({"test": [1, 2, 3, 4]}) - out = _apply_or_chunk(df, f, func_args=[2]) - assert isinstance(out, pd.DataFrame) - assert_frame_equal(out, pd.DataFrame({"test": [3, 4, 5, 6]})) - - -def test_apply_or_chunk_textfilereader(): - buffer = StringIO("test\n1\n2\n3\n4") - read_kwargs = {"chunksize": 2} - reader = pd.read_csv(buffer, **read_kwargs) - (out,) = _apply_or_chunk(reader, f, func_args=[2], read_kwargs=read_kwargs) - assert isinstance(out, TextFileReader) - assert_frame_equal(out.read(), pd.DataFrame({"test": [3, 4, 5, 6]})) - - @pytest.fixture def dtypes(): return { diff --git a/tests/test_reader_utilities.py b/tests/test_reader_utilities.py index 6685d145..55821641 100755 --- a/tests/test_reader_utilities.py +++ b/tests/test_reader_utilities.py @@ -4,7 +4,6 @@ import pytest from io import StringIO -from pandas.io.parsers import TextFileReader from pathlib import Path from cdm_reader_mapper.mdf_reader.utils.utilities import ( @@ -21,7 +20,12 @@ convert_str_boolean, _remove_boolean_values, remove_boolean_values, - process_textfilereader, +) + +from cdm_reader_mapper.common.iterators import ( + ParquetStreamReader, + process_disk_backed, + parquet_stream_from_iterable, ) @@ -34,7 +38,8 @@ def 
make_parser(text: str, chunksize: int = 1) -> pd.io.parsers.TextFileReader: @pytest.fixture def sample_reader() -> pd.io.parsers.TextFileReader: buffer = StringIO("A,B\n1,2\n3,4\n") - return pd.read_csv(buffer, chunksize=1) + reader = pd.read_csv(buffer, chunksize=1) + return parquet_stream_from_iterable(reader) @pytest.fixture @@ -246,36 +251,53 @@ def test_remove_boolean_values(): assert result["B"].dtype.name == "int64" -def test_process_textfilereader(sample_reader): - reader_out, extra_out = process_textfilereader( - sample_reader, sample_func, read_kwargs={"chunksize": 1} - ) - assert isinstance(reader_out, TextFileReader) - df_out = reader_out.read() - assert df_out.shape == (2, 2) - assert df_out["A"].iloc[0] == 2 - assert df_out["B"].iloc[1] == 8 +def test_process_textfilereader_basic(sample_reader): + reader_out, extra_out = process_disk_backed(sample_reader, sample_func) + assert isinstance(reader_out, ParquetStreamReader) + + chunk1 = reader_out.get_chunk() + assert chunk1.shape == (1, 2) + assert chunk1.iloc[0]["A"] == 2 + + chunk2 = reader_out.get_chunk() + assert chunk2.shape == (1, 2) + assert chunk2.iloc[0]["B"] == 8 + + assert isinstance(extra_out, dict) assert extra_out == {"note": "first_chunk_only"} + with pytest.raises(StopIteration): + reader_out.get_chunk() + def test_process_textfilereader_only_df(sample_reader): - (reader_out,) = process_textfilereader( - sample_reader, sample_func_only_df, read_kwargs={"chunksize": 1} - ) - assert isinstance(reader_out, TextFileReader) - df_out = reader_out.read() - assert df_out.shape == (2, 2) - assert df_out["A"].iloc[0] == 2 - assert df_out["B"].iloc[1] == 8 + reader_out, extra_out = process_disk_backed(sample_reader, sample_func_only_df) + assert isinstance(reader_out, ParquetStreamReader) + + chunk1 = reader_out.get_chunk() + assert chunk1.shape == (1, 2) + assert chunk1.iloc[0]["A"] == 2 + + chunk2 = reader_out.get_chunk() + assert chunk2.shape == (1, 2) + assert chunk2.iloc[0]["B"] == 8 + + assert extra_out == {} def test_process_textfilereader_makecopy_flag(sample_reader): - reader_out, extra_out = process_textfilereader( - sample_reader, sample_func, makecopy=True, read_kwargs={"chunksize": 1} + reader_out, extra_out = process_disk_backed( + sample_reader, sample_func, makecopy=True ) - assert isinstance(reader_out, TextFileReader) - df_out = reader_out.read() - assert df_out.shape == (2, 2) - assert df_out["A"].iloc[0] == 2 - assert df_out["B"].iloc[1] == 8 + assert isinstance(reader_out, ParquetStreamReader) + + chunk1 = reader_out.get_chunk() + assert chunk1.shape == (1, 2) + assert chunk1.iloc[0]["A"] == 2 + + chunk2 = reader_out.get_chunk() + assert chunk2.shape == (1, 2) + assert chunk2.iloc[0]["B"] == 8 + + assert isinstance(extra_out, dict) assert extra_out == {"note": "first_chunk_only"}
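+
+
+# A minimal usage sketch rather than part of the original suite: it assumes
+# the `sample_reader` fixture and `sample_func` helper defined above, and that
+# `sample_func` maps the two one-row chunks so that A[0] == 2 and B[1] == 8
+# (the same values the chunk-wise tests above assert).
+def test_process_disk_backed_read_all(sample_reader):
+    reader_out, extra_out = process_disk_backed(sample_reader, sample_func)
+    assert isinstance(reader_out, ParquetStreamReader)
+
+    # ParquetStreamReader.read() drains the remaining chunks into one DataFrame.
+    df_out = reader_out.read()
+    assert df_out.shape == (2, 2)
+    assert df_out["A"].iloc[0] == 2
+    assert df_out["B"].iloc[1] == 8
+    assert extra_out == {"note": "first_chunk_only"}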