diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5147e8f1..50bf3fb9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -79,6 +79,7 @@ repos: hooks: - id: codespell additional_dependencies: [ 'tomli' ] + exclude: tests/.*\.py - repo: https://github.com/python-jsonschema/check-jsonschema rev: 0.31.1 hooks: diff --git a/CHANGES.rst b/CHANGES.rst index 7e67fff1..344aa650 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -22,7 +22,8 @@ New features and enhancements Breaking changes ^^^^^^^^^^^^^^^^ -* ``cdm_reader_mapper.cdm_mapper``: rename `map_and_covnert` to helper function `_map_and_convert` (:pull:`343`) +* ``cdm_reader_mapper.cdm_mapper``: rename `map_and_convert` to helper function `_map_and_convert` (:pull:`343`) +* replace `logging.error` with `raise` error statements (:pull:`345`) Internal changes ^^^^^^^^^^^^^^^^ @@ -34,6 +35,22 @@ Internal changes * ``cdm_reader_mapper.cdm_mapper``: introduce some helper functions (:pull:`324`) * add more unit tests (:issue:`311`, :pull:`324`) * ``cdm_reader_mapper.cdm_mapper``: split `map_and_convert` into multiple helper functions (:issue:`333`, :pull:`343`) +* exclude tests/*.py from `pre-commit` codespell hook (:pull:`345`) +* replace many `os` functions with `pathlib.Path` (:pull:`345`) +* re-work `mdf_reader` (:issue:`334`, :pull:`345`) + + * remove `reader.MDFFileReader` class + * remove `utils.configurator` module + * remove both `utils.decoder` and `mdf_reader.utils.converter` modules + * introduce `utils.parser` module: bunch of functions to parse input data into MDF data + * introduce `utils.convert_and_decode`: make converter and decoder functions more modular + * make `utils.validator` module more modular + * `utils.filereader.FileReader` uses `utils.parser` function for parsing + * move many helper functions to `utils.utilities` + * serialize `schemas.schemas` module + +* add type hints and docstrings to `mdf_reader` (:pull:`345`) +* add unit tests for `mdf_reader` module to 
testing suite (:pull:`345`) Bug fixes ^^^^^^^^^ diff --git a/cdm_reader_mapper/common/inspect.py b/cdm_reader_mapper/common/inspect.py index 6ee3fbcf..267caafa 100755 --- a/cdm_reader_mapper/common/inspect.py +++ b/cdm_reader_mapper/common/inspect.py @@ -12,7 +12,7 @@ import pandas as pd -from cdm_reader_mapper.common import pandas_TextParser_hdlr +from . import pandas_TextParser_hdlr def _count_by_cat(series) -> dict: diff --git a/cdm_reader_mapper/common/replace.py b/cdm_reader_mapper/common/replace.py index 6e07fb48..15426ec0 100755 --- a/cdm_reader_mapper/common/replace.py +++ b/cdm_reader_mapper/common/replace.py @@ -22,7 +22,7 @@ import pandas as pd -from cdm_reader_mapper.common import logging_hdlr +from . import logging_hdlr def replace_columns( diff --git a/cdm_reader_mapper/mdf_reader/codes/codes.py b/cdm_reader_mapper/mdf_reader/codes/codes.py index 6ba65924..66b8a679 100755 --- a/cdm_reader_mapper/mdf_reader/codes/codes.py +++ b/cdm_reader_mapper/mdf_reader/codes/codes.py @@ -9,8 +9,6 @@ from __future__ import annotations -import logging -import os from pathlib import Path from cdm_reader_mapper.common.json_dict import ( @@ -23,53 +21,60 @@ def read_table( - code_table_name, - imodel=None, - ext_table_path=None, + code_table_name: str, + imodel: str | None = None, + ext_table_path: str | None = None, ) -> dict: """ - Read a data model code table file to a dictionary. + Load a data model code table into a Python dictionary. - It completes the code table to the full complexity - the data reader expects, by appending information - on secondary keys and expanding range keys. + The code table may define secondary keys, range expansions, or other + structures required by the data reader. This function resolves the + file location either from an external path or an internal data model. - Parameter - --------- - code_table_name: str - The external code table file. - imodel: str, optional - Name of internally available input data model. - e.g. 
icoads_r300_d704 - ext_table_path: str, optional - The path to the external code table file. - One of ``imodel`` and ``ext_table_path`` must be set. + Parameters + ---------- + code_table_name : str + The name of the code table (without file extension). + e.g., `"ICOADS.C0.IM"` + imodel : str, optional + Internal data model name, e.g., `"icoads_r300_d704"`. Required if + `ext_table_path` is not provided. + ext_table_path : str, optional + External path containing the code table file. If set, this path + takes precedence over `imodel`. Returns ------- - dict - Code table + Dict + The fully combined code table dictionary. + + Raises + ------ + FileNotFoundError + If the specified table file cannot be found. + ValueError + If neither `imodel` nor `ext_table_path` is provided. """ - # 1. Validate input if ext_table_path: - table_path = os.path.abspath(ext_table_path) - table_files = os.path.join(table_path, code_table_name + ".json") - if not os.path.isfile(table_files): - logging.error(f"Can't find input code table file {table_files}") - return - table_files = Path(table_files) - else: - imodel = imodel.split("_") + table_path = Path(ext_table_path).resolve() + table_file = table_path / f"{code_table_name}.json" + if not table_file.is_file(): + raise FileNotFoundError(f"Can't find input code table file {table_file}") + table_files = [table_file] + elif imodel: + parts = imodel.split("_") table_files = collect_json_files( - *imodel, + *parts, base=f"{properties._base}.codes", name=code_table_name, ) - if isinstance(table_files, Path): - table_files = [table_files] - # 2. Get tables + if isinstance(table_files, Path): + table_files = [table_files] + else: + raise ValueError("One of 'imodel' or 'ext_table_path' must be set") + tables = [open_json_file(ifile) for ifile in table_files] - # 3. 
Combine tables return combine_dicts(tables) diff --git a/cdm_reader_mapper/mdf_reader/reader.py b/cdm_reader_mapper/mdf_reader/reader.py index 1e5e9a5f..c921c031 100755 --- a/cdm_reader_mapper/mdf_reader/reader.py +++ b/cdm_reader_mapper/mdf_reader/reader.py @@ -2,342 +2,87 @@ from __future__ import annotations -import ast -import csv -import logging -import os from io import StringIO as StringIO +from pathlib import Path -import pandas as pd +from cdm_reader_mapper import DataBundle -from cdm_reader_mapper.common.json_dict import open_json_file -from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy -from cdm_reader_mapper.core.databundle import DataBundle +from ..common.json_dict import open_json_file -from . import properties from .utils.filereader import FileReader -from .utils.utilities import adjust_dtype, remove_boolean_values, validate_arg -from .utils.validators import validate - +from .utils.utilities import validate_arg + +from .utils.utilities import as_list, as_path, read_csv + + +def validate_read_mdf_args( + *, + source: str | Path, + imodel: str | None = None, + ext_schema_path: str | Path | None = None, + ext_schema_file: str | Path | None = None, + year_init: int | None = None, + year_end: int | None = None, + chunksize: int | None = None, + skiprows: int | None = None, +): + """ + Validate arguments for reading an MDF file. -class MDFFileReader(FileReader): - """Class to represent reader output. + This function performs validation on file paths and numeric arguments + required for reading an MDF dataset. 
- Attributes - ---------- - data : pd.DataFrame or pd.io.parsers.TextFileReader - a pandas.DataFrame or pandas.io.parsers.TextFileReader - with the output data - mask : pd.DataFrame or pd.io.parsers.TextFileReader - a pandas.DataFrame or pandas.io.parsers.TextFileReader - with the output data validation mask - attrs : dict - a dictionary with the output data elements attributes + Raises + ------ + FileNotFoundError + If the source file does not exist. + ValueError + If required arguments are missing or numeric constraints are violated. """ + source = as_path(source, "source") - def __init__(self, *args, **kwargs): - FileReader.__init__(self, *args, **kwargs) - - def _convert_and_decode( - self, - df, - converter_dict, - converter_kwargs, - decoder_dict, - ) -> pd.DataFrame: - for section in converter_dict.keys(): - if section not in df.columns: - continue - if section in decoder_dict.keys(): - decoded = decoder_dict[section](df[section]) - decoded.index = df[section].index - df[section] = decoded - - converted = converter_dict[section]( - df[section], **converter_kwargs[section] - ) - converted.index = df[section].index - df[section] = converted - return df - - def _validate(self, df) -> pd.DataFrame: - return validate( - data=df, - imodel=self.imodel, - ext_table_path=self.ext_table_path, - schema=self.schema, - disables=self.disable_reads, - ) + if not source.exists(): + raise FileNotFoundError(f"Source file not found: {source}") - def convert_and_decode_entries( - self, - data, - convert=True, - decode=True, - converter_dict=None, - converter_kwargs=None, - decoder_dict=None, - ) -> pd.DataFrame | pd.io.parsers.TextFileReader: - """Convert and decode data entries by using a pre-defined data model. - - Overwrite attribute `data` with converted and/or decoded data. - - Parameters - ---------- - data: pd.DataFrame or pd.io.parsers.TextFileReader - Data to convert and decode. - convert: bool, default: True - If True convert entries by using a pre-defined data model. 
- decode: bool, default: True - If True decode entries by using a pre-defined data model. - converter_dict: dict of {Hashable: func}, optional - Functions for converting values in specific columns. - If None use information from a pre-defined data model. - converter_kwargs: dict of {Hashable: kwargs}, optional - Key-word arguments for converting values in specific columns. - If None use information from a pre-defined data model. - decoder_dict: dict, optional - Functions for decoding values in specific columns. - If None use information from a pre-defined data model. - """ - if converter_dict is None: - converter_dict = self.configurations["convert_decode"]["converter_dict"] - if converter_kwargs is None: - converter_kwargs = self.configurations["convert_decode"]["converter_kwargs"] - if decoder_dict is None: - decoder_dict = self.configurations["convert_decode"]["decoder_dict"] - if not (convert and decode): - self.dtypes = "object" - return data - if convert is not True: - converter_dict = {} - converter_kwargs = {} - if decode is not True: - decoder_dict = {} - - if isinstance(data, pd.DataFrame): - data = self._convert_and_decode( - data, - converter_dict, - converter_kwargs, - decoder_dict, - ) - else: - data_buffer = StringIO() - TextParser = make_copy(data) - for i, df_ in enumerate(TextParser): - df = self._convert_and_decode( - df_, - converter_dict, - converter_kwargs, - decoder_dict, - ) - df.to_csv( - data_buffer, - header=False, - mode="a", - encoding=self.encoding, - index=False, - quoting=csv.QUOTE_NONE, - sep=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - - data_buffer.seek(0) - data = pd.read_csv( - data_buffer, - names=df.columns, - chunksize=self.chunksize, - dtype=object, - delimiter=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - return data - - def validate_entries( - self, data, validate - ) -> pd.DataFrame | pd.io.parsers.TextFileReader: - """Validate data entries by using a pre-defined 
data model. - - Fill attribute `valid` with boolean mask. - """ - if validate is not True: - mask = pd.DataFrame(dtype="boolean") - elif isinstance(data, pd.DataFrame): - mask = self._validate(data) - else: - data_buffer = StringIO() - TextParser_ = make_copy(data) - for i, df_ in enumerate(TextParser_): - mask_ = self._validate(df_) - mask_.to_csv( - data_buffer, - header=False, - mode="a", - encoding=self.encoding, - index=False, - ) - data_buffer.seek(0) - mask = pd.read_csv( - data_buffer, - names=df_.columns, - chunksize=self.chunksize, - dtype="boolean", - ) - return mask - - def remove_boolean_values( - self, data - ) -> pd.DataFrame | pd.io.parsers.TextFileReader: - """DOCUMENTATION""" - if isinstance(data, pd.DataFrame): - data = data.map(remove_boolean_values) - dtype = adjust_dtype(self.dtypes, data) - return data.astype(dtype) - else: - data_buffer = StringIO() - TextParser = make_copy(data) - for i, df_ in enumerate(TextParser): - df = df_.map(remove_boolean_values) - dtype = adjust_dtype(self.dtypes, df) - date_columns = [] - df.to_csv( - data_buffer, - header=False, - mode="a", - encoding=self.encoding, - index=False, - quoting=csv.QUOTE_NONE, - sep=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - date_columns = [] - for i, element in enumerate(list(dtype)): - if dtype.get(element) == "datetime": - date_columns.append(i) - dtype = adjust_dtype(dtype, df) - data_buffer.seek(0) - data = pd.read_csv( - data_buffer, - names=df.columns, - chunksize=self.chunksize, - dtype=dtype, - parse_dates=date_columns, - delimiter=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - return data - - def read( - self, - chunksize=None, - sections=None, - skiprows=0, - convert=True, - decode=True, - converter_dict=None, - converter_kwargs=None, - validate=True, - encoding: str | None = None, - **kwargs, - ) -> DataBundle: - """Read data from disk. 
- - Parameters - ---------- - chunksize : int, optional - Number of reports per chunk. - sections : list, optional - List with subset of data model sections to output, optional - If None read pre-defined data model sections. - skiprows : int - Number of initial rows to skip from file, default: 0 - convert: bool, default: True - If True convert entries by using a pre-defined data model. - decode: bool, default: True - If True decode entries by using a pre-defined data model. - converter_dict: dict of {Hashable: func}, optional - Functions for converting values in specific columns. - If None use information from a pre-defined data model. - converter_kwargs: dict of {Hashable: kwargs}, optional - Key-word arguments for converting values in specific columns. - If None use information from a pre-defined data model. - validate: bool, default: True - Validate data entries by using a pre-defined data model. - encoding: str, optional - Encoding of the input file, overrides the value in the imodel schema - """ - # 0. VALIDATE INPUT - if not validate_arg("sections", sections, list): - return - if not validate_arg("chunksize", chunksize, int): - return - if not validate_arg("skiprows", skiprows, int): - return - - self.chunksize = chunksize - self.skiprows = skiprows - - # 2. READ AND VALIDATE DATA - logging.info(f"EXTRACTING DATA FROM MODEL: {self.imodel}") - # 2.1. 
Subset data model sections to requested sections - parsing_order = self.schema["header"].get("parsing_order") - sections_ = [x.get(y) for x in parsing_order for y in x] - read_sections_list = [y for x in sections_ for y in x] - if sections is None: - sections = read_sections_list - - # 2.2 Homogenize input data to an iterable with dataframes: - # a list with a single dataframe or a pd.io.parsers.TextFileReader - logging.info("Getting data string from source...") - self.configurations = self.get_configurations(read_sections_list, sections) - self.encoding = encoding or self.encoding - data = self.open_data( - read_sections_list, - sections, - # INFO: Set default as "pandas" to account for custom schema - open_with=properties.open_file.get(self.imodel, "pandas"), - encoding=self.encoding, - chunksize=chunksize, + if not imodel and not (ext_schema_path or ext_schema_file): + raise ValueError( + "One of imodel or ext_schema_path/ext_schema_file must be provided" ) - # 2.3. Extract, read and validate data in same loop - logging.info("Extracting and reading sections") - data = self.convert_and_decode_entries( - data, - convert=convert, - decode=decode, - ) - mask = self.validate_entries(data, validate) - - # 3. 
Create output DataBundle object - logging.info("Create an output DataBundle object") - data = self.remove_boolean_values(data) - return DataBundle( - data=data, - columns=self.columns, - dtypes=self.dtypes, - parse_dates=self.parse_dates, - encoding=self.encoding, - mask=mask, - imodel=self.imodel, - ) + validate_arg("chunksize", chunksize, int) + if chunksize is not None and chunksize <= 0: + raise ValueError("chunksize must be a positive integer") + + validate_arg("skiprows", skiprows, int) + if skiprows is not None and skiprows < 0: + raise ValueError("skiprows must be >= 0") + + if year_init is not None and year_end is not None: + if year_init > year_end: + raise ValueError("year_init must be <= year_end") def read_mdf( source, - imodel=None, - ext_schema_path=None, - ext_schema_file=None, - ext_table_path=None, - year_init=None, - year_end=None, + imodel: str | None = None, + ext_schema_path: str | None = None, + ext_schema_file: str | None = None, + ext_table_path: str | None = None, + year_init: int | None = None, + year_end: int | None = None, encoding: str | None = None, - **kwargs, + chunksize: int | None = None, + skiprows: int = None, + convert_flag: bool = True, + converter_dict: dict | None = None, + converter_kwargs: dict | None = None, + decode_flag: bool = True, + decoder_dict: dict | None = None, + validate_flag: bool = True, + sections: str | list | None = None, + excludes: str | list | None = None, + pd_kwargs: dict | None = None, + xr_kwargs: dict | None = None, ) -> DataBundle: """Read data files compliant with a user specific data model. @@ -362,14 +107,38 @@ def read_mdf( ext_schema_file: str, optional The external input data model schema file. One of ``imodel`` and ``ext_schema_path`` or ``ext_schema_file`` must be set. - ext_table_path: str, optional - The path to the external input data model code tables. year_init: str or int, optional Left border of time axis. year_end: str or int, optional Right border of time axis. 
encoding : str, optional The encoding of the input file. Overrides the value in the imodel schema file. + chunksize : int, optional + Number of reports per chunk. + skiprows : int, optional + Number of initial rows to skip from file, default: 0 + convert_flag: bool, default: True + If True convert entries by using a pre-defined data model. + converter_dict: dict of {Hashable: func}, optional + Functions for converting values in specific columns. + If None use information from a pre-defined data model. + converter_kwargs: dict of {Hashable: kwargs}, optional + Key-word arguments for converting values in specific columns. + If None use information from a pre-defined data model. + decode_flag: bool, default: True + If True decode entries by using a pre-defined data model. + decoder_dict: dict of {Hashable: func}, optional + Functions for decoding values in specific columns. + If None use information from a pre-defined data model. + validate_flag: bool, default: True + Validate data entries by using a pre-defined data model. + sections : list, optional + List with subset of data model sections to output, optional + If None read pre-defined data model sections. + pd_kwargs: dict, optional + Additional pandas arguments + xr_kwargs: dict, optional + Additional xarray arguments Returns ------- @@ -384,28 +153,70 @@ def read_mdf( write_data : Write MDF data and validation mask to disk. write_tables : Write CDM tables to disk. 
""" - - def get_list_element(lst, idx): - try: - return lst[idx] - except IndexError: - return None - - logging.basicConfig( - format="%(levelname)s\t[%(asctime)s](%(filename)s)\t%(message)s", - level=logging.INFO, - datefmt="%Y%m%d %H:%M:%S", - filename=None, - ) - return MDFFileReader( + if skiprows is None: + skiprows = 0 + validate_read_mdf_args( source=source, imodel=imodel, ext_schema_path=ext_schema_path, ext_schema_file=ext_schema_file, + year_init=year_init, + year_end=year_end, + chunksize=chunksize, + skiprows=skiprows, + ) + + pd_kwargs = pd_kwargs or {} + pd_kwargs.setdefault("encoding", encoding) + pd_kwargs.setdefault("chunksize", chunksize) + pd_kwargs.setdefault("skiprows", skiprows) + + xr_kwargs = xr_kwargs or {} + + convert_kwargs = dict( + convert_flag=convert_flag, + converter_dict=converter_dict, + converter_kwargs=converter_kwargs, + ) + + decode_kwargs = dict( + decode_flag=decode_flag, + decoder_dict=decoder_dict, + ) + + validate_kwargs = dict( + validate_flag=validate_flag, ext_table_path=ext_table_path, + ) + + sections = as_list(sections) + excludes = as_list(excludes) + + validate_arg("sections", sections, list) + validate_arg("excludes", excludes, list) + + select_kwargs = dict( + sections=sections, + excludes=excludes, year_init=year_init, year_end=year_end, - ).read(encoding=encoding, **kwargs) + ) + + filereader = FileReader( + imodel=imodel, + ext_schema_path=ext_schema_path, + ext_schema_file=ext_schema_file, + ) + + return filereader.read( + source=source, + pd_kwargs=pd_kwargs, + xr_kwargs=xr_kwargs, + convert_kwargs=convert_kwargs, + decode_kwargs=decode_kwargs, + validate_kwargs=validate_kwargs, + select_kwargs=select_kwargs, + ) def read_data( @@ -456,52 +267,25 @@ def read_data( write_data : Write MDF data and validation mask to disk. write_tables : Write CDM tables to disk. 
""" - - def _update_column_labels(columns): - new_cols = [] - for col in columns: - try: - col_ = ast.literal_eval(col) - except SyntaxError: - col_ = tuple(col.split(":")) - except ValueError: - col_ = col - new_cols.append(col_) - - if all(isinstance(c, tuple) for c in new_cols): - return pd.MultiIndex.from_tuples(new_cols) - - return pd.Index(new_cols) - - def _read_csv(ifile, col_subset=None, **kwargs): - if ifile is None or not os.path.isfile(ifile): - return pd.DataFrame() - - df = pd.read_csv(ifile, delimiter=",", **kwargs) - df.columns = _update_column_labels(df.columns) - if col_subset is not None: - df = df[col_subset] - - return df - - if info is None: - info_dict = {} - else: - info_dict = open_json_file(info) - + info_dict = open_json_file(info) if info else {} dtype = info_dict.get("dtypes", "object") parse_dates = info_dict.get("parse_dates", False) - if encoding is None: - encoding = info_dict.get("encoding", None) + encoding = encoding or info_dict.get("encoding", None) - data = _read_csv( + pd_kwargs = kwargs.copy() + pd_kwargs.setdefault("dtype", dtype) + pd_kwargs.setdefault("parse_dates", parse_dates) + pd_kwargs.setdefault("encoding", encoding) + + data = read_csv( source, col_subset=col_subset, - dtype=dtype, - parse_dates=parse_dates, - encoding=encoding, + **pd_kwargs, ) - mask = _read_csv(mask, col_subset=col_subset, dtype="boolean") + mask = read_csv(mask, col_subset=col_subset, dtype="boolean") + if not mask.empty: + mask = mask.reindex(columns=data.columns) + return DataBundle( data=data, columns=data.columns, diff --git a/cdm_reader_mapper/mdf_reader/schemas/schemas.py b/cdm_reader_mapper/mdf_reader/schemas/schemas.py index cd34f7e2..96ff7718 100755 --- a/cdm_reader_mapper/mdf_reader/schemas/schemas.py +++ b/cdm_reader_mapper/mdf_reader/schemas/schemas.py @@ -9,115 +9,151 @@ from __future__ import annotations -import logging -import os from pathlib import Path +from typing import TypedDict from cdm_reader_mapper.common.json_dict import 
collect_json_files, combine_dicts from .. import properties -def convert_dtype_to_default(dtype, section, element) -> str: - """Convert data type to defaults (int, float).""" - if dtype is None: - return - elif dtype == "float": - return dtype - elif dtype == "int": - return properties.pandas_int - elif "float" in dtype.lower(): - logging.warning( - f"Set column type of ({section}, {element}) from deprecated {dtype} to float." - ) - return "float" - elif "int" in dtype.lower(): - logging.warning( - f"Set column type of ({section}, {element}) from deprecated {dtype} to int." - ) - return properties.pandas_int - return dtype - - -def _read_schema(schema) -> dict: - """DOCUMENTATION.""" - if not schema["header"]: - if not schema["sections"]: - logging.error( - f"'sections' block needs to be defined in a schema with no header. Error in data model schema file {schema['name']}" - ) - return - schema["header"] = dict() - - if schema["header"].get("multiple_reports_per_line"): - logging.error("Multiple reports per line data model: not yet supported") - return - - # 3.2. Make no section formats be internally treated as 1 section format - if not schema.get("sections"): - if not schema.get("elements"): - logging.error( - f"Data elements not defined in data model schema file {schema['name']} under key 'elements' " - ) - return - schema["sections"] = { - properties.dummy_level: { - "header": {}, - "elements": schema.get("elements"), - } +class SectionDict(TypedDict, total=False): + """ + Schema definition for a single section within a report. + + Attributes + ---------- + header : dict, optional + Metadata or configuration for the section header. + elements : dict, optional + Dictionary of elements/fields contained within the section. + """ + + header: dict + elements: dict + + +class SchemaHeaderDict(TypedDict, total=False): + """ + Schema definition for the report header. 
+ + Attributes + ---------- + parsing_order : list[dict], optional + List of dictionaries defining the order in which header fields are parsed. + delimiter : str, optional + Delimiter used to separate fields in the header. + field_layout : str, optional + Layout or format of the fields (e.g., fixed width, CSV). + format : str, optional + General format type of the header. + encoding : str, optional + Text encoding for the header, e.g., 'utf-8'. + multiple_reports_per_line : bool, optional + Whether multiple reports may appear on a single line. + """ + + parsing_order: list[dict] + delimiter: str + field_layout: str + format: str + encoding: str + multiple_reports_per_line: bool + + +class SchemaDict(TypedDict, total=False): + """ + Complete schema definition for a report. + + Attributes + ---------- + header : SchemaHeaderDict, optional + Configuration for the report header. + sections : dict[str, SectionDict], optional + Mapping of section names to section schemas. + elements : dict, optional + Mapping of element names to their attributes. + name : list[Path], optional + List of Path objects representing schema files or sources. + imodel : str | None, optional + Name of the internal data model, if applicable. 
+ """ + + header: SchemaHeaderDict + sections: dict[str, SectionDict] + elements: dict + name: list[Path] + imodel: str | None + + +def _resolve_schema_files( + *, + imodel: str | None = None, + ext_schema_path: str | None = None, + ext_schema_file: str | None = None, +) -> list[Path]: + """Determine which schema file(s) to use based on the input parameters.""" + if ext_schema_file: + path = Path(ext_schema_file) + if not path.is_file(): + raise FileNotFoundError(f"Can't find input schema file {ext_schema_file}") + return [path] + + if ext_schema_path: + schema_path = Path(ext_schema_path).resolve() + path = schema_path / f"{schema_path.name}.json" + if not path.is_file(): + raise FileNotFoundError(f"Can't find input schema path {ext_schema_path}") + return [path] + + if imodel: + parts = imodel.split("_") + model = parts[0] + if model not in properties.supported_data_models: + raise ValueError(f"Input data model {model} not supported") + + return collect_json_files(*parts, base=f"{properties._base}.schemas") + + raise ValueError( + "One of 'imodel', 'ext_schema_path', or 'ext_schema_file' must be set" + ) + + +def _normalize_schema(schema: SchemaDict) -> SchemaDict: + """Normalize a schema dictionary by ensuring it has sections and a parsing order.""" + header = schema.get("header", {}) + sections = schema.get("sections") + elements = schema.get("elements") + + if not sections: + if not elements: + raise KeyError("Schema has no sections and no elements") + level = properties.dummy_level + dummy_header = { + k: header[k] for k in ("delimiter", "field_layout", "format") if k in header } - schema["header"]["parsing_order"] = [{"s": [properties.dummy_level]}] - schema.pop("elements", None) - schema["sections"][properties.dummy_level]["header"]["delimiter"] = schema[ - "header" - ].get("delimiter") - schema["header"].pop("delimiter", None) - schema["sections"][properties.dummy_level]["header"]["field_layout"] = schema[ - "header" - ].get("field_layout") - 
schema["header"].pop("field_layout", None) - schema["sections"][properties.dummy_level]["header"]["format"] = schema[ - "header" - ].get("format") - schema["header"].pop("format", None) - - # 3.3. Make parsing order explicit - if not schema["header"].get("parsing_order"): # assume sequential - schema["header"]["parsing_order"] = [{"s": list(schema["sections"].keys())}] - - # 3.4. Make disable_read and field_layout explicit: this is ruled by delimiter being set, - # unless explicitly set - for section in schema["sections"].keys(): - if schema["sections"][section]["header"].get("disable_read"): - continue - else: - schema["sections"][section]["header"]["disable_read"] = False - if not schema["sections"][section]["header"].get("field_layout"): - delimiter = schema["sections"][section]["header"].get("delimiter") - schema["sections"][section]["header"]["field_layout"] = ( - "delimited" if delimiter else "fixed_width" - ) - for element in schema["sections"][section]["elements"].keys(): - column_type = schema["sections"][section]["elements"][element].get( - "column_type" - ) - schema["sections"][section]["elements"][element]["column_type"] = ( - convert_dtype_to_default( - column_type, - section, - element, - ) - ) - return schema - - -def read_schema(imodel=None, ext_schema_path=None, ext_schema_file=None) -> dict: + sections = {level: {"header": dummy_header, "elements": elements}} + schema = {k: v for k, v in schema.items() if k != "elements"} + + header = { + **header, + "parsing_order": header.get("parsing_order") or [{"s": list(sections.keys())}], + } + + return {**schema, "header": header, "sections": sections} + + +def read_schema( + imodel: str | None = None, + ext_schema_path: str | None = None, + ext_schema_file: str | None = None, +) -> SchemaDict: """ - Read a data model schema file. + Load and normalize a data model schema. 
- Read a data model schema file to a dictionary and - completes it by adding explicitly information the - reader tool needs + Reads a data model schema file into a dictionary and + normalizes it by adding the information required by + the parser. Parameters ---------- @@ -134,99 +170,20 @@ def read_schema(imodel=None, ext_schema_path=None, ext_schema_file=None) -> dict Returns ------- - dict + SchemaDict Data model schema """ - # 1. Validate input - if ext_schema_file: - if not os.path.isfile(ext_schema_file): - logging.error(f"Can't find input schema file {ext_schema_file}") - return - schema_files = Path(ext_schema_file) - elif ext_schema_path: - schema_path = os.path.abspath(ext_schema_path) - schema_name = os.path.basename(schema_path) - schema_files = os.path.join(schema_path, schema_name + ".json") - if not os.path.isfile(schema_files): - logging.error(f"Can't find input schema file {schema_files}") - return - schema_files = Path(schema_files) - else: - imodel = imodel.split("_") - if imodel[0] not in properties.supported_data_models: - logging.error("Input data model " f"{imodel[0]}" " not supported") - return - schema_files = collect_json_files(*imodel, base=f"{properties._base}.schemas") - - if isinstance(schema_files, Path): - schema_files = [schema_files] - - # 2. Get schema - schema = combine_dicts(schema_files, base=f"{properties._base}.schemas") - schema["name"] = schema_files - - # 3. Expand schema - # Fill in the initial schema to "full complexity": to homogenize schema, - # explicitly add info that is implicit to given situations/data models - - # One report per record: make sure later changes are reflected in MULTIPLE - # REPORTS PER RECORD case below if we ever use it! - # Currently only supported case: one report per record (line) - # 3.1. First check for no header case: sequential sections - return _read_schema(schema) - - -def df_schema(df_columns, schema) -> dict: - """ - Create simple data model schema dictionary. 
- - Create a simple attribute dictionary for the elements - in a dataframe from its data model schema - - Parameters - ---------- - df_columns : list - The columns in the data frame (data elements from - the data model) - schema : dict - The data model schema + schema_files = _resolve_schema_files( + imodel=imodel, + ext_schema_path=ext_schema_path, + ext_schema_file=ext_schema_file, + ) + raw_schema = combine_dicts(schema_files, base=f"{properties._base}.schemas") - Returns - ------- - dict - Data elements attributes - - """ + enriched = { + **raw_schema, + "name": schema_files, + } - def clean_schema(columns, schema): - # Could optionally add cleaning of element descriptors that only apply - # to the initial reading of the data model: field_length, etc.... - for element in list(schema): - if element not in columns: - schema.pop(element) - - def get_index(idx, lst, section): - if len(lst) == 1: - return idx - return (section, idx) - - flat_schema = dict() - for section in schema.get("sections"): - if schema["sections"].get(section).get("header").get("disable_read"): - flat_schema.update({section: {"column_type": "object"}}) - else: - flat_schema.update( - { - get_index(x, list(schema.get("sections")), section): schema[ - "sections" - ] - .get(section) - .get("elements") - .get(x) - for x in schema["sections"].get(section).get("elements") - } - ) - - clean_schema(df_columns, flat_schema) - return flat_schema + return _normalize_schema(enriched) diff --git a/cdm_reader_mapper/mdf_reader/utils/__init__.py b/cdm_reader_mapper/mdf_reader/utils/__init__.py index 015b78b8..338bd945 100755 --- a/cdm_reader_mapper/mdf_reader/utils/__init__.py +++ b/cdm_reader_mapper/mdf_reader/utils/__init__.py @@ -1,6 +1,3 @@ """Common Data Model (CDM) reader utilities.""" from __future__ import annotations - -from .converters import converters # noqa -from .decoders import decoders # noqa diff --git a/cdm_reader_mapper/mdf_reader/utils/configurator.py 
b/cdm_reader_mapper/mdf_reader/utils/configurator.py deleted file mode 100755 index 43b1358f..00000000 --- a/cdm_reader_mapper/mdf_reader/utils/configurator.py +++ /dev/null @@ -1,267 +0,0 @@ -"""Auxiliary functions and class for reading, converting, decoding and validating MDF files.""" - -from __future__ import annotations - -import ast -import csv -import logging - -import numpy as np -import pandas as pd - -from itertools import zip_longest - -from .. import properties -from . import converters, decoders -from .utilities import convert_dtypes - - -class Configurator: - """Class for configuring MDF reader information.""" - - def __init__( - self, - df=pd.DataFrame(), - schema=None, - order=None, - valid=None, - ): - self.df = df - self.orders = order or [] - self.valid = valid or [] - self.schema = schema or {} - - def _validate_sentinel(self, i, line, sentinel) -> bool: - slen = len(sentinel) - str_start = line[i : i + slen] - return str_start == sentinel - - def _get_index(self, section, order) -> dict | tuple[str, dict]: - if len(self.orders) == 1: - return section - else: - return (order, section) - - def _get_ignore(self, section_dict) -> bool: - ignore = section_dict.get("ignore") - if isinstance(ignore, str): - ignore = ast.literal_eval(ignore) - return ignore - - def _get_dtype(self) -> str: - return properties.pandas_dtypes.get(self.sections_dict.get("column_type")) - - def _get_converter(self) -> callable: - return converters.get(self.sections_dict.get("column_type")) - - def _get_conv_kwargs(self) -> dict: - column_type = self.sections_dict.get("column_type") - if column_type is None: - return - return { - converter_arg: self.sections_dict.get(converter_arg) - for converter_arg in properties.data_type_conversion_args.get(column_type) - } - - def _get_decoder(self) -> callable | None: - encoding = self.sections_dict.get("encoding") - if encoding is None: - return - column_type = self.sections_dict.get("column_type") - if column_type is None: - return - 
return decoders.get(encoding).get(column_type) - - def _update_dtypes(self, dtypes, index) -> dict: - dtype = self._get_dtype() - if dtype: - dtypes[index] = dtype - return dtypes - - def _update_converters(self, converters, index) -> dict: - converter = self._get_converter() - if converter: - converters[index] = converter - return converters - - def _update_kwargs(self, kwargs, index) -> dict: - conv_kwargs = self._get_conv_kwargs() - if conv_kwargs: - kwargs[index] = conv_kwargs - return kwargs - - def _update_decoders(self, decoders, index) -> dict: - decoder = self._get_decoder() - if decoder: - decoders[index] = decoder - return decoders - - def get_configuration(self) -> dict: - """Get ICOADS data model specific information.""" - disable_reads = [] - dtypes = {} - converters = {} - kwargs = {} - decoders = {} - for order in self.orders: - self.order = order - header = self.schema["sections"][order]["header"] - disable_read = header.get("disable_read") - if disable_read is True: - disable_reads.append(order) - continue - sections = self.schema["sections"][order]["elements"] - for section in sections.keys(): - self.sections_dict = sections[section] - index = self._get_index(section, order) - ignore = (order not in self.valid) or self._get_ignore( - self.sections_dict - ) - if ignore is True: - continue - dtypes = self._update_dtypes(dtypes, index) - converters = self._update_converters(converters, index) - kwargs = self._update_kwargs(kwargs, index) - decoders = self._update_decoders(decoders, index) - - dtypes, parse_dates = convert_dtypes(dtypes) - return { - "convert_decode": { - "converter_dict": converters, - "converter_kwargs": kwargs, - "decoder_dict": decoders, - }, - "self": { - "dtypes": dtypes, - "disable_reads": disable_reads, - "parse_dates": parse_dates, - "encoding": self.schema["header"].get("encoding", "utf-8"), - }, - } - - def open_pandas(self) -> pd.DataFrame: - """Open TextParser to pd.DataSeries.""" - return self.df.apply(lambda x: 
self._read_line(x[0]), axis=1) - - def _process_section( - self, line: str, i: int, order: str, header: dict, data_dict: dict - ) -> int: - sections = self.schema["sections"][order]["elements"] - section_length = header.get("length", properties.MAX_FULL_REPORT_WIDTH) - delimiter = header.get("delimiter") - field_layout = header.get("field_layout") - sentinel = header.get("sentinel") - bad_sentinel = sentinel is not None and not self._validate_sentinel( - i, line, sentinel - ) - k = i + section_length - - if delimiter and header.get("format") == "delimited": - fields = list(csv.reader([line[i:]], delimiter=delimiter))[0] - for field_name, field in zip_longest( - sections.keys(), fields, fillvalue=None - ): - index = self._get_index(field_name, order) - data_dict[index] = field.strip() if field is not None else None - if field is not None: - i += len(field) - return i - - if delimiter and field_layout != "fixed_width": - logging.error( - f"Delimiter for {order} is set to {delimiter}. " - f"Please specify either format or field_layout in your header schema {header}." 
- ) - return i - - for section, section_dict in sections.items(): - missing = True - index = self._get_index(section, order) - ignore = (order not in self.valid) or self._get_ignore(section_dict) - na_value = section_dict.get("missing_value") - field_length = section_dict.get( - "field_length", properties.MAX_FULL_REPORT_WIDTH - ) - - j = i if bad_sentinel else i + field_length - if j > k: - missing = False - j = k - - if not ignore: - value = line[i:j] - if not value.strip() or value == na_value: - value = True - if i == j and missing: - value = False - data_dict[index] = value - - if delimiter and line[j : j + len(delimiter)] == delimiter: - j += len(delimiter) - - i = j - - return i - - def _read_line(self, line: str) -> pd.Series: - i = 0 - data_dict = {} - - for order in self.orders: - header = self.schema["sections"][order]["header"] - - if header.get("disable_read") is True: - data_dict[order] = line[i : properties.MAX_FULL_REPORT_WIDTH] - continue - - i = self._process_section(line, i, order, header, data_dict) - - return pd.Series(data_dict) - - def open_netcdf(self) -> pd.DataFrame: - """Open netCDF to pd.Series.""" - - def replace_empty_strings(series): - if series.dtype == "object": - series = series.str.decode("utf-8") - series = series.str.strip() - series = series.map(lambda x: True if x == "" else x) - return series - - missing_values = [] - attrs = {} - renames = {} - disables = [] - for order in self.orders: - self.order = order - header = self.schema["sections"][order]["header"] - disable_read = header.get("disable_read") - if disable_read is True: - disables.append(order) - continue - sections = self.schema["sections"][order]["elements"] - for section in sections.keys(): - self.sections_dict = sections[section] - index = self._get_index(section, order) - ignore = (order not in self.valid) or self._get_ignore( - self.sections_dict - ) - if ignore is True: - continue - if section in self.df.data_vars: - renames[section] = index - elif section in 
self.df.dims: - renames[section] = index - elif section in self.df.attrs: - attrs[index] = self.df.attrs[section] - else: - missing_values.append(index) - - df = self.df[renames.keys()].to_dataframe().reset_index() - attrs = {k: v.replace("\n", "; ") for k, v in attrs.items()} - df = df.rename(columns=renames) - df = df.assign(**attrs) - df[disables] = np.nan - df = df.apply(lambda x: replace_empty_strings(x)) - df[missing_values] = False - return df diff --git a/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py b/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py new file mode 100755 index 00000000..121eced5 --- /dev/null +++ b/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py @@ -0,0 +1,370 @@ +"""pandas converting operators.""" + +from __future__ import annotations + +from decimal import Decimal, InvalidOperation +from typing import Callable, Any + +import pandas as pd + +from .. import properties +from .utilities import convert_str_boolean + + +def max_decimal_places(*decimals: Decimal) -> int: + """ + Return the maximum number of decimal places among Decimal values. + + Parameters + ---------- + decimals : Decimal + One or more Decimal values. + + Returns + ------- + int + Maximum number of decimal places. + """ + return max( + (-d.as_tuple().exponent if d.as_tuple().exponent < 0 else 0) for d in decimals + ) + + +def to_numeric(x: Any, scale: Decimal, offset: Decimal) -> Decimal | bool: + """ + Convert a value to a scaled Decimal with offset applied. + + Rules + ----- + - Boolean values are returned unchanged + - Empty or invalid values return False + - Strings are stripped and spaces replaced with zeros + - Result is quantized to the maximum decimal precision + of input, scale, or offset + + Parameters + ---------- + x : Any + Input value to convert. + scale : Decimal + Scale factor. + offset : Decimal + Offset value. + + Returns + ------- + Decimal | bool + Converted Decimal value, boolean, or False if invalid. 
+ """ + x = convert_str_boolean(x) + + if isinstance(x, bool): + return x + + if isinstance(x, str): + x = x.strip() + x = x.replace(" ", "0") + + try: + x_dec = Decimal(str(x)) + decimal_places = max_decimal_places(offset, scale, x_dec) + result = offset + x_dec * scale + + if decimal_places == 0: + return result + + return result.quantize(Decimal("1." + "0" * decimal_places)) + + except (InvalidOperation, TypeError, ValueError): + return False + + +class Decoders: + """ + Registry-based decoder dispatcher for column-wise decoding. + + Currently supports Base36 decoding for numeric-like fields. + """ + + def __init__(self, dtype: str, encoding: str = "base36") -> None: + """ + Initialization. + + Parameters + ---------- + dtype : str + Target data type name (e.g. numeric field type) + encoding : str, default "base36" + Encoding scheme to use + """ + self.dtype = dtype + self.encoding = encoding + + self._registry = {"key": self.base36} + + for numeric_type in properties.numeric_types: + self._registry[numeric_type] = self.base36 + + def decoder(self) -> Callable[[pd.Series], pd.Series] | None: + """ + Return the decoder function for the configured dtype and encoding. + + Returns + ------- + callable or None + Decoder function accepting a pandas Series, or None if encoding + is unsupported. + + Raises + ------ + KeyError + If no decoder is registered for the given dtype. + """ + if self.encoding != "base36": + return None + + try: + return self._registry[self.dtype] + except KeyError as exc: + raise KeyError(f"No converter registered for '{self.dtype}'") from exc + + def base36(self, data: pd.Series) -> pd.Series: + """ + Decode a pandas Series from Base36 to stringified base-10 integers. + + Boolean values are preserved. + Invalid values raise ValueError via `int(..., 36)`. 
+ + Parameters + ---------- + data : pd.Series + Input Series containing base36-encoded values + + Returns + ------- + pd.Series + Decoded Series with stringified integers or booleans + """ + + def _base36(x): + x = convert_str_boolean(x) + if isinstance(x, bool): + return x + return str(int(str(x), 36)) + + return data.apply(_base36) + + +class Converters: + """ + Registry-based converter for pandas Series. + + Converts object-typed Series into numeric, datetime, or cleaned object + representations based on the configured dtype. + """ + + def __init__(self, dtype: str) -> None: + """ + Initialization. + + Parameters + ---------- + dtype : str + Target output dtype identifier + """ + self.dtype = dtype + self.numeric_scale = 1.0 if self.dtype == "float" else 1 + self.numeric_offset = 0.0 if self.dtype == "float" else 0 + + self.preprocessing_functions = { + "PPPP": lambda x: ( + str(10000 + int(x)) if isinstance(x, str) and x.startswith("0") else x + ) + } + + self._registry = { + "datetime": self.object_to_datetime, + "str": self.object_to_object, + "object": self.object_to_object, + "key": self.object_to_object, + } + + for numeric_type in properties.numeric_types: + self._registry[numeric_type] = self.object_to_numeric + + def converter(self) -> Callable[..., pd.Series]: + """ + Return the converter function registered for the configured dtype. + + Returns + ------- + callable + Converter function + + Raises + ------ + KeyError + If no converter is registered for the dtype + """ + try: + return self._registry[self.dtype] + except KeyError as exc: + raise KeyError(f"No converter registered for '{self.dtype}'") from exc + + def object_to_numeric( + self, + data: pd.Series, + scale: float | int | None = None, + offset: float | int | None = None, + ) -> pd.Series: + """ + Convert object Series to numeric using Decimal arithmetic. 
+ + - Right spaces are treated as zeros + - Optional scale and offset may be applied + - Boolean values are preserved + - Invalid conversions return False + + Parameters + ---------- + data : pd.Series + Object-typed Series + scale : numeric, optional + Scale factor + offset : numeric, optional + Offset value + + Returns + ------- + pd.Series + Converted Series + """ + if data.dtype != "object": + return data + + scale = scale if scale else self.numeric_scale + offset = offset if offset else self.numeric_offset + + scale = Decimal(str(scale)) + offset = Decimal(str(offset)) + + column_name = data.name + if column_name in self.preprocessing_functions: + data = data.apply(self.preprocessing_functions[column_name]) + + return data.apply(lambda x: to_numeric(x, scale, offset)) + + def object_to_object( + self, + data: pd.Series, + disable_white_strip: bool | str = False, + ) -> pd.Series: + """ + Clean object Series by stripping whitespace and nullifying empty strings. + + Parameters + ---------- + data : pd.Series + Object-typed Series + disable_white_strip : bool or {"l", "r"}, default False + Control whitespace stripping behavior + + Returns + ------- + pd.Series + Cleaned Series + """ + if data.dtype != "object": + return data + + if not disable_white_strip: + data = data.str.strip() + elif disable_white_strip == "l": + data = data.str.rstrip() + elif disable_white_strip == "r": + data = data.str.lstrip() + + return data.apply( + lambda x: None if isinstance(x, str) and (x.isspace() or not x) else x + ) + + def object_to_datetime( + self, + data: pd.Series, + datetime_format: str = "%Y%m%d", + ) -> pd.Series: + """ + Convert object Series to pandas datetime. + + Invalid values are coerced to NaT. 
+
+        Parameters
+        ----------
+        data : pd.Series
+            Object-typed Series
+        datetime_format : str, default "%Y%m%d"
+            Datetime parsing format
+
+        Returns
+        -------
+        pd.Series
+            Datetime Series
+        """
+        if data.dtype != "object":
+            return data
+
+        return pd.to_datetime(data, format=datetime_format, errors="coerce")
+
+
+def convert_and_decode(
+    data: pd.DataFrame,
+    convert_flag: bool = True,
+    decode_flag: bool = True,
+    converter_dict: dict[str, Callable[[pd.Series], pd.Series]] | None = None,
+    converter_kwargs: dict[str, dict] | None = None,
+    decoder_dict: dict[str, Callable[[pd.Series], pd.Series]] | None = None,
+) -> pd.DataFrame:
+    """Convert and decode data entries by using a pre-defined data model.
+
+    Converted and decoded columns are assigned back into `data` in place.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Data to convert and decode.
+    convert_flag : bool, default True
+        If True, apply converters to the columns defined in `converter_dict`.
+    decode_flag : bool, default True
+        If True, apply decoders to the columns defined in `decoder_dict`.
+    converter_dict : dict[str, callable], optional
+        Column-specific converter functions. If None, defaults to empty dict.
+    converter_kwargs : dict[str, dict], optional
+        Keyword arguments for each converter function.
+    decoder_dict : dict[str, callable], optional
+        Column-specific decoder functions. If None, defaults to empty dict.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with converted and decoded columns.
+ """ + converter_dict = converter_dict or {} + converter_kwargs = converter_kwargs or {} + decoder_dict = decoder_dict or {} + + if decode_flag: + for column, dec_func in decoder_dict.items(): + if column in data.columns: + decoded = dec_func(data[column]) + decoded.index = data[column].index + data[column] = decoded + + if convert_flag: + for column, conv_func in converter_dict.items(): + if column in data.columns: + kwargs = converter_kwargs.get(column, {}) + converted = conv_func(data[column], **kwargs) + converted.index = data[column].index + data[column] = converted + + return data diff --git a/cdm_reader_mapper/mdf_reader/utils/converters.py b/cdm_reader_mapper/mdf_reader/utils/converters.py deleted file mode 100755 index 398be5f6..00000000 --- a/cdm_reader_mapper/mdf_reader/utils/converters.py +++ /dev/null @@ -1,126 +0,0 @@ -"""pandas converting operators.""" - -from __future__ import annotations - -from decimal import Decimal - -import pandas as pd - -from .. import properties -from .utilities import convert_str_boolean - - -def max_decimal_places(*decimals): - """Get maximum number of decimal places for each Decimal number.""" - decimal_places = [ - -d.as_tuple().exponent if d.as_tuple().exponent < 0 else 0 for d in decimals - ] - return max(decimal_places) - - -class df_converters: - """Class for converting pandas DataFrame.""" - - def __init__(self, dtype): - self.dtype = dtype - self.numeric_scale = 1.0 if self.dtype == "float" else 1 - self.numeric_offset = 0.0 if self.dtype == "float" else 0 - self.preprocessing_functions = { - "PPPP": lambda x: ( - str(10000 + int(x)) if isinstance(x, str) and x.startswith("0") else x - ) - } - - def to_numeric(self, data, offset, scale) -> pd.Series: - """Convert object type elements of a pandas series to numeric type.""" - - def _to_numeric(x): - x = convert_str_boolean(x) - if isinstance(x, bool): - return x - if isinstance(x, str): - x = x.strip() - x.replace(" ", "0") - try: - x = Decimal(str(x)) - 
decimal_places = max_decimal_places(offset, scale, x) - result = offset + x * scale - return result.quantize(Decimal("1." + "0" * decimal_places)) - except ValueError: - return False - - offset = Decimal(str(offset)) - scale = Decimal(str(scale)) - - # Apply preprocessing if a function exists for this column - column_name = data.name - if column_name in self.preprocessing_functions: - data = data.apply(self.preprocessing_functions[column_name]) - - return data.apply(lambda x: _to_numeric(x)) - - def object_to_numeric(self, data, scale=None, offset=None) -> pd.Series: - """ - Convert the object type elements of a pandas series to numeric type. - - Right spaces are treated as zeros. Scale and offset can optionally be applied. - The final data type according to the class dtype. - - Parameters - ---------- - self : dtype, numeric_scale and numeric_offset - Pandas dataframe with a column per report sections. - The sections in the columns as a block strings. - data : pandas.Series - Series with data to convert. 
Data must be object type - - Keyword Arguments - ----------------- - scale : numeric, optional - Scale to apply after conversion to numeric - offset : numeric, optional - Offset to apply after conversion to numeric - column_name : str, optional - Name of the column being processed - - Returns - ------- - data : pandas.Series - Data series of type self.dtype - - """ - scale = scale if scale else self.numeric_scale - offset = offset if offset else self.numeric_offset - if data.dtype == "object": - data = self.to_numeric(data, offset, scale) - return data - - def object_to_object(self, data, disable_white_strip=False) -> pd.Series: - """DOCUMENTATION.""" - if data.dtype != "object": - return data - - if not disable_white_strip: - data = data.str.strip() - elif disable_white_strip == "l": - data = data.str.rstrip() - elif disable_white_strip == "r": - data = data.str.lstrip() - return data.apply( - lambda x: None if isinstance(x, str) and (x.isspace() or not x) else x - ) - - def object_to_datetime(self, data, datetime_format="%Y%m%d") -> pd.DateTimeIndex: - """DOCUMENTATION.""" - if data.dtype != "object": - return data - return pd.to_datetime(data, format=datetime_format, errors="coerce") - - -converters = dict() -for dtype in properties.numeric_types: - converters[dtype] = df_converters(dtype).object_to_numeric -converters["datetime"] = df_converters("datetime").object_to_datetime -converters["str"] = df_converters("str").object_to_object -converters["object"] = df_converters("object").object_to_object -converters["key"] = df_converters("key").object_to_object diff --git a/cdm_reader_mapper/mdf_reader/utils/decoders.py b/cdm_reader_mapper/mdf_reader/utils/decoders.py deleted file mode 100755 index 53b42205..00000000 --- a/cdm_reader_mapper/mdf_reader/utils/decoders.py +++ /dev/null @@ -1,33 +0,0 @@ -"""pandas decoding operators.""" - -from __future__ import annotations - -from .. 
import properties -from .utilities import convert_str_boolean - -import pandas as pd - - -class df_decoders: - """DOCUMENTATION.""" - - def __init__(self, dtype): - # Return as object, conversion to actual type in converters only! - self.dtype = "object" - - def base36(self, data) -> pd.Series: - """DOCUMENTATION.""" - - def _base36(x): - x = convert_str_boolean(x) - if isinstance(x, bool): - return x - return str(int(str(x), 36)) - - return data.apply(lambda x: _base36(x)) - - -decoders = {"base36": {}} -for dtype in properties.numeric_types: - decoders["base36"][dtype] = df_decoders(dtype).base36 -decoders["base36"]["key"] = df_decoders("key").base36 diff --git a/cdm_reader_mapper/mdf_reader/utils/filereader.py b/cdm_reader_mapper/mdf_reader/utils/filereader.py index 3f62fe0f..9b556cf6 100755 --- a/cdm_reader_mapper/mdf_reader/utils/filereader.py +++ b/cdm_reader_mapper/mdf_reader/utils/filereader.py @@ -2,206 +2,395 @@ from __future__ import annotations -import csv import logging -import os -from copy import deepcopy -from io import StringIO + +from typing import Callable, Any, Sequence, Mapping import pandas as pd import xarray as xr +from dataclasses import replace +from pandas.io.parsers import TextFileReader + from .. 
import properties -from ..schemas import schemas -from .configurator import Configurator -from .utilities import validate_path +from .utilities import ( + process_textfilereader, + remove_boolean_values, +) + +from .convert_and_decode import convert_and_decode +from .validators import validate +from .parser import ( + update_xr_config, + update_pd_config, + parse_pandas, + parse_netcdf, + build_parser_config, + ParserConfig, +) + +from cdm_reader_mapper.core.databundle import DataBundle + + +def _apply_or_chunk( + data: pd.DataFrame | TextFileReader, + func: Callable[..., Any], + func_args: Sequence[Any] | None = None, + func_kwargs: Mapping[str, Any] | None = None, + **kwargs: Mapping[str, Any], +): + """Apply a function directly or chunk-wise depending on input type.""" + func_args = func_args or [] + func_kwargs = func_kwargs or {} + if not isinstance(data, TextFileReader): + return func(data, *func_args, **func_kwargs) + return process_textfilereader( + data, + func, + func_args, + func_kwargs, + **kwargs, + ) + + +def _merge_kwargs(*dicts: Mapping[str, Any]) -> dict[str, Any]: + """Merge multiple keyword-argument dictionaries.""" + merged = {} + for d in dicts: + for k in d: + if k in merged: + raise ValueError(f"Duplicate kwarg '{k}' in open_data()") + merged[k] = d[k] + return merged + + +def _apply_multiindex(df: pd.DataFrame) -> pd.DataFrame: + """Convert tuple-based columns to a pandas MultiIndex.""" + if not df.columns.map(lambda x: isinstance(x, tuple)).all(): + return df + + df.columns = pd.MultiIndex.from_tuples( + [col if isinstance(col, tuple) else (None, col) for col in df.columns], + ) + return df + + +def _select_years( + df: pd.DataFrame, + selection: tuple[int | None, int | None], + year_col, +) -> pd.DataFrame: + """Filter rows of a DataFrame by a year range.""" + year_init, year_end = selection + if year_init is None and year_end is None: + return df + + years = pd.to_numeric(df[year_col], errors="coerce") + + mask = pd.Series(True, 
index=df.index) + + if year_init is not None: + mask &= years >= year_init + + if year_end is not None: + mask &= years <= year_end + + mask &= years.notna() + + return df.loc[mask].reset_index(drop=True) class FileReader: - """Class to read marine-meteorological data.""" + """ + Class to read marine-meteorological data. + + Provides a high-level interface to read, parse, filter, convert, + decode, and validate data from multiple sources (FWF, CSV, NetCDF). + """ def __init__( self, - source, - imodel=None, - ext_schema_path=None, - ext_schema_file=None, - ext_table_path=None, - year_init=None, - year_end=None, + imodel: str | None = None, + ext_schema_path: str | None = None, + ext_schema_file: str | None = None, ): - # 0. VALIDATE INPUT - if not imodel and not ext_schema_path: - logging.error( - "A valid input data model name or path to data model must be provided" - ) - return - if not os.path.isfile(source): - logging.error(f"Can't find input data file {source}") - return - if not validate_path("ext_schema_path", ext_schema_path): - return - - self.source = source - self.imodel = imodel - self.year_init = year_init - self.year_end = year_end - self.ext_table_path = ext_table_path - - # 1. GET DATA MODEL - # Schema reader will return empty if cannot read schema or is not valid - # and will log the corresponding error - # multiple_reports_per_line error also while reading schema - logging.info("READING DATA MODEL SCHEMA FILE...") - if ext_schema_path or ext_schema_file: - self.schema = schemas.read_schema( - ext_schema_path=ext_schema_path, ext_schema_file=ext_schema_file - ) + """ + Initialize FileReader with a data model and parser configuration. + + Parameters + ---------- + imodel : str + Name of the data model (e.g., 'ICOADS'). + args, kwargs + Arguments passed to ``build_parser_config``. 
+ """ + self.imodel: str = imodel + self.config: ParserConfig = build_parser_config( + imodel=imodel, + ext_schema_path=ext_schema_path, + ext_schema_file=ext_schema_file, + ) + + def _process_data( + self, + data: pd.DataFrame | TextFileReader, + convert_flag: bool = False, + decode_flag: bool = False, + converter_dict: dict | None = None, + converter_kwargs: dict | None = None, + decoder_dict: dict | None = None, + validate_flag: bool = False, + ext_table_path: str | None = None, + sections: Sequence[str] | None = None, + excludes: Sequence[str] | None = None, + year_init: int | None = None, + year_end: int | None = None, + config: ParserConfig | None = None, + parse_mode: str = "pandas", + ) -> tuple[pd.DataFrame, pd.DataFrame, ParserConfig]: + """ + Core processing of raw data: parse, filter, convert, decode, validate. + + Parameters + ---------- + data : pandas.DataFrame or TextFileReader + Input data. + convert_flag : bool + Whether to apply converters. + decode_flag : bool + Whether to apply decoders. + converter_dict : dict, optional + Mapping of columns to converter functions. + converter_kwargs : dict, optional + Keyword arguments for converters. + decoder_dict : dict, optional + Mapping of columns to decoder functions. + validate_flag : bool + Whether to apply validation. + ext_table_path : str, optional + Path to external validation tables. + sections : sequence of str, optional + Sections to include. + excludes : sequence of str, optional + Sections to exclude. + year_init : int, optional + Initial year for filtering. + year_end : int, optional + End year for filtering. + config : ParserConfig, optional + Parser configuration. + parse_mode : str + Parsing backend ('pandas' or 'netcdf'). 
+ + Returns + ------- + tuple of (data, mask, config) + - data : pandas.DataFrame with parsed, filtered, converted data + - mask : pandas.DataFrame with boolean mask for validation + - config : ParserConfig updated with final columns + """ + config = config or self.config + + if parse_mode == "pandas": + data = parse_pandas(data, config.order_specs, sections, excludes) + elif parse_mode == "netcdf": + data = parse_netcdf(data, config.order_specs, sections, excludes) else: - self.schema = schemas.read_schema(imodel=imodel) - - def _adjust_schema(self, ds, dtypes) -> dict: - sections = deepcopy(self.schema["sections"]) - for section in sections.keys(): - elements = sections[section]["elements"] - for data_var in elements.keys(): - not_in_data_vars = data_var not in ds.data_vars - not_in_glb_attrs = data_var not in ds.attrs - not_in_data_dims = data_var not in ds.dims - if not_in_data_vars and not_in_glb_attrs and not_in_data_dims: - del self.schema["sections"][section]["elements"][data_var] - continue - for attr, value in elements[data_var].items(): - if value != "__from_file__": - continue - if attr in ds[data_var].attrs: - self.schema["sections"][section]["elements"][data_var][attr] = ( - ds[data_var].attrs[attr] - ) - else: - del self.schema["sections"][section]["elements"][data_var][attr] - - def _select_years(self, df) -> pd.DataFrame: - def get_years_from_datetime(date): - try: - return date.year - except AttributeError: - return date - - if self.year_init is None and self.year_end is None: - return df + raise ValueError("parse_mode must be 'pandas' or 'netcdf'") + + data = _apply_multiindex(data) data_model = self.imodel.split("_")[0] - dates = df[properties.year_column[data_model]] - years = dates.apply(lambda x: get_years_from_datetime(x)) - years = years.astype(int) - - mask = pd.Series([True] * len(years)) - if self.year_init: - mask[years < self.year_init] = False - if self.year_end: - mask[years > self.year_end] = False - - index = mask[mask].index - 
return df.iloc[index].reset_index(drop=True) - - def _read_pandas(self, **kwargs) -> pd.DataFrame | pd.io.parsers.TextFileReader: - if (enc := kwargs.get("encoding")) is not None: - logging.info(f"Reading with encoding = {enc}") - return pd.read_fwf( - self.source, - header=None, - quotechar="\0", - escapechar="\0", - dtype=object, - skip_blank_lines=False, - **kwargs, - ) + year_col = properties.year_column[data_model] - def _read_netcdf(self, **kwargs) -> xr.Dataset: - ds = xr.open_mfdataset(self.source, **kwargs) - self._adjust_schema(ds, ds.dtypes) - return ds.squeeze() + data = _select_years(data, (year_init, year_end), year_col) - def _read_sections( - self, - TextParser, - order, - valid, - open_with, - ) -> pd.DataFrame: - if open_with == "pandas": - df = Configurator( - df=TextParser, schema=self.schema, order=order, valid=valid - ).open_pandas() - elif open_with == "netcdf": - df = Configurator( - df=TextParser, schema=self.schema, order=order, valid=valid - ).open_netcdf() + converter_dict = converter_dict or config.convert_decode["converter_dict"] + converter_kwargs = converter_kwargs or config.convert_decode["converter_kwargs"] + decoder_dict = decoder_dict or config.convert_decode["decoder_dict"] + + data = convert_and_decode( + data, + convert_flag=convert_flag, + decode_flag=decode_flag, + converter_dict=converter_dict, + converter_kwargs=converter_kwargs, + decoder_dict=decoder_dict, + ) + + if validate_flag: + mask = validate( + data, + imodel=self.imodel, + ext_table_path=ext_table_path, + attributes=config.validation, + disables=config.disable_reads, + ) else: - raise ValueError("open_with has to be one of ['pandas', 'netcdf']") + mask = pd.DataFrame(True, index=data.index, columns=data.columns) - self.columns = df.columns - return self._select_years(df) + data = remove_boolean_values(data, config.dtypes) + config = replace(config, columns=data.columns) - def get_configurations(self, order, valid) -> dict: - """DOCUMENTATION.""" - config_dict = 
Configurator( - schema=self.schema, order=order, valid=valid - ).get_configuration() - for attr, val in config_dict["self"].items(): - setattr(self, attr, val) - del config_dict["self"] - return config_dict + return data, mask, config def open_data( self, - order, - valid, - chunksize, - open_with="pandas", - encoding: str | None = None, - ) -> pd.DataFrame | pd.io.parsers.TextFileReader: - """DOCUMENTATION.""" - encoding = encoding or self.schema["header"].get("encoding") + source: str, + open_with: str = "pandas", + pd_kwargs: dict | None = None, + xr_kwargs: dict | None = None, + convert_kwargs: dict | None = None, + decode_kwargs: dict | None = None, + validate_kwargs: dict | None = None, + select_kwargs: dict | None = None, + ) -> ( + tuple[pd.DataFrame, pd.DataFrame, ParserConfig] + | tuple[TextFileReader, TextFileReader, ParserConfig] + ): + """ + Open and parse source data according to parser configuration. + + Parameters + ---------- + source : str + Path or pattern for input file(s). + open_with : str + Parser backend: 'pandas' or 'netcdf'. + pd_kwargs: dict, optional + Additional key-word arguments for parsing pandas-readable data. + xr_kwargs: dict, optional + Additional key-word arguments for parsing xarray-readable data. + convert_kwargs: dict, optional + Additional key-word arguments for data conversion. + decode_kwargs: dict, optional + Additional key-word arguments for data decoding. + validate_kwargs: dict, optional + Additional key-word arguments for data validation. + select_kwargs : dict, optional + Additional key-word arguments for selecting/filtering data. + + Returns + ------- + tuple + (data, mask, config) or chunked equivalents if using TextFileReader. 
+ """ + pd_kwargs = dict(pd_kwargs or {}) + xr_kwargs = dict(xr_kwargs or {}) + convert_kwargs = convert_kwargs or {} + decode_kwargs = decode_kwargs or {} + validate_kwargs = validate_kwargs or {} + select_kwargs = select_kwargs or {} + + func_kwargs = _merge_kwargs( + convert_kwargs, + decode_kwargs, + validate_kwargs, + select_kwargs, + ) + func_kwargs["parse_mode"] = open_with + if open_with == "netcdf": - TextParser = self._read_netcdf() + to_parse = xr.open_mfdataset(source, **xr_kwargs).squeeze() + config = update_xr_config(to_parse, self.config) + write_kwargs, read_kwargs = {}, {} elif open_with == "pandas": - TextParser = self._read_pandas( - encoding=encoding, - widths=[properties.MAX_FULL_REPORT_WIDTH], - skiprows=self.skiprows, - chunksize=chunksize, + config = update_pd_config(pd_kwargs, self.config) + pd_kwargs["encoding"] = config.encoding + pd_kwargs.setdefault("widths", [properties.MAX_FULL_REPORT_WIDTH]) + pd_kwargs.setdefault("header", None) + pd_kwargs.setdefault("quotechar", "\0") + pd_kwargs.setdefault("escapechar", "\0") + pd_kwargs.setdefault("dtype", object) + pd_kwargs.setdefault("skip_blank_lines", False) + + write_kwargs = {"encoding": pd_kwargs["encoding"]} + chunksize = pd_kwargs.get("chunksize") + read_kwargs = ( + {"chunksize": chunksize, "dtype": config.dtypes}, + {"chunksize": chunksize, "dtype": "boolean"}, ) - else: - raise ValueError("open_with has to be one of ['pandas', 'netcdf']") - if isinstance(TextParser, pd.DataFrame) or isinstance(TextParser, xr.Dataset): - return self._read_sections(TextParser, order, valid, open_with=open_with) + to_parse = pd.read_fwf(source, **pd_kwargs) else: - data_buffer = StringIO() - for i, df_ in enumerate(TextParser): - df = self._read_sections(df_, order, valid, open_with=open_with) - df.to_csv( - data_buffer, - header=False, - mode="a", - encoding=encoding, - index=False, - quoting=csv.QUOTE_NONE, - sep=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - 
data_buffer.seek(0) - data = pd.read_csv( - data_buffer, - names=df.columns, - chunksize=self.chunksize, - dtype=object, - parse_dates=self.parse_dates, - delimiter=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - return data + raise ValueError("open_with must be 'pandas' or 'netcdf'") + + func_kwargs["config"] = config + + return _apply_or_chunk( + to_parse, + self._process_data, + func_kwargs=func_kwargs, + makecopy=False, + write_kwargs=write_kwargs, + read_kwargs=read_kwargs, + ) + + def read( + self, + source: str, + pd_kwargs: dict | None = None, + xr_kwargs: dict | None = None, + convert_kwargs: dict | None = None, + decode_kwargs: dict | None = None, + validate_kwargs: dict | None = None, + select_kwargs: dict | None = None, + ) -> DataBundle: + """ + Read and process data from the given source. + + Parameters + ---------- + source : str + Path to input file(s). + pd_kwargs: dict, optional + Additional key-word arguments for parsing pandas-readable data. + xr_kwargs: dict, optional + Additional key-word arguments for parsing xarray-readable data. + convert_kwargs: dict, optional + Additional key-word arguments for data conversion. + decode_kwargs: dict, optional + Additional key-word arguments for data decoding. + validate_kwargs: dict, optional + Additional key-word arguments for data validation. + select_kwargs : dict, optional + Additional key-word arguments for selecting/filtering data. + + Notes + ----- + All kwargs are forwarded to ``open_data`` to customize the + parsing, conversion, decoding, validation, and selection steps. + + Returns + ------- + DataBundle + Container with processed data, mask, columns, dtypes, and metadata. 
+ """ + logging.info(f"EXTRACTING DATA FROM MODEL: {self.imodel}") + logging.info("Reading and parsing source data...") + + result = self.open_data( + source, + open_with=properties.open_file.get(self.imodel, "pandas"), + pd_kwargs=pd_kwargs, + xr_kwargs=xr_kwargs, + convert_kwargs=convert_kwargs, + decode_kwargs=decode_kwargs, + validate_kwargs=validate_kwargs, + select_kwargs=select_kwargs, + ) + + if not isinstance(result, tuple) or len(result) != 3: + raise RuntimeError("open_data() must return (data, mask, config)") + + data, mask, config = result + + return DataBundle( + data=data, + columns=config.columns, + dtypes=config.dtypes, + parse_dates=config.parse_dates, + encoding=config.encoding, + mask=mask, + imodel=self.imodel, + ) diff --git a/cdm_reader_mapper/mdf_reader/utils/parser.py b/cdm_reader_mapper/mdf_reader/utils/parser.py new file mode 100755 index 00000000..3ba2e9ae --- /dev/null +++ b/cdm_reader_mapper/mdf_reader/utils/parser.py @@ -0,0 +1,645 @@ +"""Auxiliary functions and class for reading, converting, decoding and validating MDF files.""" + +from __future__ import annotations + +import csv +import logging + +from dataclasses import dataclass, replace +from copy import deepcopy +from itertools import zip_longest +from typing import TypedDict, Any, Iterable + +import numpy as np +import pandas as pd +import xarray as xr + +from .. import properties +from ..schemas.schemas import read_schema, SchemaDict +from .utilities import convert_dtypes + +from .convert_and_decode import Converters, Decoders + + +class OrderSpec(TypedDict): + """ + Parsing specification for a single section. + + Defines the header configuration, element layout, and parsing mode + (fixed-width or delimited) for a section. + """ + + header: dict[str, Any] + elements: dict[str, dict[str, Any]] + is_delimited: bool + + +@dataclass(frozen=True) +class ParserConfig: + """ + Configuration for dataset parsing. 
+ + Parameters + ---------- + order_specs : dict + Column ordering specifications. + disable_reads : list[str] + Columns or sources to skip during parsing. + dtypes : dict + Column data type mappings. + parse_dates : list[str] + Columns to parse as datetimes. + convert_decode : dict + Value conversion or decoding rules. + validation : dict + Validation rules for parsed data. + encoding : str + Text encoding used when reading input data. + columns : pd.Index or pd.MultiIndex or None, optional + Explicit column index to apply. If None, inferred from input. + """ + + order_specs: OrderSpec + disable_reads: list[str] + dtypes: dict + parse_dates: list[str] + convert_decode: dict + validation: dict + encoding: str + columns: pd.Index | pd.MultiIndex | None = None + + +def _get_index(section: str, order: str, length: int) -> str | tuple[str, str]: + """Build an index key based on section count.""" + return section if length == 1 else (order, section) + + +def _get_ignore(section_dict: dict[str, Any]) -> bool: + """Determine whether a section should be ignored.""" + ignore = section_dict.get("ignore", False) + if isinstance(ignore, str): + ignore = ignore.lower() in {"true", "1", "yes"} + return bool(ignore) + + +def _convert_dtype_to_default(dtype: str | None) -> str | None: + """Normalize deprecated or aliased dtype strings.""" + if dtype is None: + return None + elif dtype == "float": + return dtype + elif dtype == "int": + return properties.pandas_int + elif "float" in dtype.lower(): + logging.warning(f"Set column type from deprecated {dtype} to float.") + return "float" + elif "int" in dtype.lower(): + logging.warning(f"Set column type from deprecated {dtype} to int.") + return properties.pandas_int + return dtype + + +def _parse_fixed_width( + line: str, + i: int, + header: dict[str, Any], + elements: dict[str, dict[str, Any]], + sections: set | None, + excludes: set, + out: dict[Any, Any], +) -> int: + """Parse a fixed-width section of a line into an output 
dictionary.""" + section_length = header.get("length", properties.MAX_FULL_REPORT_WIDTH) + delimiter = header.get("delimiter") + sentinel = header.get("sentinel") + + section_end = i + section_length + bad_sentinel = sentinel is not None and not line.startswith(sentinel, i) + line_len = len(line) + delim_len = len(delimiter) if delimiter else 0 + + for spec in elements.values(): + field_length = spec.get("field_length", 0) + index = spec.get("index") + ignore = spec.get("ignore", False) + missing_value = spec.get("missing_value") + + missing = True + j = i if bad_sentinel else i + field_length + if j > section_end: + missing = False + j = section_end + + if not ignore: + key = index[0] if isinstance(index, tuple) else index + if (sections is None or key in sections) and key not in excludes: + if i < j: + value = line[i:j] + if not value.strip() or value == missing_value: + value = True + else: + value = False if missing else True + + out[index] = value + + if ( + delimiter + and j + delim_len <= line_len + and line[j : j + delim_len] == delimiter + ): + j += delim_len + + i = j + + return i + + +def _parse_delimited( + line: str, + i: int, + header: dict[str, Any], + elements: dict[str, dict[str, Any]], + sections: set | None, + excludes: set, + out: dict[Any, Any], +) -> int: + """Parse a delimiter-separated section of a line into an output dictionary.""" + delimiter = header["delimiter"] + fields = next(csv.reader([line[i:]], delimiter=delimiter)) + + for element, value in zip_longest(elements.keys(), fields): + index = elements[element]["index"] + key = index[0] if isinstance(index, tuple) else index + + if (sections is None or key in sections) and key not in excludes: + out[index] = value.strip() if value is not None else None + + return len(line) + + +def _parse_line( + line: str, + order_specs: dict[str, OrderSpec], + sections: set | None, + excludes: set, +) -> dict[str, dict[Any, Any]]: + """Parse a line using the provided parser configuration.""" + i = 0 + 
out = {} + max_width = properties.MAX_FULL_REPORT_WIDTH + + for order, spec in order_specs.items(): + header = spec["header"] + elements = spec["elements"] + + if header.get("disable_read"): + if order not in excludes: + out[order] = line[i : i + max_width] + i += header.get("length", max_width) + continue + + if spec["is_delimited"]: + i = _parse_delimited(line, i, header, elements, sections, excludes, out) + else: + i = _parse_fixed_width(line, i, header, elements, sections, excludes, out) + + return out + + +def parse_pandas( + df: pd.DataFrame, + order_specs: dict[str, OrderSpec], + sections: Iterable[str] | None = None, + excludes: Iterable[str] | None = None, +) -> pd.DataFrame: + """ + Parse a pandas DataFrame containing raw record lines. + + Each row of the input DataFrame is expected to contain a single + fixed-width or delimiter-separated record, which is parsed according + to the provided order specifications. + + Parameters + ---------- + df : pandas.DataFrame + Input DataFrame with exactly one column (column index ``0``), + where each row contains a raw record string. + order_specs : dict[str, OrderSpec] + Mapping of section names to parsing specifications. Each specification + defines the header configuration, element layout, and parsing mode + for a section. + sections : iterable of str or None + Section names to include. If None, all sections are parsed. + excludes : iterable of str or None + Section names to exclude from parsing. + + Returns + ------- + pandas.DataFrame + DataFrame constructed from parsed records. Columns are derived + from element indices and may be strings or tuples. 
+ + Examples + -------- + Example ``order_specs`` structure:: + + order_specs = { + "core": { + "header": { + "sentinel": None, + "length": 108, + }, + "elements": { + "YR": { + "index": ("core", "YR"), + "field_length": 4, + "ignore": False, + "column_type": "Int64", + "missing_value": None, + }, + "MO": { + "index": ("core", "MO"), + "field_length": 2, + "ignore": False, + "column_type": "Int64", + "missing_value": None, + }, + }, + "is_delimited": False, + } + } + + Notes + ----- + - Ignored elements (``ignore=True``) are skipped. + - Disabled sections (``disable_read=True``) are included as raw strings in the output. + - Missing elements are filled with ``False``. + - Object-type columns are stripped, decoded from UTF-8 if necessary, and empty + strings are replaced with ``True``. + - No type conversion is performed at this stage. + """ + col = df.columns[0] + + sections = set(sections) if sections is not None else None + excludes = set(excludes) if excludes else set() + + records = df[col].map( + lambda line: _parse_line(line, order_specs, sections, excludes) + ) + return pd.DataFrame.from_records(records.to_list()) + + +def parse_netcdf( + ds: xr.Dataset, + order_specs: dict[str, OrderSpec], + sections: Iterable[str] | None = None, + excludes: Iterable[str] | None = None, +) -> pd.DataFrame: + """ + Parse an xarray Dataset into a pandas DataFrame based on order specifications. + + This function converts an xarray Dataset into a tabular pandas DataFrame + according to parsing rules defined in `order_specs`. Data variables, dimensions, + and global attributes are mapped to columns as specified, with ignored or missing + elements handled automatically. + + Parameters + ---------- + ds : xarray.Dataset + Input Dataset containing data variables, dimensions, and attributes. + order_specs : dict[str, OrderSpec] + Mapping of section names to parsing specifications. 
Each specification + defines the header configuration, element layout, and parsing mode + for a section. + sections : iterable of str or None + Section names to include. If None, all sections are parsed. + excludes : iterable of str or None + Section names to exclude from parsing. + + Returns + ------- + pandas.DataFrame + DataFrame constructed from the Dataset according to the parsing specification. + Columns are derived from element indices. Missing fields are filled with + False, disabled sections with NaN, and empty strings are converted to True. + + Examples + -------- + Example ``order_specs`` structure:: + + order_specs = { + "global_attributes": { + "header": { + "disable_read": True, + }, + "elements": { + "title": { + "index": ("global_attributes", "title"), + "ignore": False, + "column_type": "str", + "missing_value": None, + }, + "institution": { + "index": ("global_attributes", "institution"), + "ignore": False, + "column_type": "str", + "missing_value": None, + }, + }, + "is_delimited": False, + } + } + + Notes + ----- + - Variables, dimensions, and global attributes in `ds` are mapped to columns + according to the element `index`. + - Ignored elements (`ignore=True`) are skipped. + - Disabled sections (`disable_read=True`) are added as columns filled with NaN. + - Missing elements are added as columns filled with False. + - Object-type columns are decoded from UTF-8, stripped, and empty strings + replaced with True. 
+    """
+    sections = set(sections) if sections is not None else None
+    excludes = set(excludes) if excludes else set()
+
+    missing_values = []
+    attrs = {}
+    renames = {}
+    disables = []
+
+    data_vars = ds.data_vars
+    dims = ds.dims
+    ds_attrs = ds.attrs
+
+    for order, ospec in order_specs.items():
+        if sections is not None and order not in sections:
+            continue
+        if order in excludes:
+            continue
+
+        header = ospec.get("header", {})
+        if header.get("disable_read") is True:
+            disables.append(order)
+            continue
+
+        for element, espec in ospec.get("elements", {}).items():
+            if espec.get("ignore"):
+                continue
+
+            index = espec["index"]
+
+            if element in data_vars or element in dims:
+                renames[element] = index
+            elif element in ds_attrs:
+                attrs[index] = ds_attrs[element]
+            else:
+                missing_values.append(index)
+
+    df = ds[list(renames)].to_dataframe().reset_index()
+    df = df[list(renames)].rename(columns=renames)
+
+    if attrs:
+        df = df.assign(**{k: v.replace("\n", "; ") for k, v in attrs.items()})
+
+    if disables:
+        df[disables] = np.nan
+
+    obj_cols = df.select_dtypes(include="object").columns
+    for col in obj_cols:
+        # Decode byte strings, strip whitespace, and map empty
+        # strings to True (i.e. treat them as missing markers).
+        s = df[col].str.decode("utf-8").str.strip()
+        df[col] = s.map(lambda x: True if x == "" else x)
+
+    if missing_values:
+        df[missing_values] = False
+
+    return df
+
+
+def build_parser_config(
+    imodel: str | None = None,
+    ext_schema_path: str | None = None,
+    ext_schema_file: str | None = None,
+) -> ParserConfig:
+    """
+    Build a ParserConfig from a normalized schema definition.
+
+    This function reads a schema definition and constructs a fully populated
+    :py:class:`ParserConfig` instance. The resulting configuration contains
+    parsing order specifications, data types, converters, decoders, validation
+    rules, and encoding information required to parse raw input records.
+
+    Parameters
+    ----------
+    imodel : str or None, optional
+        Internal model identifier used to locate the schema.
+ ext_schema_path : str or None, optional + Path to an external schema directory. + ext_schema_file : str or None, optional + Filename of an external schema definition. + + Returns + ------- + ParserConfig + Fully initialized parser configuration derived from the schema. + + Notes + ----- + - Section parsing order is derived from ``schema["header"]["parsing_order"]``. + - Sections marked with ``disable_read=True`` are recorded in + ``ParserConfig.disable_reads``. + - Elements marked as ignored or disabled are excluded from dtype, + conversion, and validation setup. + - Column indices may be strings or tuples depending on the number of + sections in the schema. + - Deprecated or aliased column types are normalized via + ``_convert_dtype_to_default``. + - Converter and decoder functions are resolved dynamically based on + column type and encoding. + - Validation rules may include value ranges and code tables, as defined + in the schema. + """ + schema: SchemaDict = read_schema( + imodel=imodel, + ext_schema_path=ext_schema_path, + ext_schema_file=ext_schema_file, + ) + + orders = [ + order + for group in schema["header"]["parsing_order"] + for section_list in group.values() + for order in section_list + ] + olength = len(orders) + + dtypes: dict[Any, Any] = {} + validation: dict[Any, dict[str, Any]] = {} + order_specs: dict[str, OrderSpec] = {} + disable_reads: list[str] = [] + converters: dict[Any, Any] = {} + converter_kwargs: dict[Any, dict[str, Any]] = {} + decoders: dict[Any, Any] = {} + + for order in orders: + section = schema["sections"][order] + header = section["header"] + elements = section.get("elements", {}) + + if header.get("disable_read"): + disable_reads.append(order) + + element_specs: dict[str, dict[str, Any]] = {} + for name, meta in elements.items(): + index = _get_index(name, order, olength) + ignore = _get_ignore(meta) + + element_specs[name] = { + "index": index, + "ignore": ignore, + "missing_value": meta.get("missing_value"), + 
"field_length": meta.get( + "field_length", properties.MAX_FULL_REPORT_WIDTH + ), + } + + if ignore or meta.get("disable_read", False): + continue + + ctype = _convert_dtype_to_default(meta.get("column_type")) + dtype = properties.pandas_dtypes.get(ctype) + if dtype is not None: + dtypes[index] = dtype + + conv_func = Converters(ctype).converter() + if conv_func: + converters[index] = conv_func + + conv_args = { + k: meta.get(k) + for k in properties.data_type_conversion_args.get(ctype, []) + } + if conv_args: + converter_kwargs[index] = conv_args + + encoding = meta.get("encoding") + if encoding: + dec_func = Decoders(ctype, encoding).decoder() + if dec_func: + decoders[index] = dec_func + + validation[index] = {} + if ctype: + validation[index]["column_type"] = ctype + for k in ("valid_min", "valid_max", "codetable"): + if meta.get(k) is not None: + validation[index][k] = meta[k] + + order_specs[order] = OrderSpec( + header=header, + elements=element_specs, + is_delimited=header.get("format") == "delimited", + ) + + dtypes, parse_dates = convert_dtypes(dtypes) + + return ParserConfig( + order_specs=order_specs, + disable_reads=disable_reads, + dtypes=dtypes, + parse_dates=parse_dates, + convert_decode={ + "converter_dict": converters, + "converter_kwargs": converter_kwargs, + "decoder_dict": decoders, + }, + validation=validation, + encoding=schema["header"].get("encoding", "utf-8"), + ) + + +def update_xr_config(ds: xr.Dataset, config: ParserConfig) -> ParserConfig: + """ + Update a ParserConfig instance using metadata from an xarray Dataset. + + This function adjusts the parser configuration based on the contents of + the provided Dataset. Elements not present in the Dataset are marked as + ignored, and validation rules marked as ``"__from_file__"`` are populated + from Dataset variable attributes when available. + + Parameters + ---------- + ds : xarray.Dataset + Input Dataset containing data variables, dimensions, and attributes. 
+ config : ParserConfig + Existing parser configuration. + + Returns + ------- + ParserConfig + Updated parser configuration with modified order specifications and + validation rules derived from the Dataset. + """ + new_order_specs = deepcopy(config.order_specs) + new_validation = deepcopy(config.validation) + + for order, ospecs in new_order_specs.items(): + elements = ospecs["elements"] + + for element, especs in elements.items(): + if ( + element not in ds.data_vars + and element not in ds.attrs + and element not in ds.dims + ): + especs["ignore"] = True + continue + + index = especs.get("index") + if index not in new_validation: + continue + + for attr in list(new_validation[index].keys()): + if new_validation[index][attr] != "__from_file__": + continue + + ds_attrs = ds[element].attrs + if attr in ds_attrs: + new_validation[index][attr] = ds_attrs[attr] + else: + new_validation[index].pop(attr, None) + + return replace( + config, + order_specs=new_order_specs, + validation=new_validation, + ) + + +def update_pd_config(pd_kwargs: dict[str, Any], config: ParserConfig) -> ParserConfig: + """ + Update a ParserConfig instance using pandas keyword arguments. + + Currently, only the ``encoding`` option is supported. If an encoding + is provided in ``pd_kwargs``, a new ParserConfig instance is returned + with the updated encoding. Otherwise, the original configuration is + returned unchanged. + + Parameters + ---------- + pd_kwargs : dict[str, Any] + Keyword arguments intended for pandas I/O functions. + config : ParserConfig + Existing parser configuration. + + Returns + ------- + ParserConfig + Updated parser configuration if applicable, otherwise the original + configuration. 
+ """ + if "encoding" in pd_kwargs and pd_kwargs["encoding"]: + return replace(config, encoding=pd_kwargs["encoding"]) + return config diff --git a/cdm_reader_mapper/mdf_reader/utils/utilities.py b/cdm_reader_mapper/mdf_reader/utils/utilities.py index 67f4930b..5b47ef2c 100755 --- a/cdm_reader_mapper/mdf_reader/utils/utilities.py +++ b/cdm_reader_mapper/mdf_reader/utils/utilities.py @@ -2,12 +2,229 @@ from __future__ import annotations +import ast +import csv import logging import os +from io import StringIO +from pathlib import Path +from typing import Any, Iterable, Callable + +import pandas as pd + +from .. import properties + +from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy + + +def as_list(x: str | Iterable[Any] | None) -> list[Any] | None: + """ + Ensure the input is a list; keep None as None. + + Parameters + ---------- + x : str, iterable, or None + Input value to convert. Strings become single-element lists. + Other iterables are converted to a list preserving iteration order. + If None is passed, None is returned. + + Returns + ------- + list or None + Converted list or None if input was None. + + Notes + ----- + Sets are inherently unordered; the resulting list may not have a predictable order. + """ + if x is None: + return None + if isinstance(x, str): + return [x] + return list(x) + + +def as_path(value: str | os.PathLike, name: str) -> Path: + """ + Ensure the input is a Path-like object. + + Parameters + ---------- + value : str or os.PathLike + The value to convert to a Path. + name : str + Name of the parameter, used in error messages. + + Returns + ------- + pathlib.Path + Path object representing `value`. + + Raises + ------ + TypeError + If `value` is not a string or Path-like object. + """ + if isinstance(value, (str, os.PathLike)): + return Path(value) + raise TypeError(f"{name} must be str or Path-like") + + +def join(col: Any | Iterable[Any]) -> str: + """ + Join multi-level columns as a colon-separated string. 
+ + Parameters + ---------- + col : any or iterable of any + A column name, which may be a single value or a list/tuple of values. + + Returns + ------- + str + Colon-separated string if input is iterable, or string of the single value. + """ + if isinstance(col, (list, tuple)): + return ":".join(str(c) for c in col) + return str(col) + + +def update_dtypes(dtypes: dict[str, Any], columns: Iterable[str]) -> dict[str, Any]: + """ + Filter dtypes dictionary to only include columns present in 'columns'. + + Parameters + ---------- + dtypes : dict + Dictionary mapping column names to their data types. + columns : iterable of str + List of columns to keep. + + Returns + ------- + dict + Filtered dictionary containing only keys present in 'columns'. + """ + if isinstance(dtypes, dict): + dtypes = {k: v for k, v in dtypes.items() if k in columns} + return dtypes + + +def update_column_names( + dtypes: dict[str, Any] | str, col_o: str, col_n: str +) -> dict[str, Any] | str: + """ + Rename a column in a dtypes dictionary if it exists. + + Parameters + ---------- + dtypes : dict or str + Dictionary mapping column names to data types, or a string. + col_o : str + Original column name to rename. + col_n : str + New column name. + + Returns + ------- + dict or str + Updated dictionary with column renamed, or string unchanged. + """ + if isinstance(dtypes, str): + return dtypes + if col_o != col_n and col_o in dtypes.keys(): + dtypes[col_n] = dtypes[col_o] + del dtypes[col_o] + return dtypes + + +def update_column_labels(columns: Iterable[str | tuple]) -> pd.Index | pd.MultiIndex: + """ + Convert string column labels to tuples if needed, producing a pandas Index or MultiIndex. + + This function attempts to parse each column label: + - If the label is a string representation of a tuple (e.g., "('A','B')"), it will be converted to a tuple. + - If the label is a string containing a colon (e.g., "A:B"), it will be split into a tuple ("A", "B"). 
+ - Otherwise, the label is left unchanged. + + If all resulting labels are tuples, a pandas MultiIndex is returned. + Otherwise, a regular pandas Index is returned. + + Parameters + ---------- + columns : iterable of str or tuple + Column labels to convert. + + Returns + ------- + pd.Index or pd.MultiIndex + Converted column labels as a pandas Index or MultiIndex. + """ + new_cols = [] + all_tuples = True + + for col in columns: + try: + col_ = ast.literal_eval(col) + except Exception: + if isinstance(col, str) and ":" in col: + col_ = tuple(col.split(":")) + else: + col_ = col + all_tuples &= isinstance(col_, tuple) + new_cols.append(col_) + + if all_tuples: + return pd.MultiIndex.from_tuples(new_cols) + return pd.Index(new_cols) + + +def read_csv(filepath, col_subset=None, **kwargs) -> pd.DataFrame: + """ + Safe CSV reader that handles missing files and column subsets. + + Parameters + ---------- + filepath : str or Path or None + Path to the CSV file. + col_subset : list of str, optional + Subset of columns to read from the CSV. + kwargs : any + Additional keyword arguments passed to pandas.read_csv. + + Returns + ------- + pd.DataFrame + The CSV as a DataFrame. Empty if file does not exist. + """ + if filepath is None or not Path(filepath).is_file(): + logging.warning(f"File not found: {filepath}") + return pd.DataFrame() + + df = pd.read_csv(filepath, delimiter=",", **kwargs) + df.columns = update_column_labels(df.columns) + if col_subset is not None: + df = df[col_subset] + + return df + def convert_dtypes(dtypes) -> tuple[str]: - """Convert datetime to object.""" + """ + Convert datetime columns to object dtype and return columns to parse as dates. + + Parameters + ---------- + dtypes : dict[str, str] + Dictionary mapping column names to pandas dtypes. + + Returns + ------- + tuple + - Updated dtypes dictionary (datetime converted to object). + - List of columns originally marked as datetime. 
+ """ parse_dates = [] for key, value in dtypes.items(): if value == "datetime": @@ -17,60 +234,57 @@ def convert_dtypes(dtypes) -> tuple[str]: def validate_arg(arg_name, arg_value, arg_type) -> bool: - """Validate input argument is as expected type. + """ + Validate that the input argument is of the expected type. Parameters ---------- arg_name : str - Name of the argument - arg_value : arg_type - Value of the argument + Name of the argument. + arg_value : Any + Value of the argument. arg_type : type - Type of the argument + Expected type of the argument. Returns ------- - boolean: - Returns True if type of `arg_value` equals `arg_type` + bool + True if `arg_value` is of type `arg_type` or None. + + Raises + ------ + ValueError + If `arg_value` is not of type `arg_type` and not None. """ if arg_value and not isinstance(arg_value, arg_type): - logging.error( - f"Argument {arg_name} must be {arg_type}, input type is {type(arg_value)}" + raise ValueError( + f"Argument {arg_name} must be {arg_type} or None, not {type(arg_value)}" ) - return False - return True - - -def validate_path(arg_name, arg_value) -> bool: - """Validate input argument is an existing directory. - - Parameters - ---------- - arg_name : str - Name of the argument - arg_value : str - Value of the argument - Returns - ------- - boolean - Returns True if `arg_name` is an existing directory. - """ - if arg_value and not os.path.isdir(arg_value): - logging.error(f"{arg_name} could not find path {arg_value}") - return False return True -def adjust_dtype(dtype, df) -> dict: - """Adjust dtypes to DataFrame.""" +def _adjust_dtype(dtype, df) -> dict: + """Filter dtype dictionary to only include columns present in the DataFrame.""" if not isinstance(dtype, dict): return dtype return {k: v for k, v in dtype.items() if k in df.columns} def convert_str_boolean(x) -> str | bool: - """Convert str boolean value to boolean value.""" + """ + Convert string boolean values 'True'/'False' to Python booleans. 
+ + Parameters + ---------- + x : Any + Input value. + + Returns + ------- + bool or original value + True if 'True', False if 'False', else original value. + """ if x == "True": x = True if x == "False": @@ -78,11 +292,135 @@ def convert_str_boolean(x) -> str | bool: return x -def remove_boolean_values(x) -> str | None: - """Remove boolean values.""" +def _remove_boolean_values(x) -> str | None: + """Remove boolean values or string representations of boolean.""" x = convert_str_boolean(x) - if x is True: - return - if x is False: - return + if x is True or x is False: + return None return x + + +def remove_boolean_values(data, dtypes) -> pd.DataFrame: + """ + Remove boolean values from a DataFrame and adjust dtypes. + + Parameters + ---------- + data : pd.DataFrame + Input data. + dtypes : dict + Dictionary mapping column names to desired dtypes. + + Returns + ------- + pd.DataFrame + DataFrame with booleans removed and dtype adjusted. + """ + data = data.map(_remove_boolean_values) + dtype = _adjust_dtype(dtypes, data) + return data.astype(dtype) + + +def process_textfilereader( + reader: Iterable[pd.DataFrame], + func: Callable, + func_args: tuple = (), + func_kwargs: dict[str, Any] | None = None, + read_kwargs: dict[str, Any] | tuple[dict[str, Any], ...] | None = None, + write_kwargs: dict[str, Any] | None = None, + makecopy: bool = True, +) -> tuple[pd.DataFrame, ...]: + """ + Process a stream of DataFrames using a function and return processed results. + + Each DataFrame from `reader` is passed to `func`, which can return one or more + DataFrames or other outputs. DataFrame outputs are concatenated in memory and + returned as a tuple along with any additional non-DataFrame outputs. + + Parameters + ---------- + reader : Iterable[pd.DataFrame] + An iterable of DataFrames (e.g., a CSV reader returning chunks). + func : Callable + Function to apply to each DataFrame. + func_args : tuple, optional + Positional arguments passed to `func`. 
+ func_kwargs : dict, optional + Keyword arguments passed to `func`. + read_kwargs : dict or tuple of dict, optional + Arguments to pass to `pd.read_csv` when reconstructing output DataFrames. + write_kwargs : dict, optional + Arguments to pass to `DataFrame.to_csv` when buffering output. + makecopy : bool, default True + If True, makes a copy of each input DataFrame before processing. + + Returns + ------- + tuple + A tuple containing: + - One or more processed DataFrames (in the same order as returned by `func`) + - Any additional outputs from `func` that are not DataFrames + """ + if func_kwargs is None: + func_kwargs = {} + if read_kwargs is None: + read_kwargs = {} + if write_kwargs is None: + write_kwargs = {} + + buffers = [] + columns = [] + + if makecopy is True: + reader = make_copy(reader) + + output_add = [] + + for df in reader: + outputs = func(df, *func_args, **func_kwargs) + if not isinstance(outputs, tuple): + outputs = (outputs,) + + output_dfs = [] + first_chunk = not buffers + + for out in outputs: + if isinstance(out, pd.DataFrame): + output_dfs.append(out) + elif first_chunk: + output_add.append(out) + + if not buffers: + buffers = [StringIO() for _ in output_dfs] + columns = [out.columns for out in output_dfs] + + for buffer, out_df in zip(buffers, output_dfs): + out_df.to_csv( + buffer, + header=False, + mode="a", + index=False, + quoting=csv.QUOTE_NONE, + sep=properties.internal_delimiter, + quotechar="\0", + escapechar="\0", + **write_kwargs, + ) + + if isinstance(read_kwargs, dict): + read_kwargs = tuple(read_kwargs for _ in range(len(buffers))) + + result_dfs = [] + for buffer, cols, rk in zip(buffers, columns, read_kwargs): + buffer.seek(0) + result_dfs.append( + pd.read_csv( + buffer, + names=cols, + delimiter=properties.internal_delimiter, + quotechar="\0", + escapechar="\0", + **rk, + ) + ) + return tuple(result_dfs + output_add) diff --git a/cdm_reader_mapper/mdf_reader/utils/validators.py 
b/cdm_reader_mapper/mdf_reader/utils/validators.py index 8a4d2738..d4d84057 100755 --- a/cdm_reader_mapper/mdf_reader/utils/validators.py +++ b/cdm_reader_mapper/mdf_reader/utils/validators.py @@ -1,233 +1,222 @@ -"""Validate entries.""" +"""Data validation module.""" from __future__ import annotations import logging - import numpy as np import pandas as pd +from typing import Any, Iterable + from .. import properties from ..codes import codes -from ..schemas import schemas from .utilities import convert_str_boolean -def validate_datetime(elements, data) -> pd.DataFrame: - """DOCUMENTATION.""" - - def is_date_object(object): - if hasattr(object, "year"): - return True - - mask = pd.DataFrame(index=data.index, data=False, columns=elements) - mask[elements] = ( - data[elements].apply(np.vectorize(is_date_object)) | data[elements].isna() - ) - return mask - - -def validate_numeric(elements, data, schema) -> pd.DataFrame: - """DOCUMENTATION.""" - - # Find thresholds in schema. Flag if not available -> warn - def _to_numeric(x): - if x is None: - return np.nan - x = convert_str_boolean(x) - if isinstance(x, bool): - return x - try: - return float(x) - except ValueError: - return False - - data[elements] = data[elements].map(_to_numeric) - mask = pd.DataFrame(index=data.index, data=False, columns=elements) - lower = {x: schema.get(x).get("valid_min", -np.inf) for x in elements} - upper = {x: schema.get(x).get("valid_max", np.inf) for x in elements} - - set_elements = [ - x for x in lower.keys() if lower.get(x) != -np.inf and upper.get(x) != np.inf - ] - - if len([x for x in elements if x not in set_elements]) > 0: - logging.warning( - "Data numeric elements with missing upper or lower threshold: {}".format( - ",".join([str(x) for x in elements if x not in set_elements]) - ) - ) - logging.warning( - "Corresponding upper and/or lower bounds set to +/-inf for validation" - ) - mask[elements] = ( - (data[elements] >= [lower.get(x) for x in elements]) - & (data[elements] <= 
[upper.get(x) for x in elements]) - ) | data[elements].isna() - return mask - - -def validate_str(elements, data) -> pd.DataFrame: - """DOCUMENTATION.""" - return pd.DataFrame(index=data.index, data=True, columns=elements) - - -def validate_codes(elements, data, schema, imodel, ext_table_path) -> pd.DataFrame: - """DOCUMENTATION.""" - mask = pd.DataFrame(index=data.index, data=False, columns=elements) - for element in elements: - code_table_name = schema.get(element).get("codetable") - if not code_table_name: - logging.error(f"Code table not defined for element {element}") - logging.warning("Element mask set to False") - continue +def _is_false(x: Any) -> bool: + """Check if a value is exactly False.""" + return x is False - table = codes.read_table( - code_table_name, - imodel=imodel, - ext_table_path=ext_table_path, - ) - if not table: - continue - dtype = properties.pandas_dtypes.get(schema.get(element).get("column_type")) +def _is_true(x: Any) -> bool: + """Check if a value is exactly True.""" + return x is True + + +def validate_datetime(series: pd.Series) -> pd.Series: + """ + Validate that entries in a pandas Series can be converted to datetime. + + Missing values are treated as valid. + + Parameters + ---------- + series : pd.Series + Series of object values to validate + + Returns + ------- + pd.Series + Boolean Series indicating valid entries + """ + dates = pd.to_datetime(series, errors="coerce") + return dates.notna() | series.isna() + + +def validate_numeric( + series: pd.Series, valid_min: float, valid_max: float +) -> pd.Series: + """ + Validate that entries in a pandas Series are numeric and within a range. + + - Converts boolean-like strings to bools. + - Invalid values are marked as False; missing values (NaN) are treated as valid.
+ + Parameters + ---------- + series : pd.Series + Series of object values to validate + valid_min : float + Minimum valid value + valid_max : float + Maximum valid value + + Returns + ------- + pd.Series + Boolean Series indicating valid entries + """ + converted = series.apply(convert_str_boolean) + numeric = pd.to_numeric(converted, errors="coerce") + valid_range = numeric.between(valid_min, valid_max) + return valid_range | series.isna() + - table_keys = list(table.keys()) - validation_df = data[element] - value = validation_df.astype(dtype).astype("str") - valid = validation_df.notna() - mask_ = value.isin(table_keys) - mask[element] = mask_.where(valid, True) | validation_df.isna() +def validate_str(series: pd.Series) -> pd.Series: + """ + Validate that entries in a pandas Series are strings. - return mask + Currently all values are treated as valid. + Parameters + ---------- + series : pd.Series + Series of object values to validate + + Returns + ------- + pd.Series + Boolean Series with all True + """ + return pd.Series(True, index=series.index, dtype="boolean") -def _get_elements(elements, element_atts, key) -> list[str]: - def _condition(x): - column_types = element_atts.get(x).get("column_type") - if key == "numeric_types": - return column_types in properties.numeric_types - return column_types == key - return [x for x in elements if _condition(x)] +def validate_codes( + series: pd.Series, code_table: Iterable[Any], column_type: str +) -> pd.Series: + """ + Validate that entries in a pandas Series exist in a provided code table. + Missing values are treated as valid. 
-def _element_tuples(numeric_elements, datetime_elements, coded_elements) -> bool: - ele_tpl = [ - isinstance(x, tuple) - for x in numeric_elements + datetime_elements + coded_elements - ] - return any(ele_tpl) + Parameters + ---------- + series : pd.Series + Series of object values to validate + code_table : Iterable + Allowed codes for validation + column_type : str + Column type for dtype lookup (via properties.pandas_dtypes) + Returns + ------- + pd.Series + Boolean Series indicating valid entries + """ + if not code_table: + logging.error(f"Code table not found for element {series.name}") + return pd.Series(False, index=series.index) -def _mask_boolean(x, boolean) -> bool: - x = convert_str_boolean(x) - if x is boolean: - return True - return False + keys = set(code_table) + dtype = properties.pandas_dtypes.get(column_type, object) + converted = series.astype(dtype) + as_str = converted.astype(str) + return converted.isna() | as_str.isin(keys) def validate( - data, - imodel, - ext_table_path, - schema, - disables=None, + data: pd.DataFrame, + imodel: str, + ext_table_path: str, + attributes: dict[str, dict[str, Any]], + disables: list[str] | None = None, ) -> pd.DataFrame: - """Validate data. + """ + Validate a pandas DataFrame according to a data model and code tables. + + Each column is validated based on its `column_type` attribute. Supports: + - Numeric types: checked against valid_min and valid_max + - Keys: checked against a code table + - Datetime and string: validated using simple validators + - Explicit boolean literals ("True"/"False") override column validation Parameters ---------- - data: pd.DataFrame - DataFrame for validation. - imodel: str - Name of internally available input data model. - e.g. icoads_r300_d704 - ext_table_path: str - Path to the code tables for an external data model - schema: dict - Data model schema. - disables: list, optional - List of column names to be ignored. + data : pd.DataFrame + Input data to validate. 
+ imodel : str + Name of the internal data model, e.g., 'icoads_r300_d704'. + ext_table_path : str + Path to external code tables for validation. + attributes : dict[str, dict] + Dictionary of column attributes (e.g., type, valid ranges, codetable). + disables : list[str], optional + Columns to skip during validation. Returns ------- pd.DataFrame - Validated boolean mask. + Boolean mask of the same shape as `data`. True indicates a valid entry. """ - logging.basicConfig( - format="%(levelname)s\t[%(asctime)s](%(filename)s)\t%(message)s", - level=logging.INFO, - datefmt="%Y%m%d %H:%M:%S", - filename=None, - ) - # Check input if not isinstance(data, pd.DataFrame): logging.error("input data must be a pandas DataFrame.") - return + return None - mask = pd.DataFrame(index=data.index, columns=data.columns, dtype="boolean") + mask = pd.DataFrame(pd.NA, index=data.index, columns=data.columns, dtype="boolean") if data.empty: return mask - # Get the data elements from the input data: might be just a subset of - # data model and flatten the schema to get a simple and sequential list - # of elements included in the input data - elements = [x for x in data if x not in disables] - element_atts = schemas.df_schema(elements, schema) - - # See what elements we need to validate - numeric_elements = _get_elements(elements, element_atts, "numeric_types") - datetime_elements = _get_elements(elements, element_atts, "datetime") - coded_elements = _get_elements(elements, element_atts, "key") - str_elements = _get_elements(elements, element_atts, "str") - - if _element_tuples(numeric_elements, datetime_elements, coded_elements): - validated_columns = pd.MultiIndex.from_tuples( - list(set(numeric_elements + coded_elements + datetime_elements)) - ) - else: - validated_columns = list( - set(numeric_elements + coded_elements + datetime_elements) - ) - - mask[numeric_elements] = validate_numeric(numeric_elements, data, element_atts) - - # 2. 
Table coded elements - # See following: in multiple keys code tables, the non parameter element, - # won't have a code_table attribute in the element_atts: - # So we need to check the code_table.keys files in addition to the element_atts - # Additionally, a YEAR key can fail in one table, but be compliant with anbother, then, how would we mask this? - # also, a YEAR defined as an integer, will undergo its own check..... - # So I think we need to check nested keys as a whole, and mask only the actual parameterized element: - # Get the full list of keys combinations (tuples, triplets...) and check the column combination against that: if it fails, mark the element! - # Need to see how to grab the YEAR part of a datetime when YEAR comes from a datetime element - # pd.DatetimeIndex(df['_datetime']).year - if len(coded_elements) > 0: - mask[coded_elements] = validate_codes( - coded_elements, - data, - element_atts, - imodel, - ext_table_path, - ) - - # 3. Datetime elements - mask[datetime_elements] = validate_datetime(datetime_elements, data) - - # 4. str elements - mask[str_elements] = validate_str(str_elements, data) - - # 5. 
Set False values - mask[validated_columns] = mask[validated_columns].mask( - data[validated_columns].map(_mask_boolean, boolean=False), - False, - ) - - mask[validated_columns] = mask[validated_columns].mask( - data[validated_columns].map(_mask_boolean, boolean=True), - True, - ) - - mask[disables] = np.nan + disables = disables or [] + elements = [col for col in data.columns if col not in disables] + element_atts = { + element: attributes[element] for element in elements if element in attributes + } + + validated_columns = [] + validated_dtypes = set(properties.numeric_types) | {"datetime", "key"} + + basic_functions = { + "datetime": validate_datetime, + "str": validate_str, + } + + for column in data.columns: + if column in disables or column not in attributes: + continue + + series = data[column] + column_atts = element_atts.get(column, {}) + column_type = column_atts.get("column_type") + + if column_type in properties.numeric_types: + valid_min = column_atts.get("valid_min", -np.inf) + valid_max = column_atts.get("valid_max", np.inf) + column_mask = validate_numeric(series, valid_min, valid_max) + elif column_type == "key": + code_table_name = column_atts.get("codetable") + code_table = codes.read_table( + code_table_name, imodel=imodel, ext_table_path=ext_table_path + ) + column_mask = validate_codes(series, code_table, column_type) + elif column_type in basic_functions: + column_mask = basic_functions[column_type](series) + else: + logging.warning( + f"Unknown column_type '{column_type}' for column '{column}'" + ) + continue + + mask[column] = column_mask + if column_type in validated_dtypes: + validated_columns.append(column) + + # Explicit boolean literals ("True"/"False") override validation results + if validated_columns: + validated_columns = list(dict.fromkeys(validated_columns)) + to_bool = data[validated_columns].applymap(convert_str_boolean) + false_mask = to_bool.applymap(_is_false) + true_mask = to_bool.applymap(_is_true) + mask[validated_columns] 
= mask[validated_columns].mask(false_mask, False) + mask[validated_columns] = mask[validated_columns].mask(true_mask, True) + return mask.astype("boolean") diff --git a/cdm_reader_mapper/mdf_reader/writer.py b/cdm_reader_mapper/mdf_reader/writer.py index 9dabc272..a2d45fcf 100755 --- a/cdm_reader_mapper/mdf_reader/writer.py +++ b/cdm_reader_mapper/mdf_reader/writer.py @@ -5,33 +5,21 @@ import json import logging from io import StringIO as StringIO +from pathlib import Path import pandas as pd -from cdm_reader_mapper.common import get_filename -from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy +from .utils.utilities import join, update_column_names, update_dtypes - -def _update_dtypes(dtypes, columns) -> dict: - if isinstance(dtypes, dict): - dtypes = {k: v for k, v in dtypes.items() if k in columns} - return dtypes - - -def _update_col_names(dtypes, col_o, col_n) -> str | dict: - if isinstance(dtypes, str): - return dtypes - if col_o in dtypes.keys(): - dtypes[col_n] = dtypes[col_o] - del dtypes[col_o] - return dtypes +from ..common import get_filename +from ..common.pandas_TextParser_hdlr import make_copy def write_data( data, mask=None, - dtypes={}, - parse_dates=False, + dtypes: dict | None = None, + parse_dates: list | bool = False, encoding="utf-8", out_dir=".", prefix=None, @@ -100,30 +88,29 @@ def write_data( ---- Use this function after reading MDF data. 
""" - - def _join(col): - if isinstance(col, (list, tuple)): - return ":".join(col) - return col + dtypes = dtypes or {} + if isinstance(parse_dates, bool): + parse_dates = [] if not isinstance(data, pd.io.parsers.TextFileReader): - data = [data] + data_list = [data] else: - data = make_copy(data) + data_list = make_copy(data) if mask is None: mask = pd.DataFrame() if not isinstance(mask, pd.io.parsers.TextFileReader): - mask = [mask] + mask_list = [mask] else: - mask = make_copy(mask) + mask_list = make_copy(mask) - info = {} - info["dtypes"] = dtypes - info["parse_dates"] = [_join(parse_date) for parse_date in parse_dates] + info = {"dtypes": dtypes.copy(), "parse_dates": [join(p) for p in parse_dates]} logging.info(f"WRITING DATA TO FILES IN: {out_dir}") + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + filename_data = get_filename( [prefix, "data", suffix], path=out_dir, extension=extension ) @@ -133,37 +120,40 @@ def _join(col): filename_info = get_filename( [prefix, "info", suffix], path=out_dir, extension="json" ) - for i, (data_df, mask_df) in enumerate(zip(data, mask)): + + for i, (data_df, mask_df) in enumerate(zip(data_list, mask_list)): if col_subset is not None: data_df = data_df[col_subset] mask_df = mask_df[col_subset] - header = False - mode = "a" + + if isinstance(data_df, pd.Series): + data_df = data_df.to_frame() + if isinstance(mask_df, pd.Series): + mask_df = mask_df.to_frame() + + mode = "w" if i == 0 else "a" + header = [join(c) for c in data_df.columns] if i == 0 else False + if i == 0: - mode = "w" - header = [] - info["dtypes"] = _update_dtypes(info["dtypes"], data_df.columns) + info["dtypes"] = update_dtypes(info["dtypes"], data_df.columns) for col in data_df.columns: - col_ = _join(col) - header.append(col_) - info["dtypes"] = _update_col_names(info["dtypes"], col, col_) + info["dtypes"] = update_column_names(info["dtypes"], col, join(col)) - info["parse_dates"] = [ - parse_date for parse_date in 
info["parse_dates"] if parse_date in header - ] + info["parse_dates"] = [p for p in info["parse_dates"] if p in header] info["encoding"] = encoding - kwargs = { - "header": header, - "mode": mode, - "encoding": encoding, - "index": False, - "sep": delimiter, - } - data_df.to_csv(filename_data, **kwargs) + csv_kwargs = dict( + header=header, + mode=mode, + index=False, + sep=delimiter, + encoding=encoding, + **kwargs, + ) + + data_df.to_csv(filename_data, **csv_kwargs) if not mask_df.empty: - mask_df.to_csv(filename_mask, **kwargs) + mask_df.to_csv(filename_mask, **csv_kwargs) - if info: - with open(filename_info, "w") as fileObj: - json.dump(info, fileObj, indent=4) + with open(filename_info, "w") as fileObj: + json.dump(info, fileObj, indent=4) diff --git a/cdm_reader_mapper/metmetpy/correct.py b/cdm_reader_mapper/metmetpy/correct.py index 66901f24..85f85603 100755 --- a/cdm_reader_mapper/metmetpy/correct.py +++ b/cdm_reader_mapper/metmetpy/correct.py @@ -64,8 +64,8 @@ import pandas as pd -from cdm_reader_mapper.common import logging_hdlr, pandas_TextParser_hdlr -from cdm_reader_mapper.common.json_dict import collect_json_files, combine_dicts +from ..common import logging_hdlr, pandas_TextParser_hdlr +from ..common.json_dict import collect_json_files, combine_dicts from . import properties from .datetime import correction_functions as corr_f_dt diff --git a/cdm_reader_mapper/metmetpy/validate.py b/cdm_reader_mapper/metmetpy/validate.py index 87640116..f8180a02 100755 --- a/cdm_reader_mapper/metmetpy/validate.py +++ b/cdm_reader_mapper/metmetpy/validate.py @@ -61,8 +61,8 @@ import pandas as pd -from cdm_reader_mapper.common import logging_hdlr, pandas_TextParser_hdlr -from cdm_reader_mapper.common.json_dict import collect_json_files, combine_dicts +from ..common import logging_hdlr, pandas_TextParser_hdlr +from ..common.json_dict import collect_json_files, combine_dicts from . 
import properties from .datetime import model_datetimes diff --git a/tests/test_mdf_reader.py b/tests/test_mdf_reader.py index 8c194735..59c75f50 100755 --- a/tests/test_mdf_reader.py +++ b/tests/test_mdf_reader.py @@ -2,14 +2,24 @@ import os +import numpy as np import pandas as pd import pytest -from cdm_reader_mapper import test_data +from cdm_reader_mapper import test_data, DataBundle from cdm_reader_mapper.mdf_reader.reader import ( read_mdf, read_data, + validate_read_mdf_args, ) +from cdm_reader_mapper.mdf_reader.utils.filereader import _apply_multiindex + + +def _get_columns(columns, select): + if isinstance(columns, pd.MultiIndex): + return columns.get_level_values(0).isin(select) + mask = [(type(c) is tuple and c[0] in select) or (c in select) for c in columns] + return np.array(mask) def _drop_rows(df, drops): @@ -37,19 +47,22 @@ def _read_mdf_test_data(data_model, select=None, drop=None, drop_idx=None, **kwa result.mask = result.mask.read() if select: - expected.data = expected.data[select] - expected.mask = expected.mask[select] + selected = _get_columns(expected.data.columns, select) + expected.data = expected.data.loc[:, selected] + expected.mask = expected.mask.loc[:, selected] if drop: - result.data = result.data.drop(columns=drop) - result.mask = result.mask.drop(columns=drop) - expected.data = expected.data.drop(columns=drop) - expected.mask = expected.mask.drop(columns=drop) + unselected = _get_columns(expected.data.columns, drop) + expected.data = expected.data.loc[:, ~unselected] + expected.mask = expected.mask.loc[:, ~unselected] if drop_idx: expected.data = _drop_rows(expected.data, drop_idx) expected.mask = _drop_rows(expected.mask, drop_idx) + expected.data = _apply_multiindex(expected.data) + expected.mask = _apply_multiindex(expected.mask) + pd.testing.assert_frame_equal(result.data, expected.data) pd.testing.assert_frame_equal(result.mask, expected.mask) @@ -78,7 +91,7 @@ def _read_mdf_test_data(data_model, select=None, drop=None, 
drop_idx=None, **kwa "gdac", ], ) -def test_read_mdf_test_data(data_model): +def test_read_mdf_test_data_basic(data_model): _read_mdf_test_data(data_model) @@ -137,19 +150,38 @@ def test_read_mdf_test_data_kwargs(data_model, kwargs): "data_model, kwargs, select", [ ("icoads_r300_d714", {"sections": ["c99"], "chunksize": 3}, ["c99"]), + ("icoads_r300_d714", {"sections": ["c99"]}, ["c99"]), + ("icoads_r300_d714", {"sections": "c99"}, ["c99"]), ( "icoads_r300_d714", - {"sections": ["core", "c99"], "chunksize": 3}, + {"sections": ["core", "c99"]}, ["core", "c99"], ), + ("craid", {"sections": ["drifter_measurements"]}, ["drifter_measurements"]), ], ) def test_read_mdf_test_data_select(data_model, kwargs, select): _read_mdf_test_data(data_model, select=select, **kwargs) -def test_read_mdf_test_data_drop(): - _read_mdf_test_data("icoads_r300_mixed", drop=["c99"], encoding="cp1252") +@pytest.mark.parametrize( + "data_model, kwargs, drop", + [ + ("icoads_r300_d714", {"excludes": ["c98"]}, ["c98"]), + ("icoads_r300_d714", {"excludes": "c98"}, ["c98"]), + ("icoads_r300_d714", {"excludes": ["c5", "c98"]}, ["c5", "c98"]), + ("icoads_r300_mixed", {"excludes": ["c99"], "encoding": "cp1252"}, ["c99"]), + ("icoads_r300_mixed", {"excludes": "c99", "encoding": "cp1252"}, ["c99"]), + ( + "craid", + {"excludes": ["drifter_measurements", "drifter_history"]}, + ["drifter_measurements", "drifter_history"], + ), + ("gdac", {"excludes": "AAAA"}, ["AAAA"]), + ], +) +def test_read_mdf_test_data_exclude(data_model, kwargs, drop): + _read_mdf_test_data(data_model, drop=drop, **kwargs) @pytest.mark.parametrize( @@ -168,3 +200,278 @@ def test_read_mdf_test_data_drop(): ) def test_read_mdf_test_data_drop_idx(data_model, kwargs, drop_idx): _read_mdf_test_data(data_model, drop_idx=drop_idx, **kwargs) + + +def test_read_data_basic(): + data_model = "icoads_r300_d721" + data = test_data[f"test_{data_model}"]["mdf_data"] + mask = test_data[f"test_{data_model}"]["mdf_mask"] + info = 
test_data[f"test_{data_model}"]["mdf_info"] + db = read_data(data, mask, info) + + assert isinstance(db, DataBundle) + + for attr in [ + "data", + "mask", + "columns", + "dtypes", + "parse_dates", + "encoding", + "imodel", + "mode", + ]: + assert hasattr(db, attr) + + assert isinstance(db.data, pd.DataFrame) + assert isinstance(db.mask, pd.DataFrame) + assert isinstance(db.columns, pd.MultiIndex) + assert isinstance(db.dtypes, dict) + assert isinstance(db.parse_dates, list) + assert isinstance(db.encoding, str) + assert db.encoding == "cp1252" + assert db.imodel is None + assert isinstance(db.mode, str) + assert db.mode == "data" + assert len(db) == 5 + assert db.shape == (5, 341) + assert db.size == 1705 + + +def test_read_data_no_mask(): + data_model = "icoads_r300_d721" + data = test_data[f"test_{data_model}"]["mdf_data"] + info = test_data[f"test_{data_model}"]["mdf_info"] + db = read_data(data, info=info) + + assert isinstance(db, DataBundle) + + for attr in [ + "data", + "mask", + "columns", + "dtypes", + "parse_dates", + "encoding", + "imodel", + "mode", + ]: + assert hasattr(db, attr) + + assert isinstance(db.data, pd.DataFrame) + assert isinstance(db.mask, pd.DataFrame) + assert isinstance(db.columns, pd.MultiIndex) + assert isinstance(db.dtypes, dict) + assert isinstance(db.parse_dates, list) + assert isinstance(db.encoding, str) + assert db.encoding == "cp1252" + assert db.imodel is None + assert isinstance(db.mode, str) + assert db.mode == "data" + assert len(db) == 5 + assert db.shape == (5, 341) + assert db.size == 1705 + + +def test_read_data_no_info(): + data_model = "icoads_r300_d721" + data = test_data[f"test_{data_model}"]["mdf_data"] + + db = read_data(data) + + assert isinstance(db, DataBundle) + + for attr in [ + "data", + "mask", + "columns", + "dtypes", + "parse_dates", + "encoding", + "imodel", + "mode", + ]: + assert hasattr(db, attr) + + assert isinstance(db.data, pd.DataFrame) + assert isinstance(db.mask, pd.DataFrame) + assert 
isinstance(db.columns, pd.MultiIndex) + assert db.dtypes == "object" + assert db.parse_dates is False + assert db.encoding is None + assert db.imodel is None + assert isinstance(db.mode, str) + assert db.mode == "data" + assert len(db) == 5 + assert db.shape == (5, 341) + assert db.size == 1705 + + +def test_read_data_col_subset(): + data_model = "icoads_r300_d721" + data = test_data[f"test_{data_model}"]["mdf_data"] + info = test_data[f"test_{data_model}"]["mdf_info"] + db = read_data(data, info=info, col_subset="core") + + assert isinstance(db, DataBundle) + + for attr in [ + "data", + "mask", + "columns", + "dtypes", + "parse_dates", + "encoding", + "imodel", + "mode", + ]: + assert hasattr(db, attr) + + assert isinstance(db.data, pd.DataFrame) + assert isinstance(db.mask, pd.DataFrame) + assert isinstance(db.columns, pd.Index) + assert isinstance(db.dtypes, dict) + assert isinstance(db.parse_dates, list) + assert isinstance(db.encoding, str) + assert db.encoding == "cp1252" + assert db.imodel is None + assert isinstance(db.mode, str) + assert db.mode == "data" + assert len(db) == 5 + assert db.shape == (5, 48) + assert db.size == 240 + + +def test_read_data_encoding(): + data_model = "icoads_r300_d721" + data = test_data[f"test_{data_model}"]["mdf_data"] + db = read_data(data, encoding="cp1252") + + assert isinstance(db, DataBundle) + + for attr in [ + "data", + "mask", + "columns", + "dtypes", + "parse_dates", + "encoding", + "imodel", + "mode", + ]: + assert hasattr(db, attr) + + assert isinstance(db.data, pd.DataFrame) + assert isinstance(db.mask, pd.DataFrame) + assert isinstance(db.columns, pd.Index) + assert db.dtypes == "object" + assert db.parse_dates is False + assert isinstance(db.encoding, str) + assert db.encoding == "cp1252" + assert db.imodel is None + assert isinstance(db.mode, str) + assert db.mode == "data" + assert len(db) == 5 + assert db.shape == (5, 341) + assert db.size == 1705 + + +def test_validate_read_mdf_args_pass(tmp_path): + source 
= tmp_path / "file.mdf" + source.touch() + + validate_read_mdf_args( + source=source, + imodel=object(), + ext_schema_path=None, + ext_schema_file=None, + year_init=2000, + year_end=2020, + chunksize=100, + skiprows=0, + ) + + +def test_validate_read_mdf_args_invalid_source(tmp_path): + with pytest.raises(FileNotFoundError): + validate_read_mdf_args( + source=tmp_path / "missing.mdf", + imodel=object(), + ext_schema_path=None, + ext_schema_file=None, + year_init=None, + year_end=None, + chunksize=None, + skiprows=0, + ) + + +def test_validate_read_mdf_args_missing_all_sources(tmp_path): + source = tmp_path / "file.mdf" + source.touch() + + with pytest.raises( + ValueError, + match="One of imodel or ext_schema_path/ext_schema_file must be provided", + ): + validate_read_mdf_args( + source=source, + imodel=None, + ext_schema_path=None, + ext_schema_file=None, + year_init=None, + year_end=None, + chunksize=None, + skiprows=0, + ) + + +def test_validate_read_mdf_args_invalid_chunksize(tmp_path): + source = tmp_path / "file.mdf" + source.touch() + + with pytest.raises(ValueError, match="chunksize must be a positive integer"): + validate_read_mdf_args( + source=source, + imodel=object(), + ext_schema_path=None, + ext_schema_file=None, + year_init=None, + year_end=None, + chunksize=0, + skiprows=0, + ) + + +def test_validate_read_mdf_args_invalid_skiprows(tmp_path): + source = tmp_path / "file.mdf" + source.touch() + + with pytest.raises(ValueError, match="skiprows must be >= 0"): + validate_read_mdf_args( + source=source, + imodel=object(), + ext_schema_path=None, + ext_schema_file=None, + year_init=None, + year_end=None, + chunksize=None, + skiprows=-1, + ) + + +def test_validate_read_mdf_args_invalid_years(tmp_path): + source = tmp_path / "file.mdf" + source.touch() + + with pytest.raises(ValueError, match="year_init must be <= year_end"): + validate_read_mdf_args( + source=source, + imodel=object(), + ext_schema_path=None, + ext_schema_file=None, + year_init=2021, + 
year_end=2020, + chunksize=None, + skiprows=0, + ) diff --git a/tests/test_mdf_writer.py b/tests/test_mdf_writer.py new file mode 100755 index 00000000..fc0a4ca1 --- /dev/null +++ b/tests/test_mdf_writer.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +import json + +import pandas as pd +import pytest # noqa + +from pandas.testing import assert_frame_equal + +from cdm_reader_mapper.mdf_reader.writer import ( + write_data, +) + + +def test_write_data_basic(tmp_path): + data = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["1", "2", "3"], + } + ) + mask = pd.DataFrame( + { + "A": [True, True, False], + "B": [False, True, True], + } + ) + info = { + "dtypes": {"A": "int", "B": "str"}, + "parse_dates": [], + "encoding": "utf-8", + } + + write_data( + data, + mask=mask, + out_dir=tmp_path, + prefix="test_write", + suffix="basic", + **info, + ) + + data_file = tmp_path / "test_write-data-basic.csv" + mask_file = tmp_path / "test_write-mask-basic.csv" + info_file = tmp_path / "test_write-info-basic.json" + + assert data_file.is_file() + assert mask_file.is_file() + assert info_file.is_file() + + with open(info_file) as read_file: + info_res = json.load(read_file) + + assert info_res == info + + data_res = pd.read_csv(data_file, dtype=info["dtypes"]) + assert_frame_equal(data, data_res) + + mask_res = pd.read_csv(mask_file, dtype="bool") + assert_frame_equal(mask, mask_res) + + +def test_write_data_col_subset(tmp_path): + data = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["1", "2", "3"], + } + ) + mask = pd.DataFrame( + { + "A": [True, True, False], + "B": [False, True, True], + } + ) + info = { + "dtypes": {"A": "int"}, + "parse_dates": [], + "encoding": "utf-8", + } + subset = "A" + + write_data( + data, + mask=mask, + out_dir=tmp_path, + prefix="test_write", + suffix="subset", + col_subset=subset, + **info, + ) + + data_file = tmp_path / "test_write-data-subset.csv" + mask_file = tmp_path / "test_write-mask-subset.csv" + info_file = tmp_path / 
"test_write-info-subset.json" + + assert data_file.is_file() + assert mask_file.is_file() + assert info_file.is_file() + + with open(info_file) as read_file: + info_res = json.load(read_file) + + assert info_res == info + + data_res = pd.read_csv(data_file, dtype=info["dtypes"]) + assert_frame_equal(data[[subset]], data_res) + + mask_res = pd.read_csv(mask_file, dtype="bool") + assert_frame_equal(mask[[subset]], mask_res) diff --git a/tests/test_reader_codes.py b/tests/test_reader_codes.py new file mode 100755 index 00000000..66b6c5ed --- /dev/null +++ b/tests/test_reader_codes.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +import pytest +from pathlib import Path +import json + +from cdm_reader_mapper.mdf_reader.codes.codes import read_table + + +@pytest.fixture +def tmp_json_file(tmp_path: Path) -> tuple[Path, dict]: + """Create a temporary JSON file and return path and data.""" + data = {"A": {"value": 1}, "B": {"value": 2}} + file_path = tmp_path / "test_table.json" + file_path.write_text(json.dumps(data), encoding="utf-8") + return file_path, data + + +def test_read_table_with_imodel(): + result = read_table("ICOADS.C99.SEALUMI", imodel="icoads_r300_d781") + assert isinstance(result, dict) + assert result == {"0": "no", "1": "yes", "9": "missing", "8": "unknown"} + + +def test_read_table_with_external_file(tmp_json_file): + file_path, expected_data = tmp_json_file + result = read_table("test_table", ext_table_path=str(file_path.parent)) + assert isinstance(result, dict) + assert result == expected_data + + +def test_read_table_with_missing_file(): + with pytest.raises(FileNotFoundError): + read_table("nonexistent_table", ext_table_path="tmp") + + +def test_read_table_requires_input(): + with pytest.raises(ValueError): + read_table("table_without_path_or_model") diff --git a/tests/test_reader_convert_and_decode.py b/tests/test_reader_convert_and_decode.py new file mode 100755 index 00000000..56c3c170 --- /dev/null +++ 
b/tests/test_reader_convert_and_decode.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +import pandas as pd +import pytest + +from decimal import Decimal + +from cdm_reader_mapper.mdf_reader.utils.convert_and_decode import ( + max_decimal_places, + to_numeric, + Decoders, + Converters, + convert_and_decode, +) +from cdm_reader_mapper.mdf_reader import properties + + +@pytest.fixture +def sample_series(): + return pd.Series(["A", "Z", "10", "1Z"]) + + +@pytest.fixture +def numeric_series(): + return pd.Series(["1", "2 ", "3", "False", "bad"], dtype="object", name="NUM") + + +@pytest.fixture +def sample_df(): + return pd.DataFrame( + { + "NUM": ["1", "2 ", "3", "False", "bad"], # object type + "KEY": ["a", "b", "c", "d", "e"], # for decoder + } + ) + + +def test_max_decimal_places(): + assert max_decimal_places(Decimal("1"), Decimal("2.34")) == 2 + assert max_decimal_places(Decimal("1.200"), Decimal("3.4")) == 3 + assert max_decimal_places(Decimal("5")) == 0 + + +@pytest.mark.parametrize( + "value, scale, offset, expected", + [ + ("10", Decimal("0.1"), Decimal("0"), Decimal("1.0")), + ("10", Decimal("1"), Decimal("5"), Decimal("15")), + ("3.5", Decimal("2"), Decimal("1.00"), Decimal("8.00")), + (" 2 ", Decimal("1"), Decimal("0"), Decimal("2")), + ("", Decimal("1"), Decimal("0"), False), + ("abc", Decimal("1"), Decimal("0"), False), + ], +) +def test_to_numeric_valid(value, scale, offset, expected): + assert to_numeric(value, scale, offset) == expected + + +def test_to_numeric_boolean_passthrough(): + assert to_numeric(True, Decimal("1"), Decimal("0")) is True + assert to_numeric(False, Decimal("1"), Decimal("0")) is False + + +def test_to_numeric_space_replacement(): + assert to_numeric("1 2", Decimal("1"), Decimal("0")) == Decimal("102") + + +def test_to_numeric_precision_preserved(): + result = to_numeric("1.234", Decimal("0.1"), Decimal("0.00")) + assert result == Decimal("0.123") + + +def test_base36_decoding_basic(sample_series): + dec = 
Decoders(dtype="key") + decoder = dec.decoder() + + result = decoder(sample_series) + + assert list(result) == ["10", "35", "36", "71"] + + +def test_base36_preserves_boolean(): + series = pd.Series(["True", "False", "A"]) + dec = Decoders(dtype="key") + + result = dec.decoder()(series) + + assert result.tolist() == [True, False, "10"] + + +def test_converter_numeric(numeric_series): + conv = Converters(dtype=next(iter(properties.numeric_types))) + func = conv.converter() + + result = func(numeric_series) + + assert result.iloc[0] == Decimal("1") + assert result.iloc[1] == Decimal("2") + assert result.iloc[2] == Decimal("3") + assert result.iloc[3] is False + assert result.iloc[4] is False + + +def test_numeric_with_scale_offset(): + conv = Converters(dtype="float") + series = pd.Series(["1", "2"]) + + result = conv.object_to_numeric(series, scale=10, offset=5) + + assert result.tolist() == [Decimal("15"), Decimal("25")] + + +def test_preprocessing_function_pppp(): + conv = Converters(dtype=next(iter(properties.numeric_types))) + series = pd.Series(["0123"], name="PPPP") + + result = conv.object_to_numeric(series) + + assert result.iloc[0] == Decimal("10123") + + +def test_object_to_object_strip(): + conv = Converters(dtype="object") + series = pd.Series([" a ", "", " ", "b"]) + + result = conv.object_to_object(series) + + assert result.tolist() == ["a", None, None, "b"] + + +def test_object_to_object_disable_strip(): + conv = Converters(dtype="object") + series = pd.Series([" a ", "b "]) + + result = conv.object_to_object(series, disable_white_strip="l") + + assert result.tolist() == [" a", "b"] + + +def test_object_to_datetime(): + conv = Converters(dtype="datetime") + series = pd.Series(["20240101", "bad"]) + + result = conv.object_to_datetime(series) + + assert pd.notna(result.iloc[0]) + assert pd.isna(result.iloc[1]) + + +def test_unknown_dtype_raises(): + with pytest.raises(KeyError): + Converters("unknown").converter() + + +def 
test_convert_and_decode_basic(): + df = pd.DataFrame({"A": ["1", "2", "3"], "B": ["x", "y", "z"]}) + + converter_dict = { + "A": lambda s: s.apply(lambda x: Decimal(x) * 2), + "B": lambda s: s.str.upper(), + } + converter_kwargs = {"A": {}, "B": {}} + + decoder_dict = {"A": lambda s: s.apply(lambda x: str(int(x) + 1))} + + out = convert_and_decode( + df.copy(), + convert_flag=True, + decode_flag=True, + converter_dict=converter_dict, + converter_kwargs=converter_kwargs, + decoder_dict=decoder_dict, + ) + + assert out["A"].iloc[0] == Decimal(4) + assert out["A"].iloc[1] == Decimal(6) + assert out["B"].iloc[0] == "X" + + +def test_convert_and_decode_with_converters_and_decoders(sample_df): + df = sample_df.copy() + + conv = Converters(dtype="int") + converter_dict = {"NUM": conv.converter()} + converter_kwargs = {"NUM": {}} + + dec = Decoders(dtype="key") + decoder_dict = {"KEY": dec.decoder()} + + out = convert_and_decode( + df, + convert_flag=True, + decode_flag=True, + converter_dict=converter_dict, + converter_kwargs=converter_kwargs, + decoder_dict=decoder_dict, + ) + + expected_nums = [Decimal("1"), Decimal("2"), Decimal("3"), False, False] + for i, val in enumerate(expected_nums): + assert out["NUM"].iloc[i] == val + + expected_keys = ["10", "11", "12", "13", "14"] + for i, val in enumerate(expected_keys): + assert out["KEY"].iloc[i] == val diff --git a/tests/test_reader_filereader.py b/tests/test_reader_filereader.py new file mode 100755 index 00000000..89badf3a --- /dev/null +++ b/tests/test_reader_filereader.py @@ -0,0 +1,353 @@ +from __future__ import annotations + +import pytest + +import pandas as pd +import xarray as xr + +from io import StringIO + +from pandas.io.parsers import TextFileReader +from pandas.testing import assert_frame_equal, assert_index_equal + +from cdm_reader_mapper import DataBundle + +from cdm_reader_mapper.mdf_reader.utils.parser import OrderSpec, ParserConfig + +from cdm_reader_mapper.mdf_reader.utils.filereader import ( + 
_apply_or_chunk, + _merge_kwargs, + _apply_multiindex, + _select_years, + FileReader, +) + + +def f(x, y): + return x + y + + +def test_merge_kwargs_success(): + out = _merge_kwargs({"a": 1}, {"b": 2}) + assert out == {"a": 1, "b": 2} + + +def test_merge_kwargs_duplicate_key(): + with pytest.raises(ValueError): + _merge_kwargs({"a": 1}, {"a": 2}) + + +def test_apply_multiindex_no_tuples(): + df = pd.DataFrame({"a": [1], "b": [2]}) + out = _apply_multiindex(df) + assert out.columns.equals(df.columns) + + +def test_apply_multiindex_with_tuples(): + df = pd.DataFrame({("core", "YR"): [2010], ("core", "MO"): [7]}) + out = _apply_multiindex(df) + assert isinstance(out.columns, pd.MultiIndex) + assert out.columns.tolist() == [("core", "YR"), ("core", "MO")] + + +def test_select_years_no_selection(): + df = pd.DataFrame({"YR": [2000, 2001]}) + out = _select_years(df, (None, None), "YR") + pd.testing.assert_frame_equal(out, df) + + +def test_select_years_range(): + df = pd.DataFrame({"YR": [1999, 2000, 2001, 2002]}) + out = _select_years(df, (2000, 2001), "YR") + assert out["YR"].tolist() == [2000, 2001] + + +def test_select_years_handles_non_numeric(): + df = pd.DataFrame({"YR": ["2000", "bad", "2001"]}) + out = _select_years(df, (2000, 2001), "YR") + assert out["YR"].tolist() == ["2000", "2001"] + + +def test_apply_or_chunk_dataframe(): + df = pd.DataFrame({"test": [1, 2, 3, 4]}) + out = _apply_or_chunk(df, f, func_args=[2]) + assert isinstance(out, pd.DataFrame) + assert_frame_equal(out, pd.DataFrame({"test": [3, 4, 5, 6]})) + + +def test_apply_or_chunk_textfilereader(): + buffer = StringIO("test\n1\n2\n3\n4") + read_kwargs = {"chunksize": 2} + reader = pd.read_csv(buffer, **read_kwargs) + (out,) = _apply_or_chunk(reader, f, func_args=[2], read_kwargs=read_kwargs) + assert isinstance(out, TextFileReader) + assert_frame_equal(out.read(), pd.DataFrame({"test": [3, 4, 5, 6]})) + + +@pytest.fixture +def dtypes(): + return { + ("core", "YR"): "Int64", + ("core", "MO"): 
"Int64", + ("core", "DY"): "Int64", + ("core", "HR"): "Int64", + } + + +@pytest.fixture +def fake_pandas_df(): + data = { + "0": [ + "2010 7 1 100", + "2010 7 2 200", + "2010 7 3 300", + ] + } + return pd.DataFrame(data) + + +@pytest.fixture +def fake_pandas_df_file(fake_pandas_df, tmp_path): + file_path = tmp_path / "fake_dataframe.csv" + fake_pandas_df.to_csv(file_path, header=False, index=False) + return file_path + + +@pytest.fixture +def fake_xr_dataset(): + return xr.Dataset( + { + "YR": ("time", [2010, 2010, 2010]), + "MO": ("time", [7, 7, 7]), + "DY": ("time", [1, 2, 3]), + "HR": ("time", [10, 20, 30]), + }, + coords={"time": [0, 1, 2]}, + attrs={"source": "fake"}, + ) + + +@pytest.fixture +def fake_xr_dataset_file(fake_xr_dataset, tmp_path): + file_path = tmp_path / "fake_dataset.nc" + fake_xr_dataset.to_netcdf(file_path) + return file_path + + +@pytest.fixture +def fake_out_dataset(dtypes): + data = { + ("core", "YR"): [2010, 2010, 2010], + ("core", "MO"): [7, 7, 7], + ("core", "DY"): [1, 2, 3], + ("core", "HR"): [10, 20, 30], + } + df = pd.DataFrame(data) + + for col, dtype in dtypes.items(): + df[col] = df[col].astype(dtype) + + return df + + +@pytest.fixture +def fake_config(dtypes): + order_specs = { + "core": OrderSpec( + header={"length": 12, "field_layout": "fixed_width"}, + elements={ + "YR": {"index": ("core", "YR"), "ignore": False, "field_length": 4}, + "MO": {"index": ("core", "MO"), "ignore": False, "field_length": 2}, + "DY": {"index": ("core", "DY"), "ignore": False, "field_length": 2}, + "HR": {"index": ("core", "HR"), "ignore": False, "field_length": 4}, + }, + is_delimited=False, + ) + } + return ParserConfig( + order_specs=order_specs, + disable_reads=[], + dtypes=dtypes, + parse_dates=[], + convert_decode={ + "converter_dict": {}, + "converter_kwargs": {}, + "decoder_dict": {}, + }, + validation={}, + encoding="utf-8", + ) + + +@pytest.fixture +def reader_pd(fake_config): + r = FileReader("icoads") + # override config for test + 
r.config = fake_config + return r + + +@pytest.fixture +def reader_xr(fake_config): + r = FileReader("craid") + # override config for test + r.config = fake_config + return r + + +def test_process_data_pandas(reader_pd, fake_pandas_df, fake_out_dataset): + data, mask, config = reader_pd._process_data( + fake_pandas_df, + convert_flag=False, + decode_flag=False, + converter_dict=None, + converter_kwargs=None, + decoder_dict=None, + validate_flag=False, + ext_table_path=None, + sections=None, + excludes=None, + config=reader_pd.config, + parse_mode="pandas", + ) + + assert isinstance(data, pd.DataFrame) + assert isinstance(mask, pd.DataFrame) + assert_index_equal(data.columns, mask.columns) + assert len(data) == len(mask) + + assert config.columns is not None + + assert_frame_equal(data, fake_out_dataset) + assert_index_equal(data.columns, config.columns) + + assert mask.all().all() + + +def test_process_data_netcdf(reader_xr, fake_xr_dataset, fake_out_dataset): + data, mask, config = reader_xr._process_data( + fake_xr_dataset, + convert_flag=False, + decode_flag=False, + converter_dict=None, + converter_kwargs=None, + decoder_dict=None, + validate_flag=False, + ext_table_path=None, + sections=None, + excludes=None, + config=reader_xr.config, + parse_mode="netcdf", + ) + + assert isinstance(data, pd.DataFrame) + assert isinstance(mask, pd.DataFrame) + assert_index_equal(data.columns, mask.columns) + assert len(data) == len(mask) + + assert config.columns is not None + + assert_frame_equal(data, fake_out_dataset) + assert_index_equal(data.columns, config.columns) + + assert mask.all().all() + + +def test_open_data_pandas(reader_pd, fake_pandas_df_file, fake_out_dataset): + data, mask, config = reader_pd.open_data( + fake_pandas_df_file, + open_with="pandas", + ) + assert isinstance(data, pd.DataFrame) + assert isinstance(mask, pd.DataFrame) + assert_index_equal(data.columns, mask.columns) + assert len(data) == len(mask) + + assert config.columns is not None + + 
assert_frame_equal(data, fake_out_dataset) + assert_index_equal(data.columns, config.columns) + + assert mask.all().all() + + +def test_open_data_netcdf(reader_xr, fake_xr_dataset_file, fake_out_dataset): + data, mask, config = reader_xr.open_data( + fake_xr_dataset_file, + open_with="netcdf", + ) + assert isinstance(data, pd.DataFrame) + assert isinstance(mask, pd.DataFrame) + assert_index_equal(data.columns, mask.columns) + assert len(data) == len(mask) + + assert config.columns is not None + + assert_frame_equal(data, fake_out_dataset) + assert_index_equal(data.columns, config.columns) + + assert mask.all().all() + + +def test_read_pandas(reader_pd, fake_pandas_df_file, dtypes, fake_out_dataset): + databundle = reader_pd.read( + fake_pandas_df_file, + ) + assert isinstance(databundle, DataBundle) + assert hasattr(databundle, "data") + assert hasattr(databundle, "mask") + assert hasattr(databundle, "columns") + assert hasattr(databundle, "dtypes") + assert hasattr(databundle, "parse_dates") + assert hasattr(databundle, "encoding") + assert hasattr(databundle, "imodel") + + data = databundle.data + mask = databundle.mask + + assert isinstance(data, pd.DataFrame) + assert isinstance(mask, pd.DataFrame) + assert_index_equal(data.columns, mask.columns) + assert len(data) == len(mask) + assert_frame_equal(data, fake_out_dataset) + + assert_index_equal(data.columns, databundle.columns) + + assert mask.all().all() + + assert databundle.dtypes == dtypes + assert databundle.parse_dates == [] + assert databundle.encoding == "utf-8" + assert databundle.imodel == reader_pd.imodel + + +def test_read_netcdf(reader_xr, fake_xr_dataset_file, dtypes, fake_out_dataset): + databundle = reader_xr.read( + fake_xr_dataset_file, + ) + assert isinstance(databundle, DataBundle) + assert hasattr(databundle, "data") + assert hasattr(databundle, "mask") + assert hasattr(databundle, "columns") + assert hasattr(databundle, "dtypes") + assert hasattr(databundle, "parse_dates") + assert 
hasattr(databundle, "encoding") + assert hasattr(databundle, "imodel") + + data = databundle.data + mask = databundle.mask + + assert isinstance(data, pd.DataFrame) + assert isinstance(mask, pd.DataFrame) + assert_index_equal(data.columns, mask.columns) + assert len(data) == len(mask) + assert_frame_equal(data, fake_out_dataset) + + assert_index_equal(data.columns, databundle.columns) + + assert mask.all().all() + + assert databundle.dtypes == dtypes + assert databundle.parse_dates == [] + assert databundle.encoding == "utf-8" + assert databundle.imodel == reader_xr.imodel diff --git a/tests/test_reader_parser.py b/tests/test_reader_parser.py new file mode 100755 index 00000000..b7f4e254 --- /dev/null +++ b/tests/test_reader_parser.py @@ -0,0 +1,596 @@ +from __future__ import annotations + +import pytest # noqa + +import logging + +import pandas as pd +import xarray as xr # noqa + +from pandas.testing import assert_frame_equal + +from types import MethodType + +from cdm_reader_mapper.mdf_reader.utils.parser import ( + _get_index, + _get_ignore, + _convert_dtype_to_default, + _parse_fixed_width, + _parse_delimited, + _parse_line, + parse_pandas, + parse_netcdf, # noqa + update_pd_config, + update_xr_config, + ParserConfig, + build_parser_config, +) + + +@pytest.fixture +def order_specs(): + return { + "core": { + "header": {}, + "elements": { + "YR": {"index": ("core", "YR"), "field_length": 4}, + "MO": {"index": ("core", "MO"), "field_length": 2}, + "DY": {"index": ("core", "DY"), "field_length": 2}, + "HR": {"index": ("core", "HR"), "field_length": 4}, + }, + "is_delimited": False, + }, + "c1": { + "header": {"sentinel": " 165"}, + "elements": { + "ATTI": {"index": ("c1", "ATTI"), "field_length": 2}, + "ATTL": {"index": ("c1", "ATTL"), "field_length": 2}, + "BSI": {"index": ("c1", "BSI"), "field_length": 1}, + }, + "is_delimited": False, + }, + "c5": { + "header": {"sentinel": " 594"}, + "elements": { + "ATTI": {"index": ("c5", "ATTI"), "field_length": 2}, + 
"ATTL": {"index": ("c5", "ATTL"), "field_length": 2}, + "OS": {"index": ("c5", "OS"), "field_length": 1}, + "OP": {"index": ("c5", "OP"), "field_length": 1}, + }, + "is_delimited": False, + }, + "c98": { + "header": {"sentinel": "9815"}, + "elements": { + "ATTI": {"index": ("c98", "ATTI"), "field_length": 2}, + "ATTL": {"index": ("c98", "ATTL"), "field_length": 2, "ignore": True}, + "UID": {"index": ("c98", "UID"), "field_length": 6}, + }, + "is_delimited": False, + }, + "c99_data": { + "header": {"delimiter": "}"}, + "elements": { + "control_No": {"index": ("c99_data", "control_No")}, + "name": {"index": ("c99_data", "name")}, + }, + "is_delimited": True, + }, + } + + +@pytest.fixture +def base_config_pd(): + return ParserConfig( + order_specs={}, + disable_reads=[], + dtypes={}, + parse_dates=[], + convert_decode={}, + validation={}, + encoding="utf-8", + columns=None, + ) + + +@pytest.fixture +def base_config_xr(): + return ParserConfig( + order_specs={ + "core": { + "elements": { + "TEMP": { + "index": ("core", "TEMP"), + "ignore": False, + }, + "PRES": { + "index": ("core", "PRES"), + "ignore": False, + }, + } + } + }, + disable_reads=[], + dtypes={}, + parse_dates=[], + convert_decode={}, + validation={ + ("core", "TEMP"): {"units": "__from_file__"}, + ("core", "PRES"): {"units": "__from_file__"}, + }, + encoding="utf-8", + columns=None, + ) + + +def test_get_index_single_length(): + assert _get_index("AT", "_SECTION_", 1) == "AT" + + +def test_get_index_multiple_length(): + assert _get_index("AT", "core", 2) == ("core", "AT") + + +@pytest.mark.parametrize( + "value, expected", + [ + (True, True), + (False, False), + ("true", True), + ("True", True), + ("1", True), + ("yes", True), + ("false", False), + ("0", False), + ("no", False), + ], +) +def test_get_ignore_string_and_bool_values(value, expected): + assert _get_ignore({"ignore": value}) is expected + + +def test_get_ignore_missing_key(): + assert _get_ignore({}) is False + + +def 
test_convert_dtype_none(): + assert _convert_dtype_to_default(None) is None + + +def test_convert_dtype_float(): + assert _convert_dtype_to_default("float") == "float" + + +def test_convert_dtype_int(): + assert _convert_dtype_to_default("int") == "Int64" + + +def test_convert_deprecated_float(caplog): + with caplog.at_level(logging.WARNING): + result = _convert_dtype_to_default("Float64") + assert result == "float" + assert "deprecated" in caplog.text + + +def test_convert_deprecated_int(caplog): + with caplog.at_level(logging.WARNING): + result = _convert_dtype_to_default("Int32") + assert result == "Int64" + assert "deprecated" in caplog.text + + +def test_convert_unknown_dtype(): + assert _convert_dtype_to_default("string") == "string" + + +@pytest.mark.parametrize( + "line, header, elements, exp_end, exp_out", + [ + ( + "2010 7 1 ", + {}, + { + "YR": {"index": ("core", "YR"), "field_length": 4}, + "MO": {"index": ("core", "MO"), "field_length": 2}, + "DY": {"index": ("core", "DY"), "field_length": 2}, + "HR": {"index": ("core", "HR"), "field_length": 4}, + }, + 12, + { + ("core", "YR"): "2010", + ("core", "MO"): " 7", + ("core", "DY"): " 1", + ("core", "HR"): True, + }, + ), + ( + " 165 ", + {"sentinel": " 165"}, + { + "ATTI": {"index": ("c1", "ATTI"), "field_length": 2}, + "ATTL": {"index": ("c1", "ATTL"), "field_length": 2}, + "BSI": {"index": ("c1", "BSI"), "field_length": 1}, + }, + 5, + { + ("c1", "ATTI"): " 1", + ("c1", "ATTL"): "65", + ("c1", "BSI"): True, + }, + ), + ( + "9815IS7NQU", + {"sentinel": " 594"}, + { + "ATTI": {"index": ("c5", "ATTI"), "field_length": 2}, + "ATTL": {"index": ("c5", "ATTL"), "field_length": 2}, + "OS": {"index": ("c5", "OS"), "field_length": 1}, + "OP": {"index": ("c5", "OP"), "field_length": 1}, + }, + 0, + { + ("c5", "ATTI"): False, + ("c5", "ATTL"): False, + ("c5", "OS"): False, + ("c5", "OP"): False, + }, + ), + ( + "9815IS7NQU", + {"sentinel": "9815"}, + { + "ATTI": {"index": ("c98", "ATTI"), "field_length": 2}, + 
"ATTL": {"index": ("c98", "ATTL"), "field_length": 2, "ignore": True}, + "UID": {"index": ("c98", "UID"), "field_length": 6}, + }, + 10, + { + ("c98", "ATTI"): "98", + ("c98", "UID"): "IS7NQU", + }, + ), + ], +) +def test_parse_fixed_width(line, header, elements, exp_end, exp_out): + out = {} + end = _parse_fixed_width( + line=line, + i=0, + header=header, + elements=elements, + sections=None, + excludes=set(), + out=out, + ) + + assert end == exp_end + assert out == exp_out + + +@pytest.mark.parametrize( + "sections, excludes, exp_out", + [ + ( + ["core"], + set(), + { + ("core", "YR"): "2010", + ("core", "MO"): " 7", + ("core", "DY"): " 1", + ("core", "HR"): True, + }, + ), + (["c1"], set(), {}), + (None, ["core"], {}), + ( + None, + ["c1"], + { + ("core", "YR"): "2010", + ("core", "MO"): " 7", + ("core", "DY"): " 1", + ("core", "HR"): True, + }, + ), + ], +) +def test_parse_fixed_width_kwargs(sections, excludes, exp_out): + out = {} + elements = { + "YR": {"index": ("core", "YR"), "field_length": 4}, + "MO": {"index": ("core", "MO"), "field_length": 2}, + "DY": {"index": ("core", "DY"), "field_length": 2}, + "HR": {"index": ("core", "HR"), "field_length": 4}, + } + end = _parse_fixed_width( + line="2010 7 1 ", + i=0, + header={}, + elements=elements, + sections=sections, + excludes=excludes, + out=out, + ) + + assert end == 12 + assert out == exp_out + + +def test_parse_delimited(): + line = "13615}Peder Aneus" + header = {"delimiter": "}"} + elements = { + "control_No": {"index": ("c99_data", "control_No")}, + "name": {"index": ("c99_data", "name")}, + } + out = {} + end = _parse_delimited( + line=line, + i=0, + header=header, + elements=elements, + sections=None, + excludes=set(), + out=out, + ) + + assert end == len(line) + assert out == { + ("c99_data", "control_No"): "13615", + ("c99_data", "name"): "Peder Aneus", + } + + +def test_parse_line(order_specs): + line = "2010 7 1 165 9815IS7NQU13615}Peder Aneus" + out = _parse_line( + line=line, + 
order_specs=order_specs, + sections=None, + excludes=set(), + ) + + assert out == { + ("core", "YR"): "2010", + ("core", "MO"): " 7", + ("core", "DY"): " 1", + ("core", "HR"): True, + ("c1", "ATTI"): " 1", + ("c1", "ATTL"): "65", + ("c1", "BSI"): True, + ("c5", "ATTI"): False, + ("c5", "ATTL"): False, + ("c5", "OS"): False, + ("c5", "OP"): False, + ("c98", "ATTI"): "98", + ("c98", "UID"): "IS7NQU", + ("c99_data", "control_No"): "13615", + ("c99_data", "name"): "Peder Aneus", + } + + +def test_parse_pandas(order_specs): + df = pd.DataFrame( + [ + "2010 7 1 165 9815IS7NQU13615}Peder Aneus", + "2010 7 20100 165 9815IS7NQU13615}Peder Aneus", + "2010 7 30200 165 9815IS7NQU13615}Peder Aneus", + ] + ) + out = parse_pandas( + df=df, + order_specs=order_specs, + ) + + data = { + ("core", "YR"): ["2010", "2010", "2010"], + ("core", "MO"): [" 7", " 7", " 7"], + ("core", "DY"): [" 1", " 2", " 3"], + ("core", "HR"): [True, "0100", "0200"], + ("c1", "ATTI"): [" 1", " 1", " 1"], + ("c1", "ATTL"): ["65", "65", "65"], + ("c1", "BSI"): [True, True, True], + ("c5", "ATTI"): [False, False, False], + ("c5", "ATTL"): [False, False, False], + ("c5", "OS"): [False, False, False], + ("c5", "OP"): [False, False, False], + ("c98", "ATTI"): ["98", "98", "98"], + ("c98", "UID"): ["IS7NQU", "IS7NQU", "IS7NQU"], + ("c99_data", "control_No"): ["13615", "13615", "13615"], + ("c99_data", "name"): ["Peder Aneus", "Peder Aneus", "Peder Aneus"], + } + + exp = pd.DataFrame(data, columns=list(data.keys())) + + assert_frame_equal(out, exp) + + +def test_parse_netcdf(order_specs): + ds = xr.Dataset( + { + "YR": ("time", [2010, 2010, 2010]), + "MO": ("time", [7, 7, 7]), + "DY": ("time", [1, 2, 3]), + "HR": ("time", [10, 20, 30]), + }, + coords={"time": [0, 1, 2]}, + attrs={"source": "fake"}, + ) + out = parse_netcdf( + ds=ds, + order_specs=order_specs, + ) + + data = { + ("core", "YR"): [2010, 2010, 2010], + ("core", "MO"): [7, 7, 7], + ("core", "DY"): [1, 2, 3], + ("core", "HR"): [10, 20, 30], + ("c1", 
"ATTI"): [False, False, False], + ("c1", "ATTL"): [False, False, False], + ("c1", "BSI"): [False, False, False], + ("c5", "ATTI"): [False, False, False], + ("c5", "ATTL"): [False, False, False], + ("c5", "OS"): [False, False, False], + ("c5", "OP"): [False, False, False], + ("c98", "ATTI"): [False, False, False], + ("c98", "UID"): [False, False, False], + ("c99_data", "control_No"): [False, False, False], + ("c99_data", "name"): [False, False, False], + } + + exp = pd.DataFrame(data, columns=list(data.keys())) + + assert_frame_equal(out, exp) + + +def test_update_pd_config_updates_encoding(base_config_pd): + pd_kwargs = {"encoding": "latin-1"} + + new_config = update_pd_config(pd_kwargs, base_config_pd) + + assert new_config.encoding == "latin-1" + assert base_config_pd.encoding == "utf-8" + assert new_config is not base_config_pd + + +def test_update_pd_config_no_encoding_key(base_config_pd): + pd_kwargs = {"sep": ","} + + new_config = update_pd_config(pd_kwargs, base_config_pd) + + assert new_config is base_config_pd + + +def test_update_pd_config_empty_encoding(base_config_pd): + pd_kwargs = {"encoding": ""} + + new_config = update_pd_config(pd_kwargs, base_config_pd) + + assert new_config is base_config_pd + + +def test_update_pd_config_none_encoding(base_config_pd): + pd_kwargs = {"encoding": None} + + new_config = update_pd_config(pd_kwargs, base_config_pd) + + assert new_config is base_config_pd + + +def test_update_xr_config_ignores_missing_elements(base_config_xr): + ds = xr.Dataset( + data_vars={ + "TEMP": xr.DataArray([1, 2, 3], attrs={"units": "K"}), + } + ) + + new_config = update_xr_config(ds, base_config_xr) + + elements = new_config.order_specs["core"]["elements"] + assert elements["PRES"]["ignore"] is True + assert elements["TEMP"]["ignore"] is False + + +def test_update_xr_config_populates_validation_from_attrs(base_config_xr): + ds = xr.Dataset( + data_vars={ + "TEMP": xr.DataArray([1, 2, 3], attrs={"units": "K"}), + "PRES": xr.DataArray([1010, 
1011, 1012], attrs={"units": "hPa"}), + } + ) + + new_config = update_xr_config(ds, base_config_xr) + + assert new_config.validation[("core", "TEMP")]["units"] == "K" + assert new_config.validation[("core", "PRES")]["units"] == "hPa" + + +def test_update_xr_config_removes_missing_validation_attrs(base_config_xr): + ds = xr.Dataset( + data_vars={ + "TEMP": xr.DataArray([1, 2, 3], attrs={}), + "PRES": xr.DataArray([1010, 1011, 1012], attrs={"units": "hPa"}), + } + ) + + new_config = update_xr_config(ds, base_config_xr) + + assert "units" not in new_config.validation[("core", "TEMP")] + assert new_config.validation[("core", "PRES")]["units"] == "hPa" + + +def test_update_xr_config_does_not_mutate_original(base_config_xr): + ds = xr.Dataset( + data_vars={ + "TEMP": xr.DataArray([1, 2, 3], attrs={"units": "K"}), + } + ) + + _ = update_xr_config(ds, base_config_xr) + + assert base_config_xr.order_specs["core"]["elements"]["PRES"]["ignore"] is False + assert base_config_xr.validation[("core", "TEMP")]["units"] == "__from_file__" + + +def test_build_parser_config_imodel(): + config = build_parser_config("icoads") + + assert isinstance(config, ParserConfig) + + assert hasattr(config, "order_specs") + assert isinstance(config.order_specs, dict) + assert "core" in config.order_specs + spec = config.order_specs["core"] + assert isinstance(spec, dict) + assert "header" in spec + assert isinstance(spec["header"], dict) + assert "elements" in spec + assert isinstance(spec["elements"], dict) + assert "is_delimited" in spec + assert isinstance(spec["is_delimited"], bool) + + assert hasattr(config, "disable_reads") + assert isinstance(config.disable_reads, list) + assert all(isinstance(x, str) for x in config.disable_reads) + + assert hasattr(config, "dtypes") + assert isinstance(config.dtypes, dict) + assert all(isinstance(x, tuple) for x in config.dtypes.keys()) + assert all(isinstance(x, str) for x in config.dtypes.values()) + + assert hasattr(config, "parse_dates") + assert 
isinstance(config.parse_dates, list) + assert config.parse_dates == [] + + assert hasattr(config, "convert_decode") + assert isinstance(config.convert_decode, dict) + + assert "converter_dict" in config.convert_decode + converter_dict = config.convert_decode["converter_dict"] + assert isinstance(converter_dict, dict) + assert all(isinstance(x, tuple) for x in converter_dict.keys()) + assert all(isinstance(x, MethodType) for x in converter_dict.values()) + + assert "converter_kwargs" in config.convert_decode + converter_kwargs = config.convert_decode["converter_kwargs"] + assert isinstance(converter_kwargs, dict) + assert all(isinstance(x, tuple) for x in converter_kwargs.keys()) + assert all(isinstance(x, dict) for x in converter_kwargs.values()) + + assert "decoder_dict" in config.convert_decode + decoder_dict = config.convert_decode["converter_dict"] + assert isinstance(decoder_dict, dict) + assert all(isinstance(x, tuple) for x in decoder_dict.keys()) + assert all(isinstance(x, MethodType) for x in decoder_dict.values()) + + assert hasattr(config, "validation") + assert isinstance(config.validation, dict) + assert all(isinstance(x, tuple) for x in config.validation.keys()) + assert all(isinstance(x, dict) for x in config.validation.values()) + + assert hasattr(config, "encoding") + assert isinstance(config.encoding, str) + + assert hasattr(config, "columns") + assert config.columns is None diff --git a/tests/test_reader_schemas.py b/tests/test_reader_schemas.py new file mode 100755 index 00000000..07598933 --- /dev/null +++ b/tests/test_reader_schemas.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +import pytest +import json + +from cdm_reader_mapper.mdf_reader.schemas.schemas import ( + _resolve_schema_files, + _normalize_schema, + read_schema, +) + + +@pytest.fixture +def tmp_schema_file(tmp_path): + schema_data = { + "header": {"delimiter": ","}, + "sections": {"sec1": {"elements": {"a": 1, "b": 2}}}, + } + path = tmp_path / "schema" + 
path.mkdir(exist_ok=True) + file_path = tmp_path / "schema" / "schema.json" + file_path.write_text(json.dumps(schema_data)) + return file_path, schema_data + + +def test_resolve_schema_file_by_file(tmp_schema_file): + file_path, _ = tmp_schema_file + result = _resolve_schema_files(ext_schema_file=str(file_path)) + assert isinstance(result, list) + assert result[0] == file_path + + +def test_resolve_schema_file_by_path(tmp_path): + dir_path = tmp_path / "myschema" + dir_path.mkdir() + schema_file = dir_path / "myschema.json" + schema_file.write_text(json.dumps({"header": {}})) + + result = _resolve_schema_files(ext_schema_path=str(dir_path)) + assert len(result) == 1 + assert result[0] == schema_file.resolve() + + +def test_resolve_schema_file_missing_file(tmp_path): + missing_file = tmp_path / "does_not_exist.json" + with pytest.raises(FileNotFoundError): + _resolve_schema_files(ext_schema_file=str(missing_file)) + + +def test_resolve_schema_file_missing_path(tmp_path): + missing_dir = tmp_path / "nonexistent_dir" + with pytest.raises(FileNotFoundError): + _resolve_schema_files(ext_schema_path=str(missing_dir)) + + +def test_resolve_schema_file_no_input(): + with pytest.raises(ValueError): + _resolve_schema_files() + + +def test_normalize_schema_with_sections(): + schema = { + "header": {"delimiter": ","}, + "sections": {"sec1": {"elements": {"a": 1}}}, + } + result = _normalize_schema(schema) + assert "sections" in result + assert result["header"]["parsing_order"] == [{"s": ["sec1"]}] + + +def test_normalize_schema_missing_sections_and_elements(): + schema = {"header": {"delimiter": ","}} + with pytest.raises(KeyError): + _normalize_schema(schema) + + +def test_normalize_schema_preserves_existing_parsing_order(): + schema = { + "header": {"delimiter": ",", "parsing_order": [{"s": ["sec1"]}]}, + "sections": {"sec1": {"elements": {"x": 1}}}, + } + result = _normalize_schema(schema) + assert result["header"]["parsing_order"] == [{"s": ["sec1"]}] + + +def 
test_read_schema_with_imodel(): + result = read_schema(imodel="icoads") + assert isinstance(result, dict) + assert "header" in result + assert "sections" in result + assert "name" in result + + +def test_read_schema_with_ext_file(tmp_schema_file): + file_path, _ = tmp_schema_file + + result = read_schema(ext_schema_file=str(file_path)) + assert isinstance(result, dict) + assert "sections" in result + assert result["sections"]["sec1"]["elements"] == {"a": 1, "b": 2} + assert result["name"] == [file_path] + + +def test_read_schema_with_ext_path(tmp_schema_file): + file_path, _ = tmp_schema_file + result = read_schema(ext_schema_path=str(file_path.parent)) + assert isinstance(result, dict) + assert "sections" in result + assert result["sections"]["sec1"]["elements"] == {"a": 1, "b": 2} + assert result["name"] == [file_path] + + +def test_read_schema_requires_input(): + with pytest.raises(ValueError): + read_schema(imodel=None, ext_schema_path=None, ext_schema_file=None) diff --git a/tests/test_reader_utilities.py b/tests/test_reader_utilities.py new file mode 100755 index 00000000..f4a46639 --- /dev/null +++ b/tests/test_reader_utilities.py @@ -0,0 +1,272 @@ +from __future__ import annotations + +import pandas as pd +import pytest + +from io import StringIO +from pandas.io.parsers import TextFileReader +from pathlib import Path + +from cdm_reader_mapper.mdf_reader.utils.utilities import ( + as_list, + as_path, + join, + update_dtypes, + update_column_names, + update_column_labels, + read_csv, + convert_dtypes, + validate_arg, + _adjust_dtype, + convert_str_boolean, + _remove_boolean_values, + remove_boolean_values, + process_textfilereader, +) + + +def make_parser(text: str, chunksize: int = 1) -> pd.io.parsers.TextFileReader: + """Helper: create a TextFileReader similar to user code.""" + buffer = StringIO(text) + return pd.read_csv(buffer, chunksize=chunksize) + + +@pytest.fixture +def sample_reader() -> pd.io.parsers.TextFileReader: + buffer = 
StringIO("A,B\n1,2\n3,4\n") + return pd.read_csv(buffer, chunksize=1) + + +@pytest.fixture +def tmp_csv_file(tmp_path): + data = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + file_path = tmp_path / "test.csv" + data.to_csv(file_path, index=False) + return file_path, data + + +def sample_func(df): + df_new = df * 2 + extra = {"note": "first_chunk_only"} + return df_new, extra + + +def sample_func_only_df(df): + return df * 2 + + +@pytest.mark.parametrize( + "input_value, expected", + [ + (None, None), + ("hello", ["hello"]), + ([1, 2, 3], [1, 2, 3]), + ((4, 5), [4, 5]), + ], +) +def test_as_list(input_value, expected): + result = as_list(input_value) + assert result == expected + + +def test_as_list_with_set_order_warning(): + s = {"a", "b"} # sets are unordered + result = as_list(s) + assert set(result) == s + + +def test_as_path_with_string(tmp_path): + p = tmp_path / "file.txt" + result = as_path(str(p), "test_param") + assert isinstance(result, Path) + assert result == p + + +def test_as_path_with_pathlike(tmp_path): + p = tmp_path / "file.txt" + result = as_path(p, "test_param") + assert isinstance(result, Path) + assert result == p + + +def test_as_path_with_invalid_type(): + with pytest.raises(TypeError): + as_path(123, "number_param") + + +@pytest.mark.parametrize( + "input_col, expected", + [ + ("single", "single"), + (["a", "b"], "a:b"), + (("x", "y", "z"), "x:y:z"), + ([1, 2], "1:2"), + (42, "42"), + ], +) +def test_join(input_col, expected): + assert join(input_col) == expected + + +def test_update_dtypes(): + dtypes = {"A": int, "B": float, "C": str} + columns = ["A", "C"] + expected = {"A": int, "C": str} + assert update_dtypes(dtypes, columns) == expected + + +def test_update_dtypes_with_empty_columns(): + dtypes = {"A": int, "B": float} + assert update_dtypes(dtypes, []) == {} + + +def test_update_column_names_dict(): + dtypes = {"A": int, "B": float} + updated = update_column_names(dtypes.copy(), "A", "X") + assert updated == {"X": int, "B": float} + + 
+def test_update_column_names_no_change(): + dtypes = {"A": int} + updated = update_column_names(dtypes.copy(), "B", "Y") + assert updated == {"A": int} + + +def test_update_column_names_string_input(): + value = "some string" + assert update_column_names(value, "A", "X") == "some string" + + +def test_update_column_labels_simple_strings(): + cols = ["A", "B", "C"] + result = update_column_labels(cols) + assert isinstance(result, pd.Index) + assert list(result) == ["A", "B", "C"] + + +def test_update_column_labels_colon_strings(): + cols = ["A:B", "C:D"] + result = update_column_labels(cols) + assert isinstance(result, pd.MultiIndex) + assert result.tolist() == [("A", "B"), ("C", "D")] + + +def test_update_column_labels_tuple_strings(): + cols = ["('A','B')", "('C','D')"] + result = update_column_labels(cols) + assert isinstance(result, pd.MultiIndex) + assert result.tolist() == [("A", "B"), ("C", "D")] + + +def test_update_column_labels_mixed(): + cols = ["A", "('B','C')", "D:E"] + result = update_column_labels(cols) + assert isinstance(result, pd.Index) # Not all tuples + assert result.tolist() == ["A", ("B", "C"), ("D", "E")] + + +def test_read_csv_file_exists(tmp_csv_file): + file_path, data = tmp_csv_file + df = read_csv(file_path) + pd.testing.assert_frame_equal(df, data) + + +def test_read_csv_file_missing(tmp_path): + missing_file = tmp_path / "missing.csv" + df = read_csv(missing_file) + assert df.empty + + +def test_read_csv_with_col_subset(tmp_csv_file): + file_path, _ = tmp_csv_file + df = read_csv(file_path, col_subset=["B"]) + assert list(df.columns) == ["B"] + + +def test_convert_dtypes_basic(): + dtypes = {"A": "int", "B": "datetime", "C": "float"} + updated, dates = convert_dtypes(dtypes) + assert updated["B"] == "object" + assert dates == ["B"] + + +def test_validate_arg_correct_type(): + assert validate_arg("x", 5, int) + + +def test_validate_arg_none(): + assert validate_arg("x", None, int) + + +def test_validate_arg_wrong_type(): + with 
pytest.raises(ValueError): + validate_arg("x", "hello", int) + + +def test_convert_str_boolean(): + assert convert_str_boolean("True") is True + assert convert_str_boolean("False") is False + assert convert_str_boolean("hello") == "hello" + assert convert_str_boolean(1) == 1 + + +def test_remove_boolean_values_helper(): + assert _remove_boolean_values("True") is None + assert _remove_boolean_values("False") is None + assert _remove_boolean_values(True) is None + assert _remove_boolean_values(False) is None + assert _remove_boolean_values("abc") == "abc" + + +def test_adjust_dtype(): + df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + dtype = {"A": "int", "B": "float", "C": "str"} + adjusted = _adjust_dtype(dtype, df) + assert adjusted == {"A": "int", "B": "float"} + assert _adjust_dtype("str", df) == "str" + + +def test_remove_boolean_values(): + df = pd.DataFrame({"A": ["True", "False", "hello"], "B": [1, 2, 3]}) + dtypes = {"A": "object", "B": "int"} + result = remove_boolean_values(df, dtypes) + assert result.loc[0, "A"] is None + assert result.loc[1, "A"] is None + assert result.loc[2, "A"] == "hello" + assert result["B"].dtype.name == "int64" + + +def test_process_textfilereader(sample_reader): + reader_out, extra_out = process_textfilereader( + sample_reader, sample_func, read_kwargs={"chunksize": 1} + ) + assert isinstance(reader_out, TextFileReader) + df_out = reader_out.read() + assert df_out.shape == (2, 2) + assert df_out["A"].iloc[0] == 2 + assert df_out["B"].iloc[1] == 8 + assert extra_out == {"note": "first_chunk_only"} + + +def test_process_textfilereader_only_df(sample_reader): + (reader_out,) = process_textfilereader( + sample_reader, sample_func_only_df, read_kwargs={"chunksize": 1} + ) + # only the reader is returned; sample_func_only_df yields no extra metadata + assert isinstance(reader_out, TextFileReader) + df_out = reader_out.read() + assert df_out.shape == (2, 2) + assert df_out["A"].iloc[0] == 2 + assert df_out["B"].iloc[1] == 8 + + +def test_process_textfilereader_makecopy_flag(sample_reader): + 
reader_out, extra_out = process_textfilereader( + sample_reader, sample_func, makecopy=True, read_kwargs={"chunksize": 1} + ) + assert isinstance(reader_out, TextFileReader) + df_out = reader_out.read() + assert df_out.shape == (2, 2) + assert df_out["A"].iloc[0] == 2 + assert df_out["B"].iloc[1] == 8 + assert extra_out == {"note": "first_chunk_only"} diff --git a/tests/test_reader_validator.py b/tests/test_reader_validator.py new file mode 100755 index 00000000..d7c17b7a --- /dev/null +++ b/tests/test_reader_validator.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest + +from cdm_reader_mapper.mdf_reader.utils.validators import ( + _is_true, + _is_false, + validate_datetime, + validate_numeric, + validate_str, + validate_codes, + validate, +) + + +@pytest.fixture +def sample_series(): + return pd.Series(["20200101", "bad", None, "20221231"], dtype="object") + + +@pytest.fixture +def numeric_series(): + return pd.Series(["1", "2", "3", "False", "bad"], dtype="object") + + +@pytest.fixture +def code_series(): + return pd.Series(["A", "B", "C", None, "X"], dtype="object") + + +def test_is_true_false(): + assert _is_true(True) is True + assert _is_true(False) is False + assert _is_false(False) is True + assert _is_false(True) is False + assert _is_true(1) is False + assert _is_false(0) is False + + +def test_validate_datetime(sample_series): + result = validate_datetime(sample_series) + expected = pd.Series([True, False, True, True]) + pd.testing.assert_series_equal(result, expected) + + +def test_validate_numeric(numeric_series): + result = validate_numeric(numeric_series, 1, 3) + expected = pd.Series([True, True, True, False, False]) + pd.testing.assert_series_equal(result, expected) + + +def test_validate_str(numeric_series): + result = validate_str(numeric_series) + expected = pd.Series([True] * len(numeric_series), dtype="boolean") + pd.testing.assert_series_equal(result, expected) + + +def 
test_validate_codes(code_series): + codes = ["A", "B", "C"] + result = validate_codes(code_series, codes, "str") + expected = pd.Series([True, True, True, True, False]) + pd.testing.assert_series_equal(result, expected) + + +@pytest.fixture +def sample_df(): + return pd.DataFrame( + { + "NUM": ["1", "2", "bad", np.nan, "5"], + "KEY": ["0", "1", "2", "9", np.nan], + "STR": ["foo", "bar", "baz", "", np.nan], + "DATE": ["20220101", "20220202", "bad_date", np.nan, "20220505"], + "BOOL": ["True", "False", "TRUE", "FALSE", None], + } + ) + + +@pytest.fixture +def attributes(): + return { + "NUM": {"column_type": "int", "valid_min": 1, "valid_max": 5}, + "KEY": {"column_type": "key", "codetable": "ICOADS.C0.A"}, + "STR": {"column_type": "str"}, + "DATE": {"column_type": "datetime"}, + "BOOL": {"column_type": "int"}, # treat boolean literals as numeric override + } + + +def test_validate_all_columns(sample_df, attributes): + mask = validate( + sample_df, imodel="icoads", ext_table_path=None, attributes=attributes + ) + + expected_num = [True, True, False, True, True] + assert mask["NUM"].tolist() == expected_num + + expected_key = [True, True, True, False, True] + assert mask["KEY"].tolist() == expected_key + + expected_str = [True, True, True, True, True] + assert mask["STR"].tolist() == expected_str + + expected_date = [True, True, False, True, True] + assert mask["DATE"].tolist() == expected_date + + expected_bool = [True, False, False, False, True] + # BOOL uses the "int" column_type override declared in the attributes fixture + assert mask["BOOL"].tolist() == expected_bool