From 3f84840796b7ad7668b42db67774cf14f68f0619 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 15 Mar 2024 13:53:25 +0100 Subject: [PATCH 01/68] delete code blocks containing TextFileReader --- cdm_reader_mapper/cdm_mapper/mapper.py | 20 ++--- cdm_reader_mapper/cdm_mapper/table_writer.py | 9 +- cdm_reader_mapper/mdf_reader/read.py | 86 +++---------------- .../mdf_reader/utils/auxiliary.py | 55 +----------- .../metmetpy/datetime/correct.py | 22 +---- .../metmetpy/platform_type/correct.py | 27 +----- cdm_reader_mapper/operations/corrections.py | 20 ----- cdm_reader_mapper/operations/inspect.py | 24 +----- cdm_reader_mapper/operations/select.py | 23 ++--- 9 files changed, 38 insertions(+), 248 deletions(-) diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index 4c11750d..bc61799d 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -3,9 +3,8 @@ Created on Thu Apr 11 13:45:38 2019 -Maps data contained in a pandas DataFrame (or pd.io.parsers.TextFileReader) to -the C3S Climate Data Store Common Data Model (CDM) header and observational -tables using the mapping information available in the tool's mapping library +Maps data contained in a pandas DataFrame to the C3S Climate Data Store Common Data Model (CDM) +header and observational tables using the mapping information available in the tool's mapping library for the input data model. @author: iregon @@ -17,7 +16,7 @@ import numpy as np import pandas as pd -from cdm_reader_mapper.common import logging_hdlr, pandas_TextParser_hdlr +from cdm_reader_mapper.common import logging_hdlr from . import properties from .codes.codes_hdlr import codes_hdlr @@ -190,7 +189,7 @@ def _map(imodel, data, data_atts, cdm_subset=None, codes_subset=None, log_level= """ Map to the C3S Climate Data Store Common Data Model (CDM). - Maps a pandas DataFrame (or pd.io.parsers.TextFileReader) to the C3S Climate Data Store Common Data Model (CDM) + Maps a pandas DataFrame to the C3S Climate Data Store Common Data Model (CDM) header and observational tables using mapping information from the input data model (imodel). Parameters @@ -201,7 +200,7 @@ def _map(imodel, data, data_atts, cdm_subset=None, codes_subset=None, log_level= 2. A specific mapping from generic data model to CDM, like map a SID-DCK from IMMA1's core and attachments to CDM in a specific way. e.g. ``~/cdm-mapper/lib/mappings/icoads_r3000_d704`` data: input data to map - e.g. a pandas.Dataframe or io.parsers.TextFileReader objects or in-memory text streams (io.StringIO object). + e.g. a pandas.Dataframe. data_atts: dictionary with the {element_name:element_attributes} of the data. Type: string. cdm_subset: subset of CDM model tables to map. @@ -350,7 +349,7 @@ def map_model( 2. A specific mapping from generic data model to CDM, like map a SID-DCK from IMMA1’s core and attachments to CDM in a specific way. e.g. ``cdm/library/mappings/icoads_r3000_d704`` - data: pd.DataFrame, pd.parser.TextFileReader or io.String + data: pd.DataFrame input data to map. data_atts: dict dictionary with the {element_name:element_attributes} of the data. @@ -386,13 +385,6 @@ def map_model( return else: data = [data] - elif isinstance(data, pd.io.parsers.TextFileReader): - logger.debug("Input is a pd.TextFileReader") - not_empty = pandas_TextParser_hdlr.is_not_empty(data) - if not not_empty: - logger.error("Input data is empty") - return - else: logger.error("Input data type " f"{type(data)}" " not supported") return diff --git a/cdm_reader_mapper/cdm_mapper/table_writer.py b/cdm_reader_mapper/cdm_mapper/table_writer.py index 02ec6aa3..14a4c695 100755 --- a/cdm_reader_mapper/cdm_mapper/table_writer.py +++ b/cdm_reader_mapper/cdm_mapper/table_writer.py @@ -4,8 +4,7 @@ Created on Thu Apr 11 13:45:38 2019 Exports tables written in the C3S Climate Data Store Common Data Model (CDM) format to ascii files, -The tables format is contained in a python dictionary, stored as an attribute in a pandas.DataFrame -(or pd.io.parsers.TextFileReader). +The tables format is contained in a python dictionary, stored as an attribute in a pandas.DataFrame. This module uses a set of printer functions to "print" element values to a string object before exporting them to a final ascii file. @@ -234,8 +233,7 @@ def table_to_ascii( Export a cdm table to an ascii file. Exports tables written in the C3S Climate Data Store Common Data Model (CDM) format to ascii files. - The tables format is contained in a python dictionary, stored as an attribute in a ``pandas.DataFrame`` - (or ``pd.io.parsers.TextFileReader``). + The tables format is contained in a python dictionary, stored as an attribute in a ``pandas.DataFrame``. Parameters ---------- @@ -331,8 +329,7 @@ def cdm_to_ascii( Exports a complete cdm file with multiple tables written in the C3S Climate Data Store Common Data Model (CDM) format to ascii files. - The tables format is contained in a python dictionary, stored as an attribute in a ``pandas.DataFrame`` - (or ``pd.io.parsers.TextFileReader``). + The tables format is contained in a python dictionary, stored as an attribute in a ``pandas.DataFrame``. Parameters ---------- diff --git a/cdm_reader_mapper/mdf_reader/read.py b/cdm_reader_mapper/mdf_reader/read.py index 63e44591..52ecb640 100755 --- a/cdm_reader_mapper/mdf_reader/read.py +++ b/cdm_reader_mapper/mdf_reader/read.py @@ -8,8 +8,6 @@ import pandas as pd -from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy - from . import properties from .schema import schemas from .utils.auxiliary import _FileReader, validate_arg, validate_path @@ -30,14 +28,12 @@ class MDFFileReader(_FileReader): Attributes ---------- - data : pd.DataFrame or pd.io.parsers.TextFileReader - a pandas.DataFrame or pandas.io.parsers.TextFileReader - with the output data + data : pd.DataFrame + a pandas.DataFrame with the output data attrs : dict a dictionary with the output data elements attributes - mask : pd.DataFrame or pd.io.parsers.TextFileReader - a pandas.DataFrame or pandas.io.parsers.TextFileReader - with the output data validation mask + mask : pd.DataFrame + a pandas.DataFrame with the output data validation mask """ def __init__(self, *args, **kwargs): @@ -90,49 +86,13 @@ def convert_and_decode_entries( if decode is not True: decoder_dict = {} - if isinstance(self.data, pd.DataFrame): - self.data = self._convert_and_decode_df( - self.data, - converter_dict, - converter_kwargs, - decoder_dict, - ) - self.data = self.data.astype(dtype) - else: - data_buffer = StringIO() - for i, df_ in enumerate(self.data): - df = self._convert_and_decode_df( - df_, - converter_dict, - converter_kwargs, - decoder_dict, - ) - df.to_csv( - data_buffer, - header=False, - mode="a", - encoding="utf-8", - index=False, - quoting=csv.QUOTE_NONE, - sep=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - data_buffer.seek(0) - date_columns = [] - for i, element in enumerate(list(dtype)): - if dtype.get(element) == "datetime": - date_columns.append(i) - self.data = pd.read_csv( - data_buffer, - names=df.columns, - chunksize=self.chunksize, - dtype=dtype, - parse_dates=date_columns, - delimiter=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) + self.data = self._convert_and_decode_df( + self.data, + converter_dict, + converter_kwargs, + decoder_dict, + ) + self.data = self.data.astype(dtype) return self def validate_entries( @@ -142,27 +102,7 @@ def validate_entries( Fill attribute `valid` with boolean mask. """ - if isinstance(self.data, pd.DataFrame): - self.mask = self._validate_df(self.data) - - else: - data_buffer = StringIO() - TextParser_ = make_copy(self.data) - for i, df_ in enumerate(TextParser_): - mask_ = self._validate_df(df_) - mask_.to_csv( - data_buffer, - header=False, - mode="a", - encoding="utf-8", - index=False, - ) - data_buffer.seek(0) - self.mask = pd.read_csv( - data_buffer, - names=df_.columns, - chunksize=self.chunksize, - ) + self.mask = self._validate_df(self.data) return self def read( @@ -231,7 +171,7 @@ def read( sections = read_sections_list # 2.2 Homogeneize input data to an iterable with dataframes: - # a list with a single dataframe or a pd.io.parsers.TextFileReader + # a list with a single dataframe logging.info("Getting data string from source...") # self.configurations = self._get_configurations(read_sections_list, sections) self.configurations = self._get_configurations(read_sections_list, sections) diff --git a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py index a9dc7093..8cc02bfd 100755 --- a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py +++ b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py @@ -10,7 +10,6 @@ import pandas as pd -from cdm_reader_mapper.common import pandas_TextParser_hdlr from cdm_reader_mapper.common.getting_files import get_files from .. import properties @@ -408,7 +407,6 @@ def _open_data( self, order, valid, - chunksize, ): TextParser = self._read_pandas( encoding=self.schema["header"].get("encoding"), @@ -417,49 +415,8 @@ def _open_data( chunksize=chunksize, ) - if isinstance(TextParser, pd.DataFrame): - df, self.missings = self._read_sections(TextParser, order, valid) - return df - else: - data_buffer = StringIO() - missings_buffer = StringIO() - for i, df_ in enumerate(TextParser): - df, missings = self._read_sections(df_, order, valid) - missings.to_csv( - missings_buffer, - header=False, - mode="a", - encoding="utf-8", - index=False, - ) - df.to_csv( - data_buffer, - header=False, - mode="a", - encoding="utf-8", - index=False, - quoting=csv.QUOTE_NONE, - sep=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - missings_buffer.seek(0) - self.missings = pd.read_csv( - missings_buffer, - names=missings.columns, - chunksize=None, - ) - data_buffer.seek(0) - return pd.read_csv( - data_buffer, - names=df.columns, - chunksize=self.chunksize, - dtype=object, - parse_dates=self.parse_dates, - delimiter=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) + df, self.missings = self._read_sections(TextParser, order, valid) + return df def _convert_and_decode_df( self, @@ -504,12 +461,8 @@ def _validate_df(self, df): def _dump_atts(self, out_atts, out_path): """Dump attributes to atts.json.""" - if not isinstance(self.data, pd.io.parsers.TextFileReader): - data = [self.data] - valid = [self.mask] - else: - data = pandas_TextParser_hdlr.make_copy(self.data) - valid = pandas_TextParser_hdlr.make_copy(self.mask) + data = [self.data] + valid = [self.mask] logging.info(f"WRITING DATA TO FILES IN: {out_path}") for i, (data_df, valid_df) in enumerate(zip(data, valid)): header = False diff --git a/cdm_reader_mapper/metmetpy/datetime/correct.py b/cdm_reader_mapper/metmetpy/datetime/correct.py index f584064e..f749025f 100755 --- a/cdm_reader_mapper/metmetpy/datetime/correct.py +++ b/cdm_reader_mapper/metmetpy/datetime/correct.py @@ -88,23 +88,5 @@ def correct(data, data_model, deck, log_level="INFO"): replacements".format() ) - if isinstance(data, pd.DataFrame): - data = correct_it(data, data_model, deck, log_level="INFO") - return data - elif isinstance(data, pd.io.parsers.TextFileReader): - read_params = [ - "chunksize", - "names", - "dtype", - "parse_dates", - "date_parser", - "infer_datetime_format", - ] - read_dict = {x: data.orig_options.get(x) for x in read_params} - buffer = StringIO() - data_ = pandas_TextParser_hdlr.make_copy(data) - for df in data_: - df = correct_it(df, data_model, deck, log_level="INFO") - df.to_csv(buffer, header=False, index=False, mode="a") - buffer.seek(0) - return pd.read_csv(buffer, **read_dict) + data = correct_it(data, data_model, deck, log_level="INFO") + return data diff --git a/cdm_reader_mapper/metmetpy/platform_type/correct.py b/cdm_reader_mapper/metmetpy/platform_type/correct.py index c945b3ab..7aae9ce9 100755 --- a/cdm_reader_mapper/metmetpy/platform_type/correct.py +++ b/cdm_reader_mapper/metmetpy/platform_type/correct.py @@ -116,27 +116,6 @@ def correct(data, dataset, data_model, deck, log_level="INFO"): ) return - if isinstance(data, pd.DataFrame): - data = correct_it( - data, dataset, data_model, deck, pt_col, fix_methods, log_level="INFO" - ) - return data - elif isinstance(data, pd.io.parsers.TextFileReader): - read_params = [ - "chunksize", - "names", - "dtype", - "parse_dates", - "date_parser", - "infer_datetime_format", - ] - read_dict = {x: data.orig_options.get(x) for x in read_params} - buffer = StringIO() - for df in data: - df = correct_it( - df, dataset, data_model, deck, pt_col, fix_methods, log_level="INFO" - ) - df.to_csv(buffer, header=False, index=False, mode="a") - - buffer.seek(0) - return pd.read_csv(buffer, **read_dict) + return correct_it( + data, dataset, data_model, deck, pt_col, fix_methods, log_level="INFO" + ) diff --git a/cdm_reader_mapper/operations/corrections.py b/cdm_reader_mapper/operations/corrections.py index 378a770a..ce63e017 100755 --- a/cdm_reader_mapper/operations/corrections.py +++ b/cdm_reader_mapper/operations/corrections.py @@ -16,8 +16,6 @@ import pandas as pd from textdistance import levenshtein -from cdm_reader_mapper.common import pandas_TextParser_hdlr - # %% extract NOC_corrections/duplicates def gen_files(data, dataset, correction_path, yr, mo): @@ -70,7 +68,6 @@ def gen_files(data, dataset, correction_path, yr, mo): if not df.empty: dup, dup_f = get_dup(data, dataset) if os.path.exists(fn): - print(fn) df1 = pd.read_csv( fn, delimiter="|", @@ -227,23 +224,6 @@ def corrections(data, dataset, correction_path, yr, mo): if isinstance(data, pd.DataFrame): gen_files(data.copy(), dataset, correction_path, yr, mo) return - elif isinstance(data, pd.io.parsers.TextFileReader): - # read_params = [ - # "chunksize", - # "names", - # "dtype", - # "parse_dates", - # "date_parser", - # "infer_datetime_format", - # ] - # read_dict = {x: data.orig_options.get(x) for x in read_params} - buffer = StringIO() - data_copy = pandas_TextParser_hdlr.make_copy(data) - for dt in data_copy: - gen_files(dt, dataset, correction_path, yr, mo) - buffer.seek(0) - data_copy.close() - data = pandas_TextParser_hdlr.restore(data) def split_list(n): diff --git a/cdm_reader_mapper/operations/inspect.py b/cdm_reader_mapper/operations/inspect.py index 8c04e2db..cf3ae9e7 100755 --- a/cdm_reader_mapper/operations/inspect.py +++ b/cdm_reader_mapper/operations/inspect.py @@ -11,8 +11,6 @@ import numpy as np import pandas as pd -from cdm_reader_mapper.common import pandas_TextParser_hdlr - def count_by_cat_i(serie): """Count unique values.""" @@ -23,27 +21,9 @@ def count_by_cat_i(serie): def get_length(data): """Get length of pandas object.""" - if not isinstance(data, pd.io.parsers.TextFileReader): - return len(data) - else: - return pandas_TextParser_hdlr.get_length(data) + return len(data) def count_by_cat(data, col): """Count unique values.""" - if not isinstance(data, pd.io.parsers.TextFileReader): - return count_by_cat_i(data[col]) - else: - data_cp = pandas_TextParser_hdlr.make_copy(data) - count_dicts = [] - for df in data_cp: - count_dicts.append(count_by_cat_i(df[col])) - - data_cp.close() - cats = [list(x.keys()) for x in count_dicts] - cats = list({x for y in cats for x in y}) - cats.sort - count_dict = {} - for cat in cats: - count_dict[cat] = sum([x.get(cat) for x in count_dicts if x.get(cat)]) - return count_dict + return count_by_cat_i(data[col]) diff --git a/cdm_reader_mapper/operations/select.py b/cdm_reader_mapper/operations/select.py index 859cbcb6..ff315d71 100755 --- a/cdm_reader_mapper/operations/select.py +++ b/cdm_reader_mapper/operations/select.py @@ -12,7 +12,6 @@ import pandas as pd -from cdm_reader_mapper.common import pandas_TextParser_hdlr # Need to define a general thing for the parser() functions, like we did with # the dataframe_apply_index(), because they are all the same but for the @@ -21,7 +20,7 @@ # The index of the resulting dataframe(s) is reinitialized here, it does not # inherit from parent df # -# data is a dataframe or a TextFileReader +# data is a dataframe def dataframe_apply_index( @@ -114,12 +113,7 @@ def parser(data_parser, mask_parser, col, out_rejected=False, in_index=False): return output - if not isinstance(data, pd.io.parsers.TextFileReader): - output = dataframe( - data, mask[col], out_rejected=out_rejected, in_index=in_index - ) - else: - output = parser(data, mask, col, out_rejected=out_rejected, in_index=in_index) + output = parser(data, mask, col, out_rejected=out_rejected, in_index=in_index) if len(output) > 1: return output @@ -202,12 +196,8 @@ def parser(data_parser, col, values, out_rejected=False, in_index=False): col = list(selection.keys())[0] values = list(selection.values())[0] - if not isinstance(data, pd.io.parsers.TextFileReader): - output = dataframe( - data, col, values, out_rejected=out_rejected, in_index=in_index - ) - else: - output = parser(data, col, values, out_rejected=out_rejected, in_index=in_index) + + output = parser(data, col, values, out_rejected=out_rejected, in_index=in_index) if len(output) > 1: return output @@ -255,10 +245,7 @@ def parser(data_parser, index, out_rejected=False): output.append(pd.read_csv(out_buffer, **read_dict)) return output - if not isinstance(data, pd.io.parsers.TextFileReader): - output = dataframe(data, index, out_rejected=out_rejected) - else: - output = parser(data, index, out_rejected=out_rejected) + output = parser(data, index, out_rejected=out_rejected) if len(output) > 1: return output From 5b3f16942e4235994d9a91ffabf1385afe9a2247 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 15 Mar 2024 14:06:08 +0100 Subject: [PATCH 02/68] get rid of chunksize --- cdm_reader_mapper/mdf_reader/read.py | 16 ++-------------- cdm_reader_mapper/mdf_reader/utils/auxiliary.py | 2 -- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/read.py b/cdm_reader_mapper/mdf_reader/read.py index 52ecb640..824d9e0b 100755 --- a/cdm_reader_mapper/mdf_reader/read.py +++ b/cdm_reader_mapper/mdf_reader/read.py @@ -46,7 +46,6 @@ def convert_and_decode_entries( converter_dict=None, converter_kwargs=None, decoder_dict=None, - dtype=None, ): """Convert and decode data entries by using a pre-defined data model. @@ -67,10 +66,6 @@ def convert_and_decode_entries( decoder_dict: dict, optional Functions for decoding values in specific columns. If None use information from a pre-defined data model. - dtype: dtype or dict of {Hashable: dtype}, optional - Data type(s) to apply to either the whole dataset or individual columns. - If None use information from a pre-defined data model. - Use only if data is read with chunksizes. """ if converter_dict is None: converter_dict = self.configurations["convert_decode"]["converter_dict"] @@ -78,14 +73,13 @@ def convert_and_decode_entries( converter_kwargs = self.configurations["convert_decode"]["converter_kwargs"] if decoder_dict is None: decoder_dict = self.configurations["convert_decode"]["decoder_dict"] - if dtype is None: - dtype = self.configurations["convert_decode"]["dtype"] if convert is not True: converter_dict = {} converter_kwargs = {} if decode is not True: decoder_dict = {} + dtype = self.configurations["convert_decode"]["dtype"] self.data = self._convert_and_decode_df( self.data, converter_dict, @@ -107,7 +101,6 @@ def validate_entries( def read( self, - chunksize=None, sections=None, skiprows=0, out_path=None, @@ -123,8 +116,6 @@ def read( Parameters ---------- - chunksize : int, optional - Number of reports per chunk. sections : list, optional List with subset of data model sections to output, optional If None read pre-defined data model sections. @@ -151,14 +142,11 @@ def read( # 0. VALIDATE INPUT if not validate_arg("sections", sections, list): return - if not validate_arg("chunksize", chunksize, int): - return if not validate_arg("skiprows", skiprows, int): return if not validate_path("out_path", out_path): return - self.chunksize = chunksize self.skiprows = skiprows # 2. READ AND VALIDATE DATA @@ -175,7 +163,7 @@ def read( logging.info("Getting data string from source...") # self.configurations = self._get_configurations(read_sections_list, sections) self.configurations = self._get_configurations(read_sections_list, sections) - self.data = self._open_data(read_sections_list, sections, chunksize=chunksize) + self.data = self._open_data(read_sections_list, sections) ## 2.3. Extract, read and validate data in same loop # logging.info("Extracting and reading sections") diff --git a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py index 8cc02bfd..8fc64877 100755 --- a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py +++ b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py @@ -412,9 +412,7 @@ def _open_data( encoding=self.schema["header"].get("encoding"), widths=[properties.MAX_FULL_REPORT_WIDTH], skiprows=self.skiprows, - chunksize=chunksize, ) - df, self.missings = self._read_sections(TextParser, order, valid) return df From bad91e711aa2a568e7e2980e20f067aa1f99897f Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 15 Mar 2024 14:15:31 +0100 Subject: [PATCH 03/68] pre-commit --- cdm_reader_mapper/cdm_mapper/mapper.py | 2 +- cdm_reader_mapper/mdf_reader/read.py | 10 ++++------ cdm_reader_mapper/mdf_reader/utils/auxiliary.py | 3 --- cdm_reader_mapper/metmetpy/datetime/correct.py | 5 +---- cdm_reader_mapper/metmetpy/platform_type/correct.py | 5 +---- cdm_reader_mapper/operations/corrections.py | 1 - cdm_reader_mapper/operations/inspect.py | 1 - cdm_reader_mapper/operations/select.py | 3 +-- 8 files changed, 8 insertions(+), 22 deletions(-) diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index bc61799d..5c0e3569 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -3,7 +3,7 @@ Created on Thu Apr 11 13:45:38 2019 -Maps data contained in a pandas DataFrame to the C3S Climate Data Store Common Data Model (CDM) +Maps data contained in a pandas DataFrame to the C3S Climate Data Store Common Data Model (CDM) header and observational tables using the mapping information available in the tool's mapping library for the input data model. diff --git a/cdm_reader_mapper/mdf_reader/read.py b/cdm_reader_mapper/mdf_reader/read.py index 824d9e0b..0cd65f02 100755 --- a/cdm_reader_mapper/mdf_reader/read.py +++ b/cdm_reader_mapper/mdf_reader/read.py @@ -2,13 +2,11 @@ from __future__ import annotations -import csv import logging from io import StringIO as StringIO import pandas as pd -from . import properties from .schema import schemas from .utils.auxiliary import _FileReader, validate_arg, validate_path @@ -81,10 +79,10 @@ def convert_and_decode_entries( dtype = self.configurations["convert_decode"]["dtype"] self.data = self._convert_and_decode_df( - self.data, - converter_dict, - converter_kwargs, - decoder_dict, + self.data, + converter_dict, + converter_kwargs, + decoder_dict, ) self.data = self.data.astype(dtype) return self diff --git a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py index 8fc64877..126097ef 100755 --- a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py +++ b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py @@ -2,17 +2,14 @@ from __future__ import annotations -import csv import json import logging import os -from io import StringIO import pandas as pd from cdm_reader_mapper.common.getting_files import get_files -from .. import properties from ..schema import schemas from ..validate import validate from . import converters, decoders diff --git a/cdm_reader_mapper/metmetpy/datetime/correct.py b/cdm_reader_mapper/metmetpy/datetime/correct.py index f749025f..8d0550d2 100755 --- a/cdm_reader_mapper/metmetpy/datetime/correct.py +++ b/cdm_reader_mapper/metmetpy/datetime/correct.py @@ -31,11 +31,8 @@ import json import os -from io import StringIO -import pandas as pd - -from cdm_reader_mapper.common import logging_hdlr, pandas_TextParser_hdlr +from cdm_reader_mapper.common import logging_hdlr from cdm_reader_mapper.common.getting_files import get_files from .. import properties diff --git a/cdm_reader_mapper/metmetpy/platform_type/correct.py b/cdm_reader_mapper/metmetpy/platform_type/correct.py index 7aae9ce9..061a5942 100755 --- a/cdm_reader_mapper/metmetpy/platform_type/correct.py +++ b/cdm_reader_mapper/metmetpy/platform_type/correct.py @@ -40,9 +40,6 @@ import json import os -from io import StringIO - -import pandas as pd from cdm_reader_mapper.common import logging_hdlr from cdm_reader_mapper.common.getting_files import get_files @@ -117,5 +114,5 @@ def correct(data, dataset, data_model, deck, log_level="INFO"): return return correct_it( - data, dataset, data_model, deck, pt_col, fix_methods, log_level="INFO" + data, dataset, data_model, deck, pt_col, fix_methods, log_level="INFO" ) diff --git a/cdm_reader_mapper/operations/corrections.py b/cdm_reader_mapper/operations/corrections.py index ce63e017..e504c8a5 100755 --- a/cdm_reader_mapper/operations/corrections.py +++ b/cdm_reader_mapper/operations/corrections.py @@ -10,7 +10,6 @@ import logging import math import os -from io import StringIO import numpy as np import pandas as pd diff --git a/cdm_reader_mapper/operations/inspect.py b/cdm_reader_mapper/operations/inspect.py index cf3ae9e7..b43deba3 100755 --- a/cdm_reader_mapper/operations/inspect.py +++ b/cdm_reader_mapper/operations/inspect.py @@ -9,7 +9,6 @@ from __future__ import annotations import numpy as np -import pandas as pd def count_by_cat_i(serie): diff --git a/cdm_reader_mapper/operations/select.py b/cdm_reader_mapper/operations/select.py index ff315d71..a653f081 100755 --- a/cdm_reader_mapper/operations/select.py +++ b/cdm_reader_mapper/operations/select.py @@ -12,7 +12,6 @@ import pandas as pd - # Need to define a general thing for the parser() functions, like we did with # the dataframe_apply_index(), because they are all the same but for the # selection applied!!!!! @@ -196,7 +195,7 @@ def parser(data_parser, col, values, out_rejected=False, in_index=False): col = list(selection.keys())[0] values = list(selection.values())[0] - + output = parser(data, col, values, out_rejected=out_rejected, in_index=in_index) if len(output) > 1: From 7b69471e09758dd333f160993175cd4d8328437f Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 15 Mar 2024 14:15:47 +0100 Subject: [PATCH 04/68] delete chunked tests --- tests/test_cdm.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/tests/test_cdm.py b/tests/test_cdm.py index b6f7c9ba..87476a86 100755 --- a/tests/test_cdm.py +++ b/tests/test_cdm.py @@ -6,8 +6,6 @@ from ._testing_cdm_suite import _testing_suite -# from _testing_cdm_suite import _testing_suite - def test_read_imma1_buoys_nosupp(): _testing_suite( @@ -194,26 +192,3 @@ def test_read_imma1_buoys_codes_subset(): codes_subset=["platform_sub_type", "wind_direction"], suffix="063_714", ) - - -# C. TESTS TO TEST CHUNKING -# ----------------------------------------------------------------------------- -# FROM FILE: WITH AND WITHOUT SUPPLEMENTAL -def test_read_imma1_buoys_nosupp_chunks(): - "NOT WORKING: textfilereader" - _testing_suite( - **test_data.test_063_714, - cdm_name="icoads_r3000_d714", - suffix="063_714", - chunksize=10000, - ) - - -def test_read_imma1_buoys_supp_chunks(): - _testing_suite( - **test_data.test_063_714, - sections="c99", - suffix="063_714", - chunksize=10000, - mapping=False, - ) From ae1998c5154fa4bc5dd024de45045751239d8358 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 10 Apr 2024 09:29:38 +0200 Subject: [PATCH 05/68] do not use chunksize --- cdm_reader_mapper/mdf_reader/read.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/read.py b/cdm_reader_mapper/mdf_reader/read.py index 1afdc6dc..8c637a84 100755 --- a/cdm_reader_mapper/mdf_reader/read.py +++ b/cdm_reader_mapper/mdf_reader/read.py @@ -7,6 +7,7 @@ import pandas as pd +from . import properties from .schema import schemas from .utils.auxiliary import _FileReader, validate_arg, validate_path @@ -161,12 +162,11 @@ def read( logging.info("Getting data string from source...") # self.configurations = self._get_configurations(read_sections_list, sections) self.configurations = self._get_configurations(read_sections_list, sections) - + self.data = self._open_data( read_sections_list, sections, open_with=properties.open_file[self.imodel], - chunksize=chunksize, ) ## 2.3. Extract, read and validate data in same loop From 46a46692589934d65f2e5b07d984fcb7af3c2a58 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 10 Apr 2024 09:30:47 +0200 Subject: [PATCH 06/68] do not import unused packages --- cdm_reader_mapper/mdf_reader/utils/auxiliary.py | 5 ++--- cdm_reader_mapper/metmetpy/datetime/correct.py | 2 -- cdm_reader_mapper/metmetpy/platform_type/correct.py | 4 ---- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py index 7b98a8a2..93dc7e0b 100755 --- a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py +++ b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py @@ -5,9 +5,7 @@ import json import logging import os - from copy import deepcopy -from io import StringIO import numpy as np import pandas as pd @@ -15,6 +13,7 @@ from cdm_reader_mapper.common.getting_files import get_files +from .. import properties from ..schema import schemas from ..validate import validate from . import converters, decoders @@ -496,7 +495,7 @@ def _open_data( ) df, self.missings = self._read_sections( - TextParser, order, valid, open_with=open_with + TextParser, order, valid, open_with=open_with ) return df diff --git a/cdm_reader_mapper/metmetpy/datetime/correct.py b/cdm_reader_mapper/metmetpy/datetime/correct.py index f3a41a3e..f16ff2ac 100755 --- a/cdm_reader_mapper/metmetpy/datetime/correct.py +++ b/cdm_reader_mapper/metmetpy/datetime/correct.py @@ -31,8 +31,6 @@ import json -from io import StringIO - from cdm_reader_mapper.common import logging_hdlr from cdm_reader_mapper.common.getting_files import get_files diff --git a/cdm_reader_mapper/metmetpy/platform_type/correct.py b/cdm_reader_mapper/metmetpy/platform_type/correct.py index 772d4751..f793d6c7 100755 --- a/cdm_reader_mapper/metmetpy/platform_type/correct.py +++ b/cdm_reader_mapper/metmetpy/platform_type/correct.py @@ -40,10 +40,6 @@ import json -from io import StringIO - -import pandas as pd - from cdm_reader_mapper.common import logging_hdlr from cdm_reader_mapper.common.getting_files import get_files From 50fd66dab8e682ee6c09b1352dc683a7db803813 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 10 Apr 2024 09:31:11 +0200 Subject: [PATCH 07/68] get rid of TextFileReader objects --- cdm_reader_mapper/operations/select.py | 50 +------------------------- 1 file changed, 1 insertion(+), 49 deletions(-) diff --git a/cdm_reader_mapper/operations/select.py b/cdm_reader_mapper/operations/select.py index e99f277c..f7a3a2f0 100755 --- a/cdm_reader_mapper/operations/select.py +++ b/cdm_reader_mapper/operations/select.py @@ -65,55 +65,7 @@ def dataframe( idx_out_offset=idx_out_offset, ) - def parser(data_parser, mask_parser, out_rejected=False, in_index=False): - mask_cp = pandas_TextParser_hdlr.make_copy(mask_parser) - read_params = [ - "chunksize", - "names", - "dtype", - "parse_dates", - "date_parser", - "infer_datetime_format", - ] - read_dict = {x: data_parser.orig_options.get(x) for x in read_params} - in_buffer = StringIO() - if out_rejected: - out_buffer = StringIO() - if in_index: - index = [] - idx_in_offset = 0 - idx_out_offset = 0 - for df, mask_df in zip(data_parser, mask_cp): - o = dataframe( - df, - mask_df, - out_rejected=out_rejected, - in_index=in_index, - idx_in_offset=idx_in_offset, - idx_out_offset=idx_out_offset, - ) - o[0].to_csv(in_buffer, header=False, index=False, mode="a") - if out_rejected: - o[1].to_csv(out_buffer, header=False, index=False, mode="a") - idx_out_offset += len(o[1]) - if in_index and not out_rejected: - index.extend(o[1]) - if in_index and out_rejected: - index.extend(o[2]) - idx_in_offset += len(o[0]) - - mask_cp.close() - in_buffer.seek(0) - output = [pd.read_csv(in_buffer, **read_dict)] - if out_rejected: - out_buffer.seek(0) - output.append(pd.read_csv(out_buffer, **read_dict)) - if in_index: - output.append(index) - - return output - - output = parser(data, mask, col, out_rejected=out_rejected, in_index=in_index) + output = dataframe(data, mask, out_rejected=out_rejected, in_index=in_index) if len(output) > 1: return output From 4650eca8b3bd79b278dbc007fc9029b7d193389a Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 10 Apr 2024 09:43:53 +0200 Subject: [PATCH 08/68] deleted --- .../common/pandas_TextParser_hdlr.py | 94 ------------------- 1 file changed, 94 deletions(-) delete mode 100755 cdm_reader_mapper/common/pandas_TextParser_hdlr.py diff --git a/cdm_reader_mapper/common/pandas_TextParser_hdlr.py b/cdm_reader_mapper/common/pandas_TextParser_hdlr.py deleted file mode 100755 index 34bfce95..00000000 --- a/cdm_reader_mapper/common/pandas_TextParser_hdlr.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -Functions for pandas TextParser objects. - -Created on Tue Apr 2 10:34:56 2019 - -Assumes we are never writing a header! - -@author: iregon -""" - -from __future__ import annotations - -from io import StringIO - -import pandas as pd - -from . import logging_hdlr - -logger = logging_hdlr.init_logger(__name__, level="DEBUG") - -read_params = [ - "chunksize", - "names", - "dtype", - "parse_dates", - "date_parser", - "infer_datetime_format", - "delimiter", - "quotechar", - "escapechar", -] - - -def make_copy(OParser): - """Make a copy of a pandas TextParser object.""" - try: - f = OParser.handles.handle - NewRef = StringIO(f.getvalue()) - read_dict = {x: OParser.orig_options.get(x) for x in read_params} - NParser = pd.read_csv(NewRef, **read_dict) - return NParser - except Exception: - logger.error("Failed to copy TextParser", exc_info=True) - return - - -def restore(Parser): - """Restore pandas TextParser object.""" - try: - f = Parser.handles.handle - f.seek(0) - read_dict = {x: Parser.orig_options.get(x) for x in read_params} - Parser = pd.read_csv(f, **read_dict) - return Parser - except Exception: - logger.error("Failed to restore TextParser", exc_info=True) - return Parser - - -def is_not_empty(Parser): - """Return boolean whether pandas TextParser object is empty.""" - try: - Parser_copy = make_copy(Parser) - except Exception: - logger.error( - f"Failed to process input. Input type is {type(Parser)}", exc_info=True - ) - return - try: - first_chunk = Parser_copy.get_chunk() - Parser_copy.close() - if len(first_chunk) > 0: - logger.debug("Is not empty") - return True - else: - return False - except Exception: - logger.debug("Something went wrong", exc_info=True) - return False - - -def get_length(Parser): - """Get length of pandas TextParser object.""" - try: - Parser_copy = make_copy(Parser) - except Exception: - logger.error( - f"Failed to process input. Input type is {type(Parser)}", exc_info=True - ) - return - no_records = 0 - for df in Parser_copy: - no_records += len(df) - return no_records From 98762ebfb69ff8671938d5f29c4eace5b86bc1e2 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 10 Apr 2024 09:44:11 +0200 Subject: [PATCH 09/68] delete chunksizes --- cdm_reader_mapper/mdf_reader/utils/auxiliary.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py index 93dc7e0b..10aadd98 100755 --- a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py +++ b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py @@ -481,7 +481,6 @@ def _open_data( self, order, valid, - chunksize, open_with="pandas", ): if open_with == "netcdf": @@ -491,7 +490,6 @@ def _open_data( encoding=self.schema["header"].get("encoding"), widths=[properties.MAX_FULL_REPORT_WIDTH], skiprows=self.skiprows, - chunksize=chunksize, ) df, self.missings = self._read_sections( From 3e3812cd221883a75b7ee9f39321b3f2af7eb4da Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 10 Apr 2024 09:44:44 +0200 Subject: [PATCH 10/68] do not differentiate between DataFrame and TExtFileReader --- cdm_reader_mapper/operations/select.py | 82 +------------------------- 1 file changed, 2 insertions(+), 80 deletions(-) diff --git a/cdm_reader_mapper/operations/select.py b/cdm_reader_mapper/operations/select.py index f7a3a2f0..ee31949a 100755 --- a/cdm_reader_mapper/operations/select.py +++ b/cdm_reader_mapper/operations/select.py @@ -8,10 +8,6 @@ """ from __future__ import annotations -from io import StringIO - -import pandas as pd - # Need to define a general thing for the parser() functions, like we did with # the dataframe_apply_index(), because they are all the same but for the # selection applied!!!!! @@ -99,57 +95,10 @@ def dataframe( idx_out_offset=idx_out_offset, ) - def parser(data_parser, col, values, out_rejected=False, in_index=False): - read_params = [ - "chunksize", - "names", - "dtype", - "parse_dates", - "date_parser", - "infer_datetime_format", - ] - read_dict = {x: data_parser.orig_options.get(x) for x in read_params} - in_buffer = StringIO() - if out_rejected: - out_buffer = StringIO() - if in_index: - index = [] - idx_in_offset = 0 - idx_out_offset = 0 - for df in data_parser: - o = dataframe( - df, - col, - values, - out_rejected=out_rejected, - in_index=in_index, - idx_in_offset=idx_in_offset, - idx_out_offset=idx_out_offset, - ) - o[0].to_csv(in_buffer, header=False, index=False, mode="a") - if out_rejected: - o[1].to_csv(out_buffer, header=False, index=False, mode="a") - idx_out_offset += len(o[1]) - if in_index and not out_rejected: - index.extend(o[1]) - if in_index and out_rejected: - index.extend(o[2]) - idx_in_offset += len(o[0]) - - in_buffer.seek(0) - output = [pd.read_csv(in_buffer, **read_dict)] - if out_rejected: - out_buffer.seek(0) - output.append(pd.read_csv(out_buffer, **read_dict)) - if in_index: - output.append(index) - - return output - col = list(selection.keys())[0] values = list(selection.values())[0] - output = parser(data, col, values, out_rejected=out_rejected, in_index=in_index) + output = dataframe(data, col, values, out_rejected=out_rejected, in_index=in_index) if len(output) > 1: return output @@ -170,34 +119,7 @@ def dataframe(df, index, out_rejected=False, idx_in_offset=0, idx_out_offset=0): idx_out_offset=idx_out_offset, ) - def parser(data_parser, index, out_rejected=False): - read_params = [ - "chunksize", - "names", - "dtype", - "parse_dates", - "date_parser", - "infer_datetime_format", - ] - read_dict = {x: data_parser.orig_options.get(x) for x in read_params} - in_buffer = StringIO() - if out_rejected: - out_buffer = StringIO() - - for df in data_parser: - o = dataframe(df, index, out_rejected=out_rejected) - o[0].to_csv(in_buffer, header=False, index=False, mode="a") - if out_rejected: - o[1].to_csv(out_buffer, header=False, index=False, mode="a") - - in_buffer.seek(0) - output = [pd.read_csv(in_buffer, **read_dict)] - if out_rejected: - out_buffer.seek(0) - output.append(pd.read_csv(out_buffer, **read_dict)) - return output - - output = parser(data, index, out_rejected=out_rejected) + output = dataframe(data, index, out_rejected=out_rejected) if len(output) > 1: return output From 70f2e256b43e68c481aec279b4da2a76d07a36c1 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 10 Apr 2024 09:45:08 +0200 Subject: [PATCH 11/68] stop testing with TextFileReader objects --- tests/_data.py | 1 - tests/test_operations.py | 32 +------------------------------- 2 files changed, 1 insertion(+), 32 deletions(-) diff --git a/tests/_data.py b/tests/_data.py index ed26dc3e..09c4668f 100755 --- a/tests/_data.py +++ b/tests/_data.py @@ -13,4 +13,3 @@ def _read_data(**kwargs): data_dict = test_data.test_063_714 data_df, attrs_df, mask_df = _read_data(**data_dict) -data_pa, attrs_pa, mask_pa = _read_data(chunksize=10000, **data_dict) diff --git a/tests/test_operations.py b/tests/test_operations.py index 28400e8b..b69e1cb7 100755 --- a/tests/test_operations.py +++ b/tests/test_operations.py @@ -5,7 +5,7 @@ from cdm_reader_mapper.operations import corrections, inspect, replace, select -from ._data import attrs_df, attrs_pa, data_df, data_pa, mask_df, mask_pa +from ._data import attrs_df, data_df, mask_df from ._results import correction_df, table_df # from _data import attrs_df, attrs_pa, data_df, data_pa, mask_df, mask_pa @@ -16,59 +16,29 @@ def test_select_true_pandas(): select.select_true(data_df, mask_df, out_rejected=True) -def test_select_true_parser(): - select.select_true(data_pa, mask_pa, out_rejected=True) - - def test_select_from_list_pandas(): selection = {("c1", "PT"): ["7", "6"]} select.select_from_list(data_df, selection, out_rejected=True, in_index=True) -def test_select_from_list_parser(): - selection = {("c1", "PT"): ["7", "6"]} - select.select_from_list(data_pa, selection, out_rejected=True, in_index=True) - - def test_select_from_index_pandas(): select.select_from_index(data_df, [0, 1, 2, 3, 4, 5]) -def test_select_from_index_parser(): - select.select_from_index(data_pa, [0, 1, 2, 3, 4, 5]) - - def test_inspect_get_length_pandas(): inspect.get_length(data_df) -def test_inspect_get_length_parser(): - inspect.get_length(data_pa) - - def test_inspect_count_by_ca_pandas(): inspect.count_by_cat(data_df, ("c1", "PT")) -def test_inspect_count_by_cat_parser(): - inspect.count_by_cat(data_pa, ("c1", "PT")) - - def test_corrections_pandas(): corrections.corrections( data_df, dataset="test_data", correction_path=".", yr="2010", mo="07" ) -# test_corrections_pandas() - - -def test_corrections_parser(): - corrections.corrections( - data_pa, dataset="test_data", correction_path=".", yr="2010", mo="07" - ) - - def test_replace(): replace.replace_columns( table_df, From 22aef16d7bf69425858a7bc9e9544b360888db73 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 10 Apr 2024 09:52:30 +0200 Subject: [PATCH 12/68] delete packages for testing TextFileReader objects --- tests/_testing_cdm_suite.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tests/_testing_cdm_suite.py b/tests/_testing_cdm_suite.py index 01456fb7..dd0828ed 100755 --- a/tests/_testing_cdm_suite.py +++ b/tests/_testing_cdm_suite.py @@ -6,7 +6,6 @@ from cdm_reader_mapper import cdm_mapper, mdf_reader from cdm_reader_mapper.cdm_mapper import read_tables -from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy from cdm_reader_mapper.metmetpy import ( correct_datetime, correct_pt, @@ -108,14 +107,8 @@ def _testing_suite( deck=deck, ) - if not isinstance(data, pd.DataFrame): - data_pd = make_copy(data).read() - else: - data_pd = data.copy() - if not isinstance(mask, pd.DataFrame): - mask_pd = make_copy(mask).read() - else: - mask_pd = mask.copy() + data_pd = data.copy() + mask_pd = mask.copy() val_dt = validate_datetime.validate( data=data_pd, From d0dbbe40fe560673b29fc273cb6202b181d59758 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Jun 2024 13:15:23 +0000 Subject: [PATCH 13/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cdm_reader_mapper/mdf_reader/utils/auxiliary.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py index 262abb21..9305e125 100755 --- a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py +++ b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py @@ -508,7 +508,6 @@ def _open_data( ) return df - def _convert_and_decode_df( self, df, From ba851ed65240d4d3cce6af1192d49e9f5dec31a6 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 20 Jun 2024 15:23:14 +0200 Subject: [PATCH 14/68] delete correction parser test --- tests/test_operations.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/test_operations.py b/tests/test_operations.py index 54c4cdea..7efa7554 100755 --- a/tests/test_operations.py +++ b/tests/test_operations.py @@ -8,9 +8,6 @@ from ._data import attrs_df, data_df, mask_df from ._results import correction_df, table_df -# from _data import attrs_df, attrs_pa, data_df, data_pa, mask_df, mask_pa -# from _results import correction_df, table_df - def test_select_true_pandas(): select.select_true(data_df, mask_df, out_rejected=True) @@ -39,12 +36,6 @@ def test_corrections_pandas(): ) -def test_corrections_parser(): - corrections.corrections( - data_pa, dataset="test_data", correction_path=".", yr="2010", mo="07" - ) - - def test_replace(): replace.replace_columns( table_df, From c00812a79f71c0669bfee01547f636100efb3271 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 21 Jun 2024 12:29:33 +0200 Subject: [PATCH 15/68] fix with main branch --- cdm_reader_mapper/mdf_reader/read.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/read.py b/cdm_reader_mapper/mdf_reader/read.py index 9240116f..7132ca90 100755 --- a/cdm_reader_mapper/mdf_reader/read.py +++ b/cdm_reader_mapper/mdf_reader/read.py @@ -79,13 +79,14 @@ def convert_and_decode_entries( decoder_dict = {} dtype = self.configurations["convert_decode"]["dtype"] - self.data = self._convert_and_decode_df( - self.data, - converter_dict, - converter_kwargs, - decoder_dict, + dtype = self._adjust_dtype(dtype, self.data) + data = self._convert_and_decode_df( + self.data, + converter_dict, + converter_kwargs, + decoder_dict, ) - self.data = self.data.astype(dtype) + self.data = data.astype(dtype) return self def validate_entries( @@ -95,7 +96,7 @@ def validate_entries( Fill attribute `valid` with boolean mask. """ - self.mask = self._validate_df(self.data) + self.mask = self._validate_df(self.data, isna=self.isna) return self def read( @@ -161,10 +162,11 @@ def read( # a list with a single dataframe logging.info("Getting data string from source...") self.configurations = self._get_configurations(read_sections_list, sections) - self.data = self._open_data( + self.data, self.isna = self._open_data( read_sections_list, sections, - open_with=properties.open_file[self.imodel], + # INFO: Set default as "pandas" to account for custom schema + open_with=properties.open_file.get(self.imodel, "pandas"), ) ## 2.3. Extract, read and validate data in same loop From 9f38d7bcedebe9cc87934a2c0e8867e16ddf20c7 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 21 Jun 2024 12:30:53 +0200 Subject: [PATCH 16/68] get rid of TextFiTextFileReader loop --- .../mdf_reader/utils/auxiliary.py | 46 +++++++++---------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py index 9305e125..047ae36c 100755 --- a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py +++ b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py @@ -506,7 +506,7 @@ def _open_data( df, self.missings = self._read_sections( TextParser, order, valid, open_with=open_with ) - return df + return df, df.isna() def _convert_and_decode_df( self, @@ -556,33 +556,29 @@ def _validate_df(self, df, isna=None): def _dump_atts(self, out_atts, out_path): """Dump attributes to atts.json.""" - data = [self.data] - valid = [self.mask] + data_df = self.data.copy() + valid_df = self.mask.copy() logging.info(f"WRITING DATA TO FILES IN: {out_path}") - for i, (data_df, valid_df) in enumerate(zip(data, valid)): - header = False - mode = "a" - if i == 0: - mode = "w" - cols = [x for x in data_df] - if isinstance(cols[0], tuple): - header = [":".join(x) for x in cols] - out_atts_json = { - ":".join(x): out_atts.get(x) for x in out_atts.keys() - } - else: - header = cols - out_atts_json = out_atts - kwargs = { - "header": header, - "mode": mode, + cols = [x for x in data_df] + if isinstance(cols[0], tuple): + header = [":".join(x) for x in cols] + out_atts_json = { + ":".join(x): out_atts.get(x) for x in out_atts.keys() + } + else: + header = cols + out_atts_json = out_atts + + kwargs = { + "header": False, + "mode": "w", "encoding": "utf-8", "index": True, "index_label": "index", "escapechar": "\0", - } - data_df.to_csv(os.path.join(out_path, "data.csv"), **kwargs) - valid_df.to_csv(os.path.join(out_path, "mask.csv"), **kwargs) + } + data_df.to_csv(os.path.join(out_path, "data.csv"), **kwargs) + valid_df.to_csv(os.path.join(out_path, "mask.csv"), **kwargs) - with open(os.path.join(out_path, "atts.json"), "w") as fileObj: - json.dump(out_atts_json, fileObj, indent=4) + with open(os.path.join(out_path, "atts.json"), "w") as fileObj: + json.dump(out_atts_json, fileObj, indent=4) From 22a92d80deee0707af51d502ab35e7fd81dee76d Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 21 Jun 2024 12:31:16 +0200 Subject: [PATCH 17/68] simplify --- cdm_reader_mapper/operations/corrections.py | 5 ++--- cdm_reader_mapper/operations/select.py | 24 ++++++--------------- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/cdm_reader_mapper/operations/corrections.py b/cdm_reader_mapper/operations/corrections.py index 1e6832d3..f853e40b 100755 --- a/cdm_reader_mapper/operations/corrections.py +++ b/cdm_reader_mapper/operations/corrections.py @@ -220,9 +220,8 @@ def corrections(data, dataset, correction_path, yr, mo): "timestamp", ]: os.makedirs(os.path.join(correction_path, f), exist_ok=True) - if isinstance(data, pd.DataFrame): - gen_files(data.copy(), dataset, correction_path, yr, mo) - return + + gen_files(data.copy(), dataset, correction_path, yr, mo) def split_list(n): diff --git a/cdm_reader_mapper/operations/select.py b/cdm_reader_mapper/operations/select.py index da9e80e6..ad32f45b 100755 --- a/cdm_reader_mapper/operations/select.py +++ b/cdm_reader_mapper/operations/select.py @@ -8,6 +8,7 @@ """ from __future__ import annotations + # Need to define a general thing for the parser() functions, like we did with # the dataframe_apply_index(), because they are all the same but for the # selection applied!!!!! @@ -61,12 +62,7 @@ def dataframe( idx_out_offset=idx_out_offset, ) - output = dataframe(data, mask, out_rejected=out_rejected, in_index=in_index) - - if len(output) > 1: - return output - else: - return output[0] + return dataframe(data, mask, out_rejected=out_rejected, in_index=in_index) def select_from_list(data, selection, out_rejected=False, in_index=False): @@ -96,13 +92,10 @@ def dataframe( col = list(selection.keys())[0] values = list(selection.values())[0] + return dataframe( + data, col, values, out_rejected=out_rejected, in_index=in_index + ) - output = dataframe(data, col, values, out_rejected=out_rejected, in_index=in_index) - - if len(output) > 1: - return output - else: - return output[0] def select_from_index(data, index, out_rejected=False): @@ -118,9 +111,4 @@ def dataframe(df, index, out_rejected=False, idx_in_offset=0, idx_out_offset=0): idx_out_offset=idx_out_offset, ) - output = dataframe(data, index, out_rejected=out_rejected) - - if len(output) > 1: - return output - else: - return output[0] + return dataframe(data, index, out_rejected=out_rejected) From d3baecf86e0bc2e1540cbb36292ee1da19b951a6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Jun 2024 10:31:56 +0000 Subject: [PATCH 18/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cdm_reader_mapper/mdf_reader/read.py | 8 ++++---- .../mdf_reader/utils/auxiliary.py | 18 ++++++++---------- cdm_reader_mapper/operations/select.py | 6 +----- 3 files changed, 13 insertions(+), 19 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/read.py b/cdm_reader_mapper/mdf_reader/read.py index 7132ca90..54657fde 100755 --- a/cdm_reader_mapper/mdf_reader/read.py +++ b/cdm_reader_mapper/mdf_reader/read.py @@ -81,10 +81,10 @@ def convert_and_decode_entries( dtype = self.configurations["convert_decode"]["dtype"] dtype = self._adjust_dtype(dtype, self.data) data = self._convert_and_decode_df( - self.data, - converter_dict, - converter_kwargs, - decoder_dict, + self.data, + converter_dict, + converter_kwargs, + decoder_dict, ) self.data = data.astype(dtype) return self diff --git a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py index 047ae36c..ec9ecd15 100755 --- a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py +++ b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py @@ -562,20 +562,18 @@ def _dump_atts(self, out_atts, out_path): cols = [x for x in data_df] if isinstance(cols[0], tuple): header = [":".join(x) for x in cols] - out_atts_json = { - ":".join(x): out_atts.get(x) for x in out_atts.keys() - } + out_atts_json = {":".join(x): out_atts.get(x) for x in out_atts.keys()} else: header = cols out_atts_json = out_atts - + kwargs = { - "header": False, - "mode": "w", - "encoding": "utf-8", - "index": True, - "index_label": "index", - "escapechar": "\0", + "header": False, + "mode": "w", + "encoding": "utf-8", + "index": True, + "index_label": "index", + "escapechar": "\0", } data_df.to_csv(os.path.join(out_path, "data.csv"), **kwargs) valid_df.to_csv(os.path.join(out_path, "mask.csv"), **kwargs) diff --git a/cdm_reader_mapper/operations/select.py b/cdm_reader_mapper/operations/select.py index ad32f45b..eec12649 100755 --- a/cdm_reader_mapper/operations/select.py +++ b/cdm_reader_mapper/operations/select.py @@ -8,7 +8,6 @@ """ from __future__ import annotations - # Need to define a general thing for the parser() functions, like we did with # the dataframe_apply_index(), because they are all the same but for the # selection applied!!!!! @@ -92,10 +91,7 @@ def dataframe( col = list(selection.keys())[0] values = list(selection.values())[0] - return dataframe( - data, col, values, out_rejected=out_rejected, in_index=in_index - ) - + return dataframe(data, col, values, out_rejected=out_rejected, in_index=in_index) def select_from_index(data, index, out_rejected=False): From 7fef4f44e6aa433154943f258ac561b731ebef08 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 21 Jun 2024 12:36:05 +0200 Subject: [PATCH 19/68] use header information --- cdm_reader_mapper/mdf_reader/utils/auxiliary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py index ec9ecd15..30effad7 100755 --- a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py +++ b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py @@ -568,7 +568,7 @@ def _dump_atts(self, out_atts, out_path): out_atts_json = out_atts kwargs = { - "header": False, + "header": header, "mode": "w", "encoding": "utf-8", "index": True, From 7b2bad7ec29c9e4585b8d6b9f0df5c3f6abbbdcf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Jun 2024 08:34:26 +0000 Subject: [PATCH 20/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cdm_reader_mapper/operations/select.py | 1 + tests/test_operations.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/cdm_reader_mapper/operations/select.py b/cdm_reader_mapper/operations/select.py index 48d03726..eec12649 100755 --- a/cdm_reader_mapper/operations/select.py +++ b/cdm_reader_mapper/operations/select.py @@ -63,6 +63,7 @@ def dataframe( return dataframe(data, mask, out_rejected=out_rejected, in_index=in_index) + def select_from_list(data, selection, out_rejected=False, in_index=False): """DOCUMENTATION.""" diff --git a/tests/test_operations.py b/tests/test_operations.py index ae17ea63..7efa7554 100755 --- a/tests/test_operations.py +++ b/tests/test_operations.py @@ -6,7 +6,6 @@ from cdm_reader_mapper.operations import corrections, inspect, replace, select from ._data import attrs_df, data_df, mask_df - from ._results import correction_df, table_df From d5fb27ebed659e5d44df697130ce95de9449a97a Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 26 Jun 2024 12:31:50 +0200 Subject: [PATCH 21/68] remove unused import --- tests/test_operations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_operations.py b/tests/test_operations.py index 7efa7554..dff640c4 100755 --- a/tests/test_operations.py +++ b/tests/test_operations.py @@ -5,7 +5,7 @@ from cdm_reader_mapper.operations import corrections, inspect, replace, select -from ._data import attrs_df, data_df, mask_df +from ._data import data_df, mask_df from ._results import correction_df, table_df From dba22655cb6b547962a9d74e2ee260ce1f0178db Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 26 Jun 2024 13:39:07 +0200 Subject: [PATCH 22/68] set default encoding to utf-8 --- cdm_reader_mapper/mdf_reader/utils/auxiliary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py index 1a065aff..6e1a37ca 100755 --- a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py +++ b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py @@ -502,7 +502,7 @@ def _open_data( TextParser = self._read_netcdf() elif open_with == "pandas": TextParser = self._read_pandas( - encoding=self.schema["header"].get("encoding"), + encoding=self.schema["header"].get("encoding", "utf-8"), widths=[properties.MAX_FULL_REPORT_WIDTH], skiprows=self.skiprows, ) From 7475d083a4472000cc957b9eed62e766f56f29ee Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 26 Jun 2024 14:35:05 +0200 Subject: [PATCH 23/68] remove StringIO import --- cdm_reader_mapper/mdf_reader/read.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/read.py b/cdm_reader_mapper/mdf_reader/read.py index 0b17b249..0a6fe48a 100755 --- a/cdm_reader_mapper/mdf_reader/read.py +++ b/cdm_reader_mapper/mdf_reader/read.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -from io import StringIO as StringIO import pandas as pd @@ -28,11 +27,11 @@ class MDFFileReader(_FileReader): Attributes ---------- data : pd.DataFrame - a pandas.DataFrame with the output data + a pd.DataFrame with the output data attrs : dict a dictionary with the output data elements attributes mask : pd.DataFrame - a pandas.DataFrame with the output data validation mask + a pd.DataFrame with the output data validation mask """ def __init__(self, *args, **kwargs): @@ -155,7 +154,7 @@ def read( sections = read_sections_list # 2.2 Homogenize input data to an iterable with dataframes: - # a list with a single dataframe or a pd.io.parsers.TextFileReader + # a list with a single dataframe logging.info("Getting data string from source...") self.configurations = self._get_configurations(read_sections_list, sections) From 86c7f6ca0f5ffdad1d6cdaf15356ad2707dfe7e1 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 26 Jun 2024 15:21:46 +0200 Subject: [PATCH 24/68] if statement adjustment --- cdm_reader_mapper/mdf_reader/utils/auxiliary.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py index 6e1a37ca..c53ef6ca 100755 --- a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py +++ b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py @@ -103,9 +103,8 @@ def __init__( self.valid = valid self.schema = schema self.str_line = "" - if isinstance(df, pd.Series) or isinstance(df, pd.DataFrame): - if len(df) > 0: - self.str_line = df.iloc[0] + if len(df) > 0 and hasattr(df, "iloc"): + self.str_line = df.iloc[0] def _add_field_length(self, index): if "field_length" in self.sections_dict.keys(): From a9a6183dadebbfb1167a2eac34174e5c19ab2b74 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 27 Jun 2024 10:37:27 +0200 Subject: [PATCH 25/68] remove blank lines --- cdm_reader_mapper/mdf_reader/read.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/read.py b/cdm_reader_mapper/mdf_reader/read.py index 0a6fe48a..26c688df 100755 --- a/cdm_reader_mapper/mdf_reader/read.py +++ b/cdm_reader_mapper/mdf_reader/read.py @@ -155,7 +155,6 @@ def read( # 2.2 Homogenize input data to an iterable with dataframes: # a list with a single dataframe - logging.info("Getting data string from source...") self.configurations = self._get_configurations(read_sections_list, sections) self.data, self.isna = self._open_data( @@ -166,8 +165,7 @@ def read( ) ## 2.3. Extract, read and validate data in same loop - # logging.info("Extracting and reading sections") - + logging.info("Extracting and reading sections") if convert or decode: self.convert_and_decode_entries( convert=convert, From c126c3568faabf77298d632c4de377e402e09ed6 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 27 Jun 2024 10:37:48 +0200 Subject: [PATCH 26/68] gather duplicated lines --- cdm_reader_mapper/mdf_reader/utils/auxiliary.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py index c53ef6ca..a73ebfd9 100755 --- a/cdm_reader_mapper/mdf_reader/utils/auxiliary.py +++ b/cdm_reader_mapper/mdf_reader/utils/auxiliary.py @@ -379,16 +379,15 @@ def __init__( # Schema reader will return empty if cannot read schema or is not valid # and will log the corresponding error # multiple_reports_per_line error also while reading schema + logging.info("READING DATA MODEL SCHEMA FILE...") if self.data_model: model_path = f"{properties._base}.code_tables.{self.data_model}" self.code_tables_path = get_files(model_path) self.imodel = data_model - logging.info("READING DATA MODEL SCHEMA FILE...") self.schema = schemas.read_schema(schema_name=data_model) else: self.code_tables_path = os.path.join(data_model_path, "code_tables") self.imodel = data_model_path - logging.info("READING DATA MODEL SCHEMA FILE...") self.schema = schemas.read_schema(ext_schema_path=data_model_path) def _adjust_dtype(self, dtype, df): From a0dba699f4ea4535fdb1e6fb921a6cea8d6e76e4 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 27 Jun 2024 10:38:12 +0200 Subject: [PATCH 27/68] remove if statement which is always True --- cdm_reader_mapper/cdm_mapper/table_writer.py | 25 +++++++------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/cdm_reader_mapper/cdm_mapper/table_writer.py b/cdm_reader_mapper/cdm_mapper/table_writer.py index 43b3a607..61a46163 100755 --- a/cdm_reader_mapper/cdm_mapper/table_writer.py +++ b/cdm_reader_mapper/cdm_mapper/table_writer.py @@ -186,14 +186,10 @@ def print_integer_array_i(row, null_label=None): ------- data: int """ - if row == row: - row = eval(row) - row = row if isinstance(row, list) else [row] - string = ",".join(filter(bool, [str(int(x)) for x in row if np.isfinite(x)])) - if len(string) > 0: - return "{" + string + "}" - else: - return null_label + row = row if isinstance(row, list) else [row] + string = ",".join(filter(bool, [str(int(x)) for x in row if np.isfinite(x)])) + if len(string) > 0: + return "{" + string + "}" else: return null_label @@ -211,14 +207,11 @@ def print_varchar_array_i(row, null_label=None): ------- data: varchar """ - if row == row: - row = eval(row) - row = row if isinstance(row, list) else [row] - string = ",".join(filter(bool, row)) - if len(string) > 0: - return "{" + string + "}" - else: - return null_label + row = eval(row) + row = row if isinstance(row, list) else [row] + string = ",".join(filter(bool, row)) + if len(string) > 0: + return "{" + string + "}" else: return null_label From db6745778a04f410a72939c8edaf17d19e34ab30 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 27 Jun 2024 10:39:10 +0200 Subject: [PATCH 28/68] do not use StringIO buffer --- cdm_reader_mapper/cdm_mapper/mapper.py | 67 ++++++++------------------ 1 file changed, 20 insertions(+), 47 deletions(-) diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index 33d13177..c515fe98 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -11,8 +11,6 @@ """ from __future__ import annotations -from io import StringIO - import numpy as np import pandas as pd @@ -68,37 +66,32 @@ def _mapping_type(elements, data_atts): return m_type -def _decimal_places( - cdm_tables, decimal_places, cdm_key, table, imodel_functions, elements -): +def _decimal_places(atts, decimal_places, cdm_key, imodel_functions, elements): if decimal_places is not None: if isinstance(decimal_places, int): - cdm_tables[table]["atts"][cdm_key].update( - {"decimal_places": decimal_places} - ) + atts[cdm_key].update({"decimal_places": decimal_places}) else: - cdm_tables[table]["atts"][cdm_key].update( + atts[cdm_key].update( {"decimal_places": getattr(imodel_functions, decimal_places)(elements)} ) - return cdm_tables + return atts def _write_csv_files( idata, mapping, logger, - table, cols, data_atts, imodel_functions, imodel_code_tables, - cdm_tables, + atts, out_dtypes, ): table_df_i = pd.DataFrame( index=idata.index, columns=mapping.keys() ) # We cannot predifine column based dtypes here! - logger.debug(f"Table: {table}") + # logger.debug(f"Table: {table}") for cdm_key, imapping in mapping.items(): logger.debug(f"\tElement: {cdm_key}") isEmpty = False @@ -172,36 +165,36 @@ def _write_csv_files( if fill_value is not None: table_df_i[cdm_key] = table_df_i[cdm_key].fillna(value=fill_value) - cdm_tables = _decimal_places( - cdm_tables, decimal_places, cdm_key, table, imodel_functions, elements + atts = _decimal_places( + atts, decimal_places, cdm_key, imodel_functions, elements ) # think that NaN also casts floats to float64....!keep floats of lower precision to its original one # will convert all NaN to object type! # but also some numerics with values, like imma observation-value (temperatures), # are being returned as objects!!! pero esto qué es? - out_dtypes[table].update( + out_dtypes.update( { i: table_df_i[i].dtype for i in table_df_i if table_df_i[i].dtype in properties.numpy_floats - and out_dtypes[table].get(i) not in properties.numpy_floats + and out_dtypes.get(i) not in properties.numpy_floats } ) - out_dtypes[table].update( + out_dtypes.update( { i: table_df_i[i].dtype for i in table_df_i if table_df_i[i].dtype == "object" - and out_dtypes[table].get(i) not in properties.numpy_floats + and out_dtypes.get(i) not in properties.numpy_floats } ) if "observation_value" in table_df_i: table_df_i = table_df_i.dropna(subset=["observation_value"]) table_df_i = drop_duplicates(table_df_i) - table_df_i.to_csv(cdm_tables[table]["buffer"], header=False, index=False, mode="a") - return cdm_tables + table_df_i = table_df_i.astype(dtype=out_dtypes) + return table_df_i, atts def _map(imodel, data, data_atts, cdm_subset=None, codes_subset=None, log_level="INFO"): @@ -288,9 +281,7 @@ def _map(imodel, data, data_atts, cdm_subset=None, codes_subset=None, log_level= ) return # Initialize dictionary to store temporal tables (buffer) and table attributes - cdm_tables = { - k: {"buffer": StringIO(), "atts": cdm_atts.get(k)} for k in imodel_maps.keys() - } + cdm_tables = {k: {"atts": cdm_atts.get(k)} for k in imodel_maps.keys()} # Create pandas data types for buffer reading from CDM table definition pseudo-sql dtypes # Also keep track of datetime columns for reader to parse date_columns = {x: [] for x in imodel_maps.keys()} @@ -315,40 +306,22 @@ def _map(imodel, data, data_atts, cdm_subset=None, codes_subset=None, log_level= for k, v in out_dtypes[table].items() } ) + # Now map per iterable item, per table for idata in data: cols = [x for x in idata] for table, mapping in imodel_maps.items(): - cdm_tables = _write_csv_files( + cdm_tables[table]["data"], cdm_tables[table]["atts"] = _write_csv_files( idata, mapping, logger, - table, cols, data_atts, imodel_functions, imodel_code_tables, - cdm_tables, - out_dtypes, + cdm_tables[table]["atts"], + out_dtypes[table], ) - - for table in cdm_tables.keys(): - # Convert dtime to object to be parsed by the reader - logger.debug( - f"\tParse datetime by reader; Table: {table}; Columns: {date_columns[table]}" - ) - logger.debug( - f"\tParse datetime by reader; out_dtype-keys: {out_dtypes[table].keys()}; out dtypes: {out_dtypes[table]}" - ) - cdm_tables[table]["buffer"].seek(0) - cdm_tables[table]["data"] = pd.read_csv( - cdm_tables[table]["buffer"], - names=out_dtypes[table].keys(), - dtype=out_dtypes[table], - parse_dates=date_columns[table], - ) - cdm_tables[table]["buffer"].close() - cdm_tables[table].pop("buffer") return cdm_tables @@ -396,7 +369,7 @@ def map_model( # Make sure data is an iterable: this is to homogenize how we handle # dataframes and textreaders if isinstance(data, pd.DataFrame): - logger.debug("Input data is a pd.DataFrame") + # logger.debug("Input data is a pd.DataFrame") if len(data) == 0: logger.error("Input data is empty") return From 3f6f69cd1ba9a2ed6a8c8e1e6139633d828275bb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 2 Jul 2024 08:30:20 +0000 Subject: [PATCH 29/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cdm_reader_mapper/__init__.py | 1 + cdm_reader_mapper/cdm_mapper/__init__.py | 1 + cdm_reader_mapper/cdm_mapper/codes/codes_hdlr.py | 1 + cdm_reader_mapper/cdm_mapper/mapper.py | 1 + cdm_reader_mapper/cdm_mapper/mappings.py | 1 + cdm_reader_mapper/common/getting_files.py | 1 + cdm_reader_mapper/mdf_reader/schema/schemas.py | 6 +++--- cdm_reader_mapper/mdf_reader/utils/__init__.py | 1 + cdm_reader_mapper/metmetpy/__init__.py | 1 + cdm_reader_mapper/metmetpy/datetime/validate.py | 1 + cdm_reader_mapper/metmetpy/platform_type/correct.py | 1 + cdm_reader_mapper/metmetpy/platform_type/gdac_r0000.py | 1 - cdm_reader_mapper/metmetpy/properties.py | 1 + cdm_reader_mapper/operations/__init__.py | 1 + cdm_reader_mapper/operations/corrections.py | 1 + cdm_reader_mapper/operations/replace.py | 1 - tests/_data.py | 1 + tests/_results.py | 1 + 18 files changed, 18 insertions(+), 5 deletions(-) diff --git a/cdm_reader_mapper/__init__.py b/cdm_reader_mapper/__init__.py index b63a623f..25aeade5 100755 --- a/cdm_reader_mapper/__init__.py +++ b/cdm_reader_mapper/__init__.py @@ -1,4 +1,5 @@ """Common Data Model (CDM) reader and mapper package.""" + from __future__ import annotations from . import cdm_mapper # noqa diff --git a/cdm_reader_mapper/cdm_mapper/__init__.py b/cdm_reader_mapper/cdm_mapper/__init__.py index b9bc36ca..41c351ed 100755 --- a/cdm_reader_mapper/cdm_mapper/__init__.py +++ b/cdm_reader_mapper/cdm_mapper/__init__.py @@ -1,4 +1,5 @@ """Cliamte Data Model (CDM) mapper package.""" + from __future__ import annotations from .mapper import map_model # noqa diff --git a/cdm_reader_mapper/cdm_mapper/codes/codes_hdlr.py b/cdm_reader_mapper/cdm_mapper/codes/codes_hdlr.py index 11e6a230..453dc0f3 100755 --- a/cdm_reader_mapper/cdm_mapper/codes/codes_hdlr.py +++ b/cdm_reader_mapper/cdm_mapper/codes/codes_hdlr.py @@ -6,6 +6,7 @@ @author: iregon """ + from __future__ import annotations import datetime diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index c515fe98..8c2bfee1 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -9,6 +9,7 @@ @author: iregon """ + from __future__ import annotations import numpy as np diff --git a/cdm_reader_mapper/cdm_mapper/mappings.py b/cdm_reader_mapper/cdm_mapper/mappings.py index 2bddcffb..fc3dfbca 100755 --- a/cdm_reader_mapper/cdm_mapper/mappings.py +++ b/cdm_reader_mapper/cdm_mapper/mappings.py @@ -21,6 +21,7 @@ @author: iregon """ + from __future__ import annotations import datetime diff --git a/cdm_reader_mapper/common/getting_files.py b/cdm_reader_mapper/common/getting_files.py index 38c8dd5f..bde71483 100755 --- a/cdm_reader_mapper/common/getting_files.py +++ b/cdm_reader_mapper/common/getting_files.py @@ -1,4 +1,5 @@ """pandas local file operator.""" + from __future__ import annotations import hashlib diff --git a/cdm_reader_mapper/mdf_reader/schema/schemas.py b/cdm_reader_mapper/mdf_reader/schema/schemas.py index 01c1d186..5e301608 100755 --- a/cdm_reader_mapper/mdf_reader/schema/schemas.py +++ b/cdm_reader_mapper/mdf_reader/schema/schemas.py @@ -48,9 +48,9 @@ def _read_schema(schema, schema_file=""): "header" ].get("delimiter") schema["header"].pop("delimiter", None) - schema["sections"][properties.dummy_level]["header"][ - "field_layout" - ] = schema["header"].get("field_layout") + schema["sections"][properties.dummy_level]["header"]["field_layout"] = ( + schema["header"].get("field_layout") + ) schema["header"].pop("field_layout", None) schema["sections"][properties.dummy_level]["header"]["format"] = schema[ "header" diff --git a/cdm_reader_mapper/mdf_reader/utils/__init__.py b/cdm_reader_mapper/mdf_reader/utils/__init__.py index 3f42c703..015b78b8 100755 --- a/cdm_reader_mapper/mdf_reader/utils/__init__.py +++ b/cdm_reader_mapper/mdf_reader/utils/__init__.py @@ -1,4 +1,5 @@ """Common Data Model (CDM) reader utilities.""" + from __future__ import annotations from .converters import converters # noqa diff --git a/cdm_reader_mapper/metmetpy/__init__.py b/cdm_reader_mapper/metmetpy/__init__.py index 7413d296..6e32be1b 100755 --- a/cdm_reader_mapper/metmetpy/__init__.py +++ b/cdm_reader_mapper/metmetpy/__init__.py @@ -1,4 +1,5 @@ """metmetpy information package.""" + from __future__ import annotations from . import properties # noqa diff --git a/cdm_reader_mapper/metmetpy/datetime/validate.py b/cdm_reader_mapper/metmetpy/datetime/validate.py index 7f9c10d5..e32aabcc 100755 --- a/cdm_reader_mapper/metmetpy/datetime/validate.py +++ b/cdm_reader_mapper/metmetpy/datetime/validate.py @@ -29,6 +29,7 @@ @author: iregon """ + from __future__ import annotations import pandas as pd diff --git a/cdm_reader_mapper/metmetpy/platform_type/correct.py b/cdm_reader_mapper/metmetpy/platform_type/correct.py index f5aa657f..1984c9f9 100755 --- a/cdm_reader_mapper/metmetpy/platform_type/correct.py +++ b/cdm_reader_mapper/metmetpy/platform_type/correct.py @@ -36,6 +36,7 @@ @author: iregon """ + from __future__ import annotations import json diff --git a/cdm_reader_mapper/metmetpy/platform_type/gdac_r0000.py b/cdm_reader_mapper/metmetpy/platform_type/gdac_r0000.py index 6a946a07..9c55f67e 100755 --- a/cdm_reader_mapper/metmetpy/platform_type/gdac_r0000.py +++ b/cdm_reader_mapper/metmetpy/platform_type/gdac_r0000.py @@ -6,7 +6,6 @@ @author: sbiri """ - from __future__ import annotations import numpy as np diff --git a/cdm_reader_mapper/metmetpy/properties.py b/cdm_reader_mapper/metmetpy/properties.py index 09110221..4e2f2699 100755 --- a/cdm_reader_mapper/metmetpy/properties.py +++ b/cdm_reader_mapper/metmetpy/properties.py @@ -5,6 +5,7 @@ @author: iregon """ + _base = "cdm_reader_mapper.metmetpy" metadata_datamodels = {} diff --git a/cdm_reader_mapper/operations/__init__.py b/cdm_reader_mapper/operations/__init__.py index 7b378224..a06d4606 100755 --- a/cdm_reader_mapper/operations/__init__.py +++ b/cdm_reader_mapper/operations/__init__.py @@ -1,4 +1,5 @@ """Common Data Model (CDM) pandas operators.""" + from __future__ import annotations from . import corrections # noqa diff --git a/cdm_reader_mapper/operations/corrections.py b/cdm_reader_mapper/operations/corrections.py index 240331aa..e0752d81 100755 --- a/cdm_reader_mapper/operations/corrections.py +++ b/cdm_reader_mapper/operations/corrections.py @@ -5,6 +5,7 @@ @author: sbiri """ + from __future__ import annotations import logging diff --git a/cdm_reader_mapper/operations/replace.py b/cdm_reader_mapper/operations/replace.py index 221b5360..fffec9e9 100755 --- a/cdm_reader_mapper/operations/replace.py +++ b/cdm_reader_mapper/operations/replace.py @@ -17,7 +17,6 @@ @author: iregon """ - from __future__ import annotations import pandas as pd diff --git a/tests/_data.py b/tests/_data.py index c4fde568..37685b3e 100755 --- a/tests/_data.py +++ b/tests/_data.py @@ -1,4 +1,5 @@ """cdm_reader_mapper testing suite result files.""" + from __future__ import annotations import pytest # noqa diff --git a/tests/_results.py b/tests/_results.py index f16e4f5c..8876d8f8 100755 --- a/tests/_results.py +++ b/tests/_results.py @@ -1,4 +1,5 @@ """cdm_reader_mapper testing suite result files.""" + from __future__ import annotations import os From 5854100993d3ecd60a7a80040b561e0aa7833f93 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 4 Jul 2024 06:48:57 +0000 Subject: [PATCH 30/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cdm_reader_mapper/cdm_mapper/mapper.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index 602cc45a..e811aace 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -12,8 +12,6 @@ from __future__ import annotations -from __future__ import annotations - import numpy as np import pandas as pd From 140a025fa38e4a939c3ccd9599c3c74fd9efbc9f Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 4 Jul 2024 08:51:34 +0200 Subject: [PATCH 31/68] make use of ast --- cdm_reader_mapper/cdm_mapper/table_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdm_reader_mapper/cdm_mapper/table_writer.py b/cdm_reader_mapper/cdm_mapper/table_writer.py index e7ef069b..35431925 100755 --- a/cdm_reader_mapper/cdm_mapper/table_writer.py +++ b/cdm_reader_mapper/cdm_mapper/table_writer.py @@ -208,7 +208,7 @@ def print_varchar_array_i(row, null_label=None): ------- data: varchar """ - row = eval(row) + row = ast.literal_eval(row) row = row if isinstance(row, list) else [row] string = ",".join(filter(bool, row)) if len(string) > 0: From 21b77a4eba1a494f64f1beae97df2df35e9f82ed Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 12 Jul 2024 11:35:59 +0200 Subject: [PATCH 32/68] remove print statement --- tests/_testing_cdm_suite.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/_testing_cdm_suite.py b/tests/_testing_cdm_suite.py index d9f1884f..f0061e65 100755 --- a/tests/_testing_cdm_suite.py +++ b/tests/_testing_cdm_suite.py @@ -43,7 +43,6 @@ def _pandas_read_csv( def _evaluate_columns(columns): columns_ = [] for col in columns: - print(col) try: columns_.append(ast.literal_eval(col)) except ValueError: From 9dcb7fa12682e396a1f54aade29c5b7a910818af Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 12 Jul 2024 11:36:24 +0200 Subject: [PATCH 33/68] remove table from decimal_places funciotn --- cdm_reader_mapper/cdm_mapper/mapper.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index a7b27235..e9d84d35 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -57,9 +57,7 @@ def _map_to_df(m, x): return -def _decimal_places( - cdm_tables, decimal_places, cdm_key, table, imodel_functions, elements -): +def _decimal_places(atts, decimal_places, cdm_key, imodel_functions, elements): if decimal_places is not None: if isinstance(decimal_places, int): atts[cdm_key].update({"decimal_places": decimal_places}) @@ -164,7 +162,6 @@ def _write_csv_files( table_df_i = table_df_i.dropna(subset=["observation_value"]) table_df_i = drop_duplicates(table_df_i) - table_df_i = table_df_i.astype(dtype=out_dtypes) return table_df_i, atts From 0113b0a75ecce6a756c07d727b1dbd0313cd842b Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 12 Jul 2024 11:37:26 +0200 Subject: [PATCH 34/68] convert float to stringprint_float; directly convert list to str for print_varchar --- cdm_reader_mapper/cdm_mapper/table_writer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cdm_reader_mapper/cdm_mapper/table_writer.py b/cdm_reader_mapper/cdm_mapper/table_writer.py index fbb79a3e..77880a27 100755 --- a/cdm_reader_mapper/cdm_mapper/table_writer.py +++ b/cdm_reader_mapper/cdm_mapper/table_writer.py @@ -76,7 +76,7 @@ def print_float(data, null_label, decimal_places=None): def _return_str(x, null_label, format_float): if pd.isna(x): return null_label - return format_float.format(x) + return format_float.format(float(x)) if decimal_places is None: decimal_places = properties.default_decimal_places @@ -124,6 +124,8 @@ def print_varchar(data, null_label): """ def _return_str(x, null_label): + if isinstance(x, list): + return str(x) if pd.isna(x): return null_label return str(x) From 96c7b2f0eb10ec818248e3bcc58ecad0af5f830c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Sep 2024 11:14:29 +0000 Subject: [PATCH 35/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/_testing_cdm_suite.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/_testing_cdm_suite.py b/tests/_testing_cdm_suite.py index df226ede..80d2f146 100755 --- a/tests/_testing_cdm_suite.py +++ b/tests/_testing_cdm_suite.py @@ -7,7 +7,6 @@ from cdm_reader_mapper import cdm_mapper, mdf_reader from cdm_reader_mapper.cdm_mapper import read_tables - from cdm_reader_mapper.metmetpy import ( correct_datetime, correct_pt, From 97d0c78004f4e1864fc2c9c80a44e7ca528084b9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 27 Sep 2024 10:40:13 +0000 Subject: [PATCH 36/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cdm_reader_mapper/metmetpy/platform_type/correct.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cdm_reader_mapper/metmetpy/platform_type/correct.py b/cdm_reader_mapper/metmetpy/platform_type/correct.py index 6847ac98..04831724 100755 --- a/cdm_reader_mapper/metmetpy/platform_type/correct.py +++ b/cdm_reader_mapper/metmetpy/platform_type/correct.py @@ -39,7 +39,6 @@ from __future__ import annotations - from cdm_reader_mapper.common import logging_hdlr from cdm_reader_mapper.common.json_dict import collect_json_files, combine_dicts From be4de72cfde6371882df818d05302a7acffb3926 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 27 Sep 2024 13:13:30 +0200 Subject: [PATCH 37/68] fixing pre-commit hook --- cdm_reader_mapper/cdm_mapper/mapper.py | 3 +++ cdm_reader_mapper/metmetpy/datetime/correct.py | 2 ++ cdm_reader_mapper/metmetpy/platform_type/correct.py | 3 +-- tests/_testing_cdm_suite.py | 3 --- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index 1a0ef109..3b74c4a6 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -127,6 +127,7 @@ def _write_csv_files( imodel_functions, codes_subset, cdm_tables, + atts, ): table_df_i = pd.DataFrame( index=idata.index, columns=mapping.keys() @@ -252,6 +253,8 @@ def _map( logger, cols, imodel_functions, + codes_subset, + cdm_subset, cdm_tables[table]["atts"], ) return cdm_tables diff --git a/cdm_reader_mapper/metmetpy/datetime/correct.py b/cdm_reader_mapper/metmetpy/datetime/correct.py index 44d14871..fb11e591 100755 --- a/cdm_reader_mapper/metmetpy/datetime/correct.py +++ b/cdm_reader_mapper/metmetpy/datetime/correct.py @@ -94,5 +94,7 @@ def correct(data, data_model, log_level="INFO"): logger.warning("Module will proceed with no attempt to apply id replacements") return data + correction_method = combine_dicts(replacements_method_files, base=_base) + data = correct_it(data, data_model, dck, correction_method, log_level="INFO") return data diff --git a/cdm_reader_mapper/metmetpy/platform_type/correct.py b/cdm_reader_mapper/metmetpy/platform_type/correct.py index 04831724..1b6c86c0 100755 --- a/cdm_reader_mapper/metmetpy/platform_type/correct.py +++ b/cdm_reader_mapper/metmetpy/platform_type/correct.py @@ -131,5 +131,4 @@ def correct(data, data_model, log_level="INFO"): ) return data - return correct_it( - data, data_model, dck, pt_col, fix_methods, log_level="INFO" + return correct_it(data, data_model, dck, pt_col, fix_methods, log_level="INFO") diff --git a/tests/_testing_cdm_suite.py b/tests/_testing_cdm_suite.py index ba8f086b..717af013 100755 --- a/tests/_testing_cdm_suite.py +++ b/tests/_testing_cdm_suite.py @@ -163,9 +163,6 @@ def _testing_suite( if isinstance(data, pd.DataFrame): if data.empty: return - else: - if get_length(data) == 0: - return if val_dt is not None: val_dt_ = _pandas_read_csv( From 62a30206c955f34ae23465f0b7b5138d0050a334 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Tue, 1 Oct 2024 17:25:11 +0200 Subject: [PATCH 38/68] fixing decimal_places --- cdm_reader_mapper/cdm_mapper/mapper.py | 65 +++++++++----------- cdm_reader_mapper/cdm_mapper/table_writer.py | 4 +- 2 files changed, 29 insertions(+), 40 deletions(-) diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index d5ab9a9e..bb1a469d 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -58,18 +58,14 @@ def _map_to_df(m, x): def _decimal_places( - entry, decimal_places, imodel_functions, ): if decimal_places is not None: - if isinstance(decimal_places, int): - entry["decimal_places"] = decimal_places + return decimal_places else: - entry["decimal_places"] = getattr(imodel_functions, decimal_places)() - - return entry + return getattr(imodel_functions, decimal_places)() def _transform( @@ -127,12 +123,12 @@ def _write_csv_files( imodel_functions, codes_subset, cdm_tables, - atts, ): table_df_i = pd.DataFrame( index=idata.index, columns=mapping.keys() ) # We cannot predifine column based dtypes here! - # logger.debug(f"Table: {table}") + logger.debug(f"Table: {table}") + decimals = {} for cdm_key, imapping in mapping.items(): logger.debug(f"\tElement: {cdm_key}") isEmpty = False @@ -164,10 +160,6 @@ def _write_csv_files( notna_idx_idx = np.where(idata[elements].notna().all(axis=1))[0] logger.debug(f"\tnotna_idx_idx: {notna_idx_idx}") to_map = idata[elements].iloc[notna_idx_idx] - # notna_idx = ( - # notna_idx_idx + idata.index[0] - # ) # to account for parsers #original - # notna_idx = idata.index[notna_idx_idx] # fix? if len(elements) == 1: to_map = to_map.iloc[:, 0] @@ -201,17 +193,16 @@ def _write_csv_files( if fill_value is not None: table_df_i[cdm_key] = table_df_i[cdm_key].fillna(value=fill_value) - atts = _decimal_places( - atts, - decimal_places, - imodel_functions, + decimals[cdm_key] = _decimal_places( + decimal_places, + imodel_functions, ) if "observation_value" in table_df_i: table_df_i = table_df_i.dropna(subset=["observation_value"]) table_df_i = drop_duplicates(table_df_i) - return table_df_i, atts + return table_df_i, decimals def _map( @@ -226,16 +217,10 @@ def _map( cdm_subset = properties.cdm_tables cdm_atts = get_cdm_atts(cdm_subset) - imodel_maps = get_imodel_maps(data_model, *sub_models, cdm_tables=cdm_subset) imodel_functions = mapping_functions("_".join([data_model] + list(sub_models))) - # Initialize dictionary to store temporal tables (buffer) and table attributes - cdm_tables = {k: {"atts": cdm_atts.get(k)} for k in imodel_maps.keys()} - # Create pandas data types for buffer reading from CDM table definition pseudo-sql dtypes - # Also keep track of datetime columns for reader to parse - date_columns = {} for table, values in imodel_maps.items(): date_columns[table] = [ @@ -244,19 +229,27 @@ def _map( if "timestamp" in cdm_atts.get(table, {}).get(x, {}).get("data_type") ] - for idata in data: - cols = [x for x in idata] - for table, mapping in imodel_maps.items(): - cdm_tables[table]["data"], cdm_tables[table]["atts"] = _write_csv_files( - idata, + cols = [x for x in data] + cdm_tables = {} + for table, mapping in imodel_maps.items(): + cdm_tables[table] = {} + data_, decimals_ = _write_csv_files( + data, mapping, logger, cols, imodel_functions, codes_subset, cdm_subset, - cdm_tables[table]["atts"], ) + + for k, v in decimals_.items(): + if v is not None: + cdm_atts[table][k]["decimal_places"] = v + print(cdm_atts[table][k]) + cdm_tables[table]["data"] = data_ + cdm_tables[table]["atts"] = cdm_atts[table] + return cdm_tables @@ -296,16 +289,14 @@ def map_model(data, imodel, cdm_subset=None, codes_subset=None, log_level="INFO" # Check input data type and content (empty?) # Make sure data is an iterable: this is to homogenize how we handle # dataframes and textreaders - if isinstance(data, pd.DataFrame): - # logger.debug("Input data is a pd.DataFrame") - if len(data) == 0: - logger.error("Input data is empty") - return - else: - data = [data] - else: + if not isinstance(data, pd.DataFrame): logger.error("Input data type " f"{type(data)}" " not supported") return + + logger.info("Input data is a pd.DataFrame") + if len(data) == 0: + logger.error("Input data is empty") + return # Map thing: return _map( diff --git a/cdm_reader_mapper/cdm_mapper/table_writer.py b/cdm_reader_mapper/cdm_mapper/table_writer.py index f249650f..7a1265f2 100755 --- a/cdm_reader_mapper/cdm_mapper/table_writer.py +++ b/cdm_reader_mapper/cdm_mapper/table_writer.py @@ -72,7 +72,7 @@ def print_float(data, null_label, decimal_places): def _return_str(x, null_label, format_float): if pd.isna(x): return null_label - return format_float.format(float(x)) + return format_float.format(x) format_float = "{:." + str(decimal_places) + "f}" return data.apply(lambda x: _return_str(x, null_label, format_float)) @@ -117,8 +117,6 @@ def print_varchar(data, null_label): """ def _return_str(x, null_label): - if isinstance(x, list): - return str(x) if pd.isna(x): return null_label return str(x) From 9249acd443677ca022e64ffda039bf53b27b1328 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Tue, 1 Oct 2024 17:29:01 +0200 Subject: [PATCH 39/68] rename _writing_csv_files to _to_map --- cdm_reader_mapper/cdm_mapper/mapper.py | 48 +++++++++++++------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index bb1a469d..becb70ca 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -115,7 +115,7 @@ def _default( return default -def _write_csv_files( +def _to_map( idata, mapping, logger, @@ -127,7 +127,6 @@ def _write_csv_files( table_df_i = pd.DataFrame( index=idata.index, columns=mapping.keys() ) # We cannot predifine column based dtypes here! - logger.debug(f"Table: {table}") decimals = {} for cdm_key, imapping in mapping.items(): logger.debug(f"\tElement: {cdm_key}") @@ -194,8 +193,8 @@ def _write_csv_files( table_df_i[cdm_key] = table_df_i[cdm_key].fillna(value=fill_value) decimals[cdm_key] = _decimal_places( - decimal_places, - imodel_functions, + decimal_places, + imodel_functions, ) if "observation_value" in table_df_i: @@ -232,23 +231,24 @@ def _map( cols = [x for x in data] cdm_tables = {} for table, mapping in imodel_maps.items(): - cdm_tables[table] = {} - data_, decimals_ = _write_csv_files( - data, - mapping, - logger, - cols, - imodel_functions, - codes_subset, - cdm_subset, - ) - - for k, v in decimals_.items(): - if v is not None: - cdm_atts[table][k]["decimal_places"] = v - print(cdm_atts[table][k]) - cdm_tables[table]["data"] = data_ - cdm_tables[table]["atts"] = cdm_atts[table] + logger.debug(f"Table: {table}") + cdm_tables[table] = {} + data_, decimals_ = _to_map( + data, + mapping, + logger, + cols, + imodel_functions, + codes_subset, + cdm_subset, + ) + + for k, v in decimals_.items(): + if v is not None: + cdm_atts[table][k]["decimal_places"] = v + + cdm_tables[table]["data"] = data_ + cdm_tables[table]["atts"] = cdm_atts[table] return cdm_tables @@ -292,11 +292,11 @@ def map_model(data, imodel, cdm_subset=None, codes_subset=None, log_level="INFO" if not isinstance(data, pd.DataFrame): logger.error("Input data type " f"{type(data)}" " not supported") return - + logger.info("Input data is a pd.DataFrame") if len(data) == 0: - logger.error("Input data is empty") - return + logger.error("Input data is empty") + return # Map thing: return _map( From df188dde3136fb89596b62af785fa845bc93790a Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 2 Oct 2024 08:43:41 +0200 Subject: [PATCH 40/68] write striinggs and flots adjusted --- cdm_reader_mapper/cdm_mapper/table_writer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cdm_reader_mapper/cdm_mapper/table_writer.py b/cdm_reader_mapper/cdm_mapper/table_writer.py index 7a1265f2..9e369c67 100755 --- a/cdm_reader_mapper/cdm_mapper/table_writer.py +++ b/cdm_reader_mapper/cdm_mapper/table_writer.py @@ -72,7 +72,7 @@ def print_float(data, null_label, decimal_places): def _return_str(x, null_label, format_float): if pd.isna(x): return null_label - return format_float.format(x) + return format_float.format(float(x)) format_float = "{:." + str(decimal_places) + "f}" return data.apply(lambda x: _return_str(x, null_label, format_float)) @@ -117,6 +117,10 @@ def print_varchar(data, null_label): """ def _return_str(x, null_label): + if isinstance(x, list): + if len(x) == 0: + return null_label + return str(x) if pd.isna(x): return null_label return str(x) From 5515a90f1431a91d5ef468c761abd670f85590f2 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 2 Oct 2024 14:27:47 +0200 Subject: [PATCH 41/68] remove --- cdm_reader_mapper/operations/corrections.py | 338 -------------------- tests/_data.py | 16 - 2 files changed, 354 deletions(-) delete mode 100755 cdm_reader_mapper/operations/corrections.py delete mode 100755 tests/_data.py diff --git a/cdm_reader_mapper/operations/corrections.py b/cdm_reader_mapper/operations/corrections.py deleted file mode 100755 index cf05ac50..00000000 --- a/cdm_reader_mapper/operations/corrections.py +++ /dev/null @@ -1,338 +0,0 @@ -""" -Common Data Model (CDM) pandas correction operators. - -Created on Tue Jun 21 15:32:29 2022 - -@author: sbiri -""" - -from __future__ import annotations - -import logging -import math -import os - -import numpy as np -import pandas as pd -from textdistance import levenshtein - - -# %% extract NOC_corrections/duplicates -def gen_files(data, dataset, correction_path, yr, mo): - """DOCUMENTATION.""" - prepend = ( - dataset.split("_")[0] + "-" + dataset.split("_")[1][1:6].replace(".", "") + "-" - ) - for f in [ - "duplicates", - "duplicate_flags", - "id", - "longitude", - "latitude", - "timestamp", - ]: - os.makedirs(os.path.join(correction_path, f), exist_ok=True) - df = pd.DataFrame( - { - "UID": data[("c98", "UID")], - "ID": data[("core", "ID")], - "LON": data[("core", "LON")], - "LAT": data[("core", "LAT")], - } - ) - df["UID"] = df["UID"].apply(lambda x: f"{prepend + x}") - hours = np.round(data[("core", "HR")], decimals=0).astype("Int64") - minutes = np.round(60 * np.fmod(data[("core", "HR")], 1)).astype("Int64") - df["TM"] = pd.to_datetime( - pd.DataFrame( - { - "YR": int(yr) * np.ones(hours.shape, dtype=int), - "MO": int(mo) * np.ones(hours.shape, dtype=int), - "DY": data[("core", "DY")].astype("Int64"), - "H": hours, - "M": minutes, - } - ) - .astype(str) - .apply("-".join, axis=1) - .values, - format="%Y-%m-%d-%H-%M", - errors="coerce", - ) - df["TM"] = df["TM"].apply(lambda x: f"{x}+00:00") - df["flag"] = 0 - # df['UID'] = df['UID'].apply(lambda x: f"{prepend+x}") - # %% duplicates - fn = os.path.join(correction_path, "duplicates", yr + "-" + mo + ".txt.gz") - fn_f = os.path.join(correction_path, "duplicate_flags", yr + "-" + mo + ".txt.gz") - if not df.empty: - dup, dup_f = get_dup(data, dataset) - if os.path.exists(fn): - df1 = pd.read_csv( - fn, - delimiter="|", - dtype="object", - header=None, - usecols=[0, 1, 2], - names=["UID", "UID_d", "flag"], - quotechar=None, - quoting=3, - ) - os.remove(fn) - pd.concat([df1.astype(str), dup.astype(str)]).drop_duplicates( - subset="UID", keep="last" - ).to_csv(fn, sep="|", header=False, index=False, compression="infer") - else: - dup.astype(str).to_csv( - fn, sep="|", header=False, index=False, compression="infer" - ) - if os.path.exists(fn_f): - df1 = pd.read_csv( - fn_f, - delimiter="|", - dtype="object", - header=None, - usecols=[0, 1, 2], - names=["UID", "dup_flag", "flag"], - quotechar=None, - quoting=3, - ) - os.remove(fn_f) - pd.concat([df1.astype(str), dup_f.astype(str)]).drop_duplicates( - subset="UID", keep="last" - ).to_csv(fn_f, sep="|", header=False, index=False, compression="infer") - else: - dup_f.astype(str).to_csv( - fn_f, sep="|", header=False, index=False, compression="infer" - ) - # %% id - fn = os.path.join(correction_path, "id", yr + "-" + mo + ".txt.gz") - if not df.empty: - # print("ID not empty") - if os.path.exists(fn): - df1 = pd.read_csv( - fn, - delimiter="|", - dtype="object", - header=None, - usecols=[0, 1, 2], - names=["UID", "ID", "flag"], - quotechar=None, - quoting=3, - ) - os.remove(fn) - pd.concat( - [df1.astype(str), df[["UID", "ID", "flag"]].astype(str)] - ).drop_duplicates(subset="UID", keep="last").to_csv( - fn, sep="|", header=False, index=False, compression="infer" - ) - else: - df[["UID", "ID", "flag"]].astype(str).to_csv( - fn, sep="|", header=False, index=False, compression="infer" - ) - # %% longitude - fn = os.path.join(correction_path, "longitude", yr + "-" + mo + ".txt.gz") - if not df.empty: - if os.path.exists(fn): - df1 = pd.read_csv( - fn, - delimiter="|", - dtype="object", - header=None, - usecols=[0, 1, 2], - names=["UID", "LON", "flag"], - quotechar=None, - quoting=3, - ) - os.remove(fn) - pd.concat( - [df1.astype(str), df[["UID", "LON", "flag"]].astype(str)] - ).drop_duplicates(subset="UID", keep="last").to_csv( - fn, sep="|", header=False, index=False, compression="infer" - ) - else: - df[["UID", "LON", "flag"]].astype(str).to_csv( - fn, sep="|", header=False, index=False, compression="infer" - ) - # latitude - fn = os.path.join(correction_path, "latitude", yr + "-" + mo + ".txt.gz") - if not df.empty: - if os.path.exists(fn): - df1 = pd.read_csv( - fn, - delimiter="|", - dtype="object", - header=None, - usecols=[0, 1, 2], - names=["UID", "LAT", "flag"], - quotechar=None, - quoting=3, - ) - os.remove(fn) - pd.concat( - [df1.astype(str), df[["UID", "LAT", "flag"]].astype(str)] - ).drop_duplicates(subset="UID", keep="last").to_csv( - fn, sep="|", header=False, index=False, compression="infer" - ) - else: - df[["UID", "LAT", "flag"]].astype(str).to_csv( - fn, sep="|", header=False, index=False, compression="infer" - ) - # timestamp - fn = os.path.join(correction_path, "timestamp", yr + "-" + mo + ".txt.gz") - if not df.empty: - if os.path.exists(fn): - df1 = pd.read_csv( - fn, - delimiter="|", - dtype="object", - header=None, - usecols=[0, 1, 2], - names=["UID", "TM", "flag"], - quotechar=None, - quoting=3, - ) - os.remove(fn) - pd.concat( - [df1.astype(str), df[["UID", "TM", "flag"]].astype(str)] - ).drop_duplicates(subset="UID", keep="last").to_csv( - fn, sep="|", header=False, index=False, compression="infer" - ) - else: - df[["UID", "TM", "flag"]].astype(str).to_csv( - fn, sep="|", header=False, index=False, compression="infer" - ) - - -def corrections(data, dataset, correction_path, yr, mo): - """DOCUMENTATION.""" - logging.basicConfig( - format="%(levelname)s\t[%(asctime)s](%(filename)s)\t%(message)s", - level=logging.INFO, - datefmt="%Y%m%d %H:%M:%S", - filename=None, - ) - for f in [ - "duplicates", - "duplicate_flags", - "id", - "longitude", - "latitude", - "timestamp", - ]: - os.makedirs(os.path.join(correction_path, f), exist_ok=True) - - gen_files(data.copy(), dataset, correction_path, yr, mo) - - -def split_list(n): - """DOCUMENTATION.""" - return [(x + 1) for x, y in zip(n, n[1:]) if y - x != 1] - - -def convert_longitude(lon): - """Convert longitude to -180 to 180.""" - if lon > 180: - return -180 + math.fmod(lon, 180) - return lon - - -def get_dup(data, dataset): - """ - Check for duplicates. - - In a subset of dataframe that contains UID, ID, LON, LAT, DY, HR - checks for duplicates with tolerances for 2 digits in strings - and depending on variable for numeric variables/columns given in tol - - - Parameters - ---------- - data : pd.dataframe - DESCRIPTION. - dataset : str - DESCRIPTION. - tol : pd.series - DESCRIPTION - - Returns - ------- - None. - - """ - prepend = ( - dataset.split("_")[0] + "-" + dataset.split("_")[1][1:6].replace(".", "") + "-" - ) - df = pd.DataFrame( - { - "UID": data[("c98", "UID")], - "ID": data[("core", "ID")], - "LON": data[("core", "LON")], - "LAT": data[("core", "LAT")], - "DY": data[("core", "DY")], - "HR": data[("core", "HR")], - } - ) - df["UID"] = df["UID"].apply(lambda x: f"{prepend + x}") - # round lon, lat to one digit - df[["LON", "LAT"]] = df[["LON", "LAT"]].astype(float).round(1) - # convert longitdute to -180-180 - df["LON"] = df["LON"].apply(convert_longitude) - tol = pd.Series([2, 0, 0.05, 0.05, 0, 0]) - tol.index = ["UID", "ID", "LON", "LAT", "DY", "HR"] - df_dup = df.copy() - df_dup["flag"] = 0 - # first flag pos & time - df_dup = df_dup.sort_values(by=["LON", "LAT", "DY", "HR"]) - tmp_id = pd.DataFrame() - tmp_id["ID"] = df_dup["ID"].copy() - tmp_id["ID_s"] = df_dup["ID"].shift().astype(str) - tmp_id = tmp_id.assign( - distance=[*map(levenshtein.distance, tmp_id.ID, tmp_id.ID_s)] - ) - tmp_uid = pd.DataFrame() - tmp_uid["UID"] = df_dup["UID"].copy() - tmp_uid["UID_s"] = df_dup["UID"].shift().astype(str) - tmp_uid = tmp_uid.assign( - distance=[*map(levenshtein.distance, tmp_uid.UID, tmp_uid.UID_s)] - ) - loc = ( - (abs(df_dup["LON"] - df_dup["LON"].shift()) <= tol["LON"]) - & (abs(df_dup["LAT"] - df_dup["LAT"].shift()) <= tol["LAT"]) - & (abs(df_dup["DY"] - df_dup["DY"].shift()) <= tol["DY"]) - & (abs(df_dup["HR"] - df_dup["HR"].shift()) <= tol["HR"]) - & (tmp_id["distance"] <= tol["ID"]) - & (tmp_uid["distance"] <= tol["UID"]) - ) - df_dup["flag"] = df_dup["flag"].where(~loc, 1) - dup_flag = pd.DataFrame({"UID": df["UID"].copy(), "dup_flag": 0, "flag": 1}) - dup_flag["dup_flag"] = dup_flag["dup_flag"].where(~loc, 1) - # %% - dup_list = ( - df_dup.sort_values(by=["LON", "LAT", "DY", "HR"]) - .loc[df_dup["flag"] == 1] - .index.to_list() - ) - # %% find consecutive indices in list of duplicates - lst = dup_list.copy() - lst.sort() - ind = split_list(lst) - pv = 0 - dup_out = pd.DataFrame() - # find consecutive indices - for index in ind: - nlst = [x for x in lst[pv:] if x < index] - pv += len(nlst) - nlst.insert(0, nlst[0] - 1) - # choose the first duplicate to keep #!THiS SHOULD IMPROVE - dup_flag["dup_flag"][nlst[0]] = 3 - # generate all combinations of consecutive values and add to df - for fe in nlst: - r_nlst = list(nlst) - r_nlst.remove(fe) - tmp = "{" + ",".join(df["UID"].loc[r_nlst]) + "}" - dup_out = dup_out.append( - {"UID": df["UID"].loc[fe], "UID_d": tmp, "flag": 1}, - ignore_index=True, - ) - return dup_out, dup_flag diff --git a/tests/_data.py b/tests/_data.py deleted file mode 100755 index b8aa2a7a..00000000 --- a/tests/_data.py +++ /dev/null @@ -1,16 +0,0 @@ -"""cdm_reader_mapper testing suite result files.""" - -from __future__ import annotations - -import pytest # noqa - -from cdm_reader_mapper import mdf_reader, test_data - - -def _read_data(**kwargs): - read_ = mdf_reader.read(**kwargs) - return read_.data, read_.attrs, read_.mask - - -data_dict = dict(test_data.test_icoads_r300_d714) -data_df, attrs_df, mask_df = _read_data(**data_dict, imodel="icoads_r300_d714") From 461b96b39760958692617dad7a6881e0524174c3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 2 Oct 2024 12:34:29 +0000 Subject: [PATCH 42/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_operations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_operations.py b/tests/test_operations.py index 5e1a4bf1..368cae11 100755 --- a/tests/test_operations.py +++ b/tests/test_operations.py @@ -81,4 +81,4 @@ def test_replace(): "MASKSTID2", "MASKSTID2", ] - pd.testing.assert_frame_equal(table_df, result) \ No newline at end of file + pd.testing.assert_frame_equal(table_df, result) From ab3ee89aba057c5937efad5c267ee64111225fa0 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 2 Oct 2024 14:38:15 +0200 Subject: [PATCH 43/68] delete argument TextParser --- tests/test_operations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_operations.py b/tests/test_operations.py index 368cae11..a3c87ded 100755 --- a/tests/test_operations.py +++ b/tests/test_operations.py @@ -1,7 +1,7 @@ from __future__ import annotations import pandas as pd -import pytest +import pytest # noqa from cdm_reader_mapper import mdf_reader, test_data from cdm_reader_mapper.operations import inspect, replace, select @@ -15,7 +15,7 @@ def _read_data(**kwargs): return mdf_reader.read(**kwargs) -def _get_data(TextParser, **kwargs): +def _get_data(**kwargs): return _read_data(**data_dict, imodel="icoads_r300_d721", **kwargs) From 6c5cbacc93cc6654f9e07e6cf7dcbfbf86934e52 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 2 Oct 2024 14:51:12 +0200 Subject: [PATCH 44/68] select first list entry --- cdm_reader_mapper/operations/select.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdm_reader_mapper/operations/select.py b/cdm_reader_mapper/operations/select.py index eec12649..3d6dbba4 100755 --- a/cdm_reader_mapper/operations/select.py +++ b/cdm_reader_mapper/operations/select.py @@ -107,4 +107,4 @@ def dataframe(df, index, out_rejected=False, idx_in_offset=0, idx_out_offset=0): idx_out_offset=idx_out_offset, ) - return dataframe(data, index, out_rejected=out_rejected) + return dataframe(data, index, out_rejected=out_rejected)[0] From a886081baf9496f2b47f1721cf5f7b9eb3829982 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 9 Oct 2024 15:37:59 +0200 Subject: [PATCH 45/68] import os --- tests/test_cdm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_cdm.py b/tests/test_cdm.py index 7d8a7971..fa6defad 100755 --- a/tests/test_cdm.py +++ b/tests/test_cdm.py @@ -2,6 +2,8 @@ import pytest # noqa +import os + from cdm_reader_mapper import test_data from ._testing_cdm_suite import _testing_suite From 8b26ddd0e93cde18b245d678ba7179474a21c0c8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Oct 2024 13:38:36 +0000 Subject: [PATCH 46/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_cdm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_cdm.py b/tests/test_cdm.py index fa6defad..521c199c 100755 --- a/tests/test_cdm.py +++ b/tests/test_cdm.py @@ -1,9 +1,9 @@ from __future__ import annotations -import pytest # noqa - import os +import pytest # noqa + from cdm_reader_mapper import test_data from ._testing_cdm_suite import _testing_suite From 9dd6b1f3bf08522a4ebb134806b05fd43593c3da Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 13:49:25 +0000 Subject: [PATCH 47/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cdm_reader_mapper/cdm_mapper/mapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index 640c95d8..63a4f6f4 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -151,7 +151,7 @@ def _to_map( ) ) continue - + to_map = idata[elements] if len(elements) == 1: From fab17a04a0232f61a3eeba3d7f48cd6f3eb63a0a Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Mon, 16 Dec 2024 15:12:13 +0100 Subject: [PATCH 48/68] use default decimal places from properties --- cdm_reader_mapper/cdm_mapper/mapper.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index 640c95d8..9581c8b2 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -12,7 +12,6 @@ from __future__ import annotations -import numpy as np import pandas as pd from cdm_reader_mapper.common import logging_hdlr @@ -59,12 +58,11 @@ def _map_to_df(m, x): def _decimal_places( decimal_places, - imodel_functions, ): if decimal_places is not None: if isinstance(decimal_places, int): return decimal_places - return getattr(imodel_functions, decimal_places)() + return properties.default_decimal_places def _transform( @@ -151,7 +149,7 @@ def _to_map( ) ) continue - + to_map = idata[elements] if len(elements) == 1: From dfb406f9929369ad189256d08e96ce612789fbf0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Dec 2024 09:41:20 +0000 Subject: [PATCH 49/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cdm_reader_mapper/cdm_mapper/mapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index c331ae17..511936b4 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -19,6 +19,7 @@ from . import properties from ._mappings import map_and_convert + def _map( data_model, *sub_models, From 303356e1466916f97a8f04977e044f13fe6c899c Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 18 Dec 2024 10:45:36 +0100 Subject: [PATCH 50/68] delete syntax errors --- cdm_reader_mapper/cdm_mapper/mapper.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index 511936b4..7f6371ab 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -20,14 +20,6 @@ from ._mappings import map_and_convert -def _map( - data_model, - *sub_models, - data=pd.DataFrame(), -======= -from ._mappings import map_and_convert - - def map_model( data, imodel, From 5f7478324694635da833d0bd12da03ecaa29a8c1 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 18 Dec 2024 11:20:30 +0100 Subject: [PATCH 51/68] remove TextFileReader elements from main --- cdm_reader_mapper/cdm_mapper/_mappings.py | 62 +++++++---------------- 1 file changed, 17 insertions(+), 45 deletions(-) diff --git a/cdm_reader_mapper/cdm_mapper/_mappings.py b/cdm_reader_mapper/cdm_mapper/_mappings.py index 8fc4e937..0ac705b3 100755 --- a/cdm_reader_mapper/cdm_mapper/_mappings.py +++ b/cdm_reader_mapper/cdm_mapper/_mappings.py @@ -2,9 +2,6 @@ from __future__ import annotations -from copy import deepcopy -from io import StringIO - import numpy as np import pandas as pd @@ -203,11 +200,11 @@ def _map_and_convert( null_label, imodel_functions, codes_subset, - cdm_tables, cdm_complete, + cdm_atts, logger, ): - atts = deepcopy(cdm_tables[table]["atts"]) + atts = cdm_atts.get(table) columns = ( [x for x in atts.keys() if x in idata.columns] if not cdm_complete @@ -239,10 +236,7 @@ def _map_and_convert( table_df_i.columns = pd.MultiIndex.from_product([[table], columns]) table_df_i = drop_duplicates(table_df_i) - table_df_i = table_df_i.fillna(null_label) - table_df_i.to_csv(cdm_tables[table]["buffer"], header=False, index=False, mode="a") - cdm_tables[table]["columns"] = table_df_i.columns - return cdm_tables + return table_df_i.fillna(null_label) def map_and_convert( @@ -264,11 +258,6 @@ def map_and_convert( imodel_functions = mapping_functions("_".join([data_model] + list(sub_models))) - # Initialize dictionary to store temporal tables (buffer) and table attributes - cdm_tables = { - k: {"buffer": StringIO(), "atts": cdm_atts.get(k)} for k in imodel_maps.keys() - } - date_columns = {} for table, values in imodel_maps.items(): date_columns[table] = [ @@ -277,39 +266,22 @@ def map_and_convert( if "timestamp" in cdm_atts.get(table, {}).get(x, {}).get("data_type") ] - for idata in data: - cols = [x for x in idata] - for table, mapping in imodel_maps.items(): - cdm_tables = _map_and_convert( - idata, - mapping, - table, - cols, - null_label, - imodel_functions, - codes_subset, - cdm_tables, - cdm_complete, - logger, - ) - table_list = [] - for table in cdm_tables.keys(): - # Convert dtime to object to be parsed by the reader - logger.debug( - f"\tParse datetime by reader; Table: {table}; Columns: {date_columns[table]}" - ) - cdm_tables[table]["buffer"].seek(0) - data = pd.read_csv( - cdm_tables[table]["buffer"], - names=cdm_tables[table]["columns"], - na_values=[], - dtype="object", - keep_default_na=False, + for table in cdm_subset: + mapping = imodel_maps[table] + table_df = _map_and_convert( + data, + mapping, + table, + data.columns, + null_label, + imodel_functions, + codes_subset, + cdm_complete, + cdm_atts, + logger, ) - cdm_tables[table]["buffer"].close() - cdm_tables[table].pop("buffer") - table_list.append(data) + table_list.append(table_df) merged = pd.concat(table_list, axis=1, join="outer") return merged.reset_index(drop=True) From 316f688a0474eb812e660a9d1d727caaa2fc558e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 6 Jan 2025 08:18:48 +0000 Subject: [PATCH 52/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cdm_reader_mapper/mdf_reader/read.py | 38 ++++++++++++++-------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/read.py b/cdm_reader_mapper/mdf_reader/read.py index ed0d836f..73ff4e8a 100755 --- a/cdm_reader_mapper/mdf_reader/read.py +++ b/cdm_reader_mapper/mdf_reader/read.py @@ -16,25 +16,25 @@ class MDFFileReader(_FileReader): """Class to represent reader output. - Attributes - ---------- -<<<<<<< HEAD - data : pd.DataFrame or pd.io.parsers.TextFileReader - a pandas.DataFrame or pandas.io.parsers.TextFileReader - with the output data - mask : pd.DataFrame or pd.io.parsers.TextFileReader - a pandas.DataFrame or pandas.io.parsers.TextFileReader - with the output data validation mask - attrs : dict - a dictionary with the output data elements attributes -======= - data : pd.DataFrame - a pd.DataFrame with the output data - attrs : dict - a dictionary with the output data elements attributes - mask : pd.DataFrame - a pd.DataFrame with the output data validation mask ->>>>>>> 5f7478324694635da833d0bd12da03ecaa29a8c1 + Attributes + ---------- + <<<<<<< HEAD + data : pd.DataFrame or pd.io.parsers.TextFileReader + a pandas.DataFrame or pandas.io.parsers.TextFileReader + with the output data + mask : pd.DataFrame or pd.io.parsers.TextFileReader + a pandas.DataFrame or pandas.io.parsers.TextFileReader + with the output data validation mask + attrs : dict + a dictionary with the output data elements attributes + ======= + data : pd.DataFrame + a pd.DataFrame with the output data + attrs : dict + a dictionary with the output data elements attributes + mask : pd.DataFrame + a pd.DataFrame with the output data validation mask + >>>>>>> 5f7478324694635da833d0bd12da03ecaa29a8c1 """ def __init__(self, *args, **kwargs): From 7de069d5d2313eeea85b9c9a5526214ea49314bf Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Mon, 6 Jan 2025 09:22:45 +0100 Subject: [PATCH 53/68] no hdlr --- cdm_reader_mapper/metmetpy/datetime/validate.py | 8 +++----- cdm_reader_mapper/metmetpy/station_id/validate.py | 6 ++---- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/cdm_reader_mapper/metmetpy/datetime/validate.py b/cdm_reader_mapper/metmetpy/datetime/validate.py index 82dc4e01..fb04b4dd 100755 --- a/cdm_reader_mapper/metmetpy/datetime/validate.py +++ b/cdm_reader_mapper/metmetpy/datetime/validate.py @@ -34,7 +34,7 @@ import pandas as pd -from cdm_reader_mapper.common import logging_hdlr, pandas_TextParser_hdlr +from cdm_reader_mapper.common import logging_hdlr from . import model_datetimes @@ -45,9 +45,7 @@ def validate(data, imodel, log_level="INFO"): logger = logging_hdlr.init_logger(__name__, level=log_level) model = imodel.split("_")[0] - if isinstance(data, pd.io.parsers.TextFileReader): - data = pandas_TextParser_hdlr.make_copy(data).read() - elif not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series): + if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series): logger.error( f"Input data must be a pd.DataFrame or pd.Series.\ Input data type is {type(data)}" @@ -61,7 +59,7 @@ def validate(data, imodel, log_level="INFO"): f'Data model "{model}" datetime conversor not defined in model_datetimes module"' ) return - elif len(data_model_datetime) == 0: + elif data_model_datetime.empty: data_columns = list(data.columns) logger.info( f"No columns found for datetime conversion. Selected columns are {data_columns}" diff --git a/cdm_reader_mapper/metmetpy/station_id/validate.py b/cdm_reader_mapper/metmetpy/station_id/validate.py index 11e7fd2d..041c8fa1 100755 --- a/cdm_reader_mapper/metmetpy/station_id/validate.py +++ b/cdm_reader_mapper/metmetpy/station_id/validate.py @@ -36,7 +36,7 @@ import pandas as pd -from cdm_reader_mapper.common import logging_hdlr, pandas_TextParser_hdlr +from cdm_reader_mapper.common import logging_hdlr from cdm_reader_mapper.common.json_dict import collect_json_files, combine_dicts from .. import properties @@ -53,9 +53,7 @@ def validate(data, imodel, blank=False, log_level="INFO"): return dck = mrd[2] - if isinstance(data, pd.io.parsers.TextFileReader): - data = pandas_TextParser_hdlr.make_copy(data).read() - elif not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series): + if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series): logger.error( f"Input data must be a pd.DataFrame or pd.Series.\ Input data type is {type(data)}" From a780a85c2f75cd98ab8ebefbb1aa4a7a84b3e29a Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Mon, 6 Jan 2025 09:32:44 +0100 Subject: [PATCH 54/68] delete more TextFileReaders --- cdm_reader_mapper/mdf_reader/read.py | 11 ----------- tests/_testing_workflow_suite.py | 9 ++------- tests/test_operations.py | 4 ++-- 3 files changed, 4 insertions(+), 20 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/read.py b/cdm_reader_mapper/mdf_reader/read.py index ed0d836f..0c9bee4e 100755 --- a/cdm_reader_mapper/mdf_reader/read.py +++ b/cdm_reader_mapper/mdf_reader/read.py @@ -18,23 +18,12 @@ class MDFFileReader(_FileReader): Attributes ---------- -<<<<<<< HEAD - data : pd.DataFrame or pd.io.parsers.TextFileReader - a pandas.DataFrame or pandas.io.parsers.TextFileReader - with the output data - mask : pd.DataFrame or pd.io.parsers.TextFileReader - a pandas.DataFrame or pandas.io.parsers.TextFileReader - with the output data validation mask - attrs : dict - a dictionary with the output data elements attributes -======= data : pd.DataFrame a pd.DataFrame with the output data attrs : dict a dictionary with the output data elements attributes mask : pd.DataFrame a pd.DataFrame with the output data validation mask ->>>>>>> 5f7478324694635da833d0bd12da03ecaa29a8c1 """ def __init__(self, *args, **kwargs): diff --git a/tests/_testing_workflow_suite.py b/tests/_testing_workflow_suite.py index 05e34a11..0b70250f 100755 --- a/tests/_testing_workflow_suite.py +++ b/tests/_testing_workflow_suite.py @@ -5,7 +5,6 @@ import pandas as pd from cdm_reader_mapper import read_mdf, read_tables -from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy from ._results import result_data from ._utilities import ( @@ -59,12 +58,8 @@ def _testing_suite( data_exp = drop_rows(data_exp, drops) mask_exp = drop_rows(mask_exp, drops) - if isinstance(read_.data, pd.io.parsers.TextFileReader): - data = make_copy(read_.data).read() - mask = make_copy(read_.mask).read() - else: - data = read_.data.copy() - mask = read_.mask.copy() + data = read_.data.copy() + mask = read_.mask.copy() pd.testing.assert_frame_equal(data, data_exp) pd.testing.assert_frame_equal(mask, mask_exp, check_dtype=False) diff --git a/tests/test_operations.py b/tests/test_operations.py index 6cbddedb..10dca2fe 100755 --- a/tests/test_operations.py +++ b/tests/test_operations.py @@ -4,7 +4,7 @@ import pytest # noqa from cdm_reader_mapper import read_mdf, test_data -from cdm_reader_mapper.operations import inspect, replace, select +from cdm_reader_mapper.operations import replace from ._results import cdm_header, correction_df @@ -60,7 +60,7 @@ def test_select_from_list(): def test_inspect_count_by_cat(): - read_ = _get_data(TextParser) + read_ = _get_data() result = read_.unique(columns=("c1", "B1")) assert result == {("c1", "B1"): {19: 1, 26: 1, 27: 1, 41: 1, 91: 1}} From f59641b241427c627c3b83a6dec1e871f6bfe0b4 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Mon, 6 Jan 2025 09:46:24 +0100 Subject: [PATCH 55/68] import from workflow_suite --- tests/test_workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_workflow.py b/tests/test_workflow.py index 51fbfa9f..76736e11 100755 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -6,7 +6,7 @@ from cdm_reader_mapper import test_data -from ._testing_cdm_suite import _testing_suite +from ._testing_workflow_suite import _testing_suite @pytest.mark.parametrize( From 0a3edb00f24c85c94bcd00bbb7633eae0d4cd31f Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Tue, 14 Jan 2025 16:51:49 +0100 Subject: [PATCH 56/68] remove unused imports --- cdm_reader_mapper/mdf_reader/read.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/read.py b/cdm_reader_mapper/mdf_reader/read.py index 44e263c5..2d6fb49d 100755 --- a/cdm_reader_mapper/mdf_reader/read.py +++ b/cdm_reader_mapper/mdf_reader/read.py @@ -64,8 +64,6 @@ def convert_and_decode_entries( converter_kwargs = self.configurations["convert_decode"]["converter_kwargs"] if decoder_dict is None: decoder_dict = self.configurations["convert_decode"]["decoder_dict"] - if dtype is None: - dtype = self.configurations["convert_decode"]["dtype"] if not (convert and decode): return self if convert is not True: @@ -74,6 +72,7 @@ def convert_and_decode_entries( if decode is not True: decoder_dict = {} + dtype = self.configurations["convert_decode"]["dtype"] dtype = adjust_dtype(dtype, self.data) data = self.convert_and_decode_df( self.data, From fa721658ad822e122866173827e83904f779dcef Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Tue, 14 Jan 2025 16:51:57 +0100 Subject: [PATCH 57/68] make copy --- tests/test_operations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_operations.py b/tests/test_operations.py index b75eeaec..f1cc27f5 100755 --- a/tests/test_operations.py +++ b/tests/test_operations.py @@ -22,7 +22,7 @@ def _get_data(**kwargs): def test_select_true(): read_ = _get_data(sections=["c99_data"]) read_.select_true(overwrite=False, out_rejected=True) - data = read_.data + data = read_.data.copy() selected = read_.selected deselected = read_.deselected @@ -35,7 +35,7 @@ def test_select_true(): def test_select_from_index(): read_ = _get_data() read_.select_from_index([0, 2, 4], overwrite=False) - data = read_.data + data = read_.data.copy() result = read_.selected idx = data.index.isin([0, 2, 4]) @@ -47,7 +47,7 @@ def test_select_from_list(): read_ = _get_data() selection = {("c1", "B1"): [26, 41]} read_.select_from_list(selection, overwrite=False, out_rejected=True, in_index=True) - data = read_.data + data = read_.data.copy() selected = read_.selected deselected = read_.deselected From 8424c7060e8802e9095ddb10ff3d9c633ce2a807 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Tue, 14 Jan 2025 17:04:39 +0100 Subject: [PATCH 58/68] solving pylint issues --- .../mdf_reader/utils/filereader.py | 62 +++++++------------ 1 file changed, 23 insertions(+), 39 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/utils/filereader.py b/cdm_reader_mapper/mdf_reader/utils/filereader.py index c49e1f85..2e9413a8 100755 --- a/cdm_reader_mapper/mdf_reader/utils/filereader.py +++ b/cdm_reader_mapper/mdf_reader/utils/filereader.py @@ -2,19 +2,15 @@ from __future__ import annotations -import csv import json import logging import os from copy import deepcopy -from io import StringIO import numpy as np import pandas as pd import xarray as xr -from cdm_reader_mapper.common import pandas_TextParser_hdlr - from .. import properties from ..schemas import schemas from ..validate import validate @@ -215,47 +211,36 @@ def validate_df(self, df, isna=None): def dump_atts(self, out_atts, out_path): """Dump attributes to atts.json.""" - if not isinstance(self.data, pd.io.parsers.TextFileReader): - data = [self.data] - valid = [self.mask] - else: - data = pandas_TextParser_hdlr.make_copy(self.data) - valid = pandas_TextParser_hdlr.make_copy(self.mask) + data = self.data.copy() + mask = self.mask.copy() logging.info(f"WRITING DATA TO FILES IN: {out_path}") - for i, (data_df, valid_df) in enumerate(zip(data, valid)): - header = False - mode = "a" - out_atts_json = {} - if i == 0: - mode = "w" - cols = [x for x in data_df] - if isinstance(cols[0], tuple): - header = [":".join(x) for x in cols] - out_atts_json = { - ":".join(x): out_atts.get(x) for x in out_atts.keys() - } - else: - header = cols - out_atts_json = out_atts - kwargs = { - "header": header, - "mode": mode, - "encoding": "utf-8", - "index": True, - "index_label": "index", - "escapechar": "\0", - } - data_df.to_csv(os.path.join(out_path, "data.csv"), **kwargs) - valid_df.to_csv(os.path.join(out_path, "mask.csv"), **kwargs) - with open(os.path.join(out_path, "atts.json"), "w") as fileObj: - json.dump(out_atts_json, fileObj, indent=4) + mode = "w" + cols = data.columns + if isinstance(cols[0], tuple): + header = [":".join(x) for x in cols] + out_atts_json = {":".join(x): out_atts.get(x) for x in out_atts.keys()} + else: + header = cols + out_atts_json = out_atts + kwargs = { + "header": header, + "mode": mode, + "encoding": "utf-8", + "index": True, + "index_label": "index", + "escapechar": "\0", + } + data.to_csv(os.path.join(out_path, "data.csv"), **kwargs) + mask.to_csv(os.path.join(out_path, "mask.csv"), **kwargs) + + with open(os.path.join(out_path, "atts.json"), "w") as fileObj: + json.dump(out_atts_json, fileObj, indent=4) def open_data( self, order, valid, - chunksize, open_with="pandas", ): """DOCUMENTATION.""" @@ -266,7 +251,6 @@ def open_data( encoding=self.schema["header"].get("encoding"), widths=[properties.MAX_FULL_REPORT_WIDTH], skiprows=self.skiprows, - chunksize=chunksize, ) else: raise ValueError("open_with has to be one of ['pandas', 'netcdf']") From 23d60cd6c84cb1bc66e1e1381de0a0de9ca16465 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 17 Jan 2025 08:52:37 +0100 Subject: [PATCH 59/68] delete TestFileReader from write_data --- cdm_reader_mapper/mdf_reader/write.py | 70 +++++++++++---------------- 1 file changed, 27 insertions(+), 43 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/write.py b/cdm_reader_mapper/mdf_reader/write.py index 71d502b3..2c1a60d6 100755 --- a/cdm_reader_mapper/mdf_reader/write.py +++ b/cdm_reader_mapper/mdf_reader/write.py @@ -10,7 +10,6 @@ import pandas as pd from cdm_reader_mapper.common import get_filename -from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy def write_data( @@ -86,19 +85,9 @@ def _join(col): return ":".join(col) return col - if not isinstance(data, pd.io.parsers.TextFileReader): - data = [data] - else: - data = make_copy(data) - if mask is None: mask = pd.DataFrame() - if not isinstance(mask, pd.io.parsers.TextFileReader): - mask = [mask] - else: - mask = make_copy(mask) - info = {} info["dtypes"] = dtypes info["parse_dates"] = [_join(parse_date) for parse_date in parse_dates] @@ -113,38 +102,33 @@ def _join(col): filename_info = get_filename( [prefix, "info", suffix], path=out_dir, extension="json" ) - for i, (data_df, mask_df) in enumerate(zip(data, mask)): - if col_subset is not None: - data_df = data_df[col_subset] - mask_df = mask_df[col_subset] - header = False - mode = "a" - if i == 0: - mode = "w" - header = [] - info["dtypes"] = { - k: v for k, v in info["dtypes"].items() if k in data_df.columns - } - for col in data_df.columns: - col_ = _join(col) - header.append(col_) - - if col in info["dtypes"]: - info["dtypes"][col_] = info["dtypes"][col] - del info["dtypes"][col] - info["parse_dates"] = [ - parse_date for parse_date in info["parse_dates"] if parse_date in header - ] - - kwargs = { - "header": header, - "mode": mode, - "encoding": "utf-8", - "index": False, - "sep": delimiter, - } - data_df.to_csv(os.path.join(out_dir, filename_data), **kwargs) - mask_df.to_csv(os.path.join(out_dir, filename_mask), **kwargs) + + if col_subset is not None: + data = data[col_subset] + mask = mask[col_subset] + + header = [] + info["dtypes"] = {k: v for k, v in info["dtypes"].items() if k in data.columns} + for col in data.columns: + col_ = _join(col) + header.append(col_) + + if col in info["dtypes"]: + info["dtypes"][col_] = info["dtypes"][col] + del info["dtypes"][col] + info["parse_dates"] = [ + parse_date for parse_date in info["parse_dates"] if parse_date in header + ] + + kwargs = { + "header": header, + "mode": "w", + "encoding": "utf-8", + "index": False, + "sep": delimiter, + } + data.to_csv(os.path.join(out_dir, filename_data), **kwargs) + mask.to_csv(os.path.join(out_dir, filename_mask), **kwargs) if info: with open(os.path.join(out_dir, filename_info), "w") as fileObj: From f8a56305125d2a3421cfc4d273bf2b4543f3fc64 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 17 Jan 2025 11:25:04 +0100 Subject: [PATCH 60/68] update indentation --- cdm_reader_mapper/mdf_reader/write.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/write.py b/cdm_reader_mapper/mdf_reader/write.py index 2c1a60d6..24f8e06f 100755 --- a/cdm_reader_mapper/mdf_reader/write.py +++ b/cdm_reader_mapper/mdf_reader/write.py @@ -113,12 +113,12 @@ def _join(col): col_ = _join(col) header.append(col_) - if col in info["dtypes"]: - info["dtypes"][col_] = info["dtypes"][col] - del info["dtypes"][col] - info["parse_dates"] = [ - parse_date for parse_date in info["parse_dates"] if parse_date in header - ] + if col in info["dtypes"]: + info["dtypes"][col_] = info["dtypes"][col] + del info["dtypes"][col] + info["parse_dates"] = [ + parse_date for parse_date in info["parse_dates"] if parse_date in header + ] kwargs = { "header": header, From b035081760fa8840796db1ba04944311488c8932 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 17 Jan 2025 11:47:13 +0100 Subject: [PATCH 61/68] update more indentation --- cdm_reader_mapper/mdf_reader/write.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/write.py b/cdm_reader_mapper/mdf_reader/write.py index 24f8e06f..3fd0d7f4 100755 --- a/cdm_reader_mapper/mdf_reader/write.py +++ b/cdm_reader_mapper/mdf_reader/write.py @@ -116,9 +116,9 @@ def _join(col): if col in info["dtypes"]: info["dtypes"][col_] = info["dtypes"][col] del info["dtypes"][col] - info["parse_dates"] = [ - parse_date for parse_date in info["parse_dates"] if parse_date in header - ] + info["parse_dates"] = [ + parse_date for parse_date in info["parse_dates"] if parse_date in header + ] kwargs = { "header": header, From 40742ed4011dc90d50513a39cba9ce06b8b7b31f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 5 Feb 2025 12:45:40 +0000 Subject: [PATCH 62/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cdm_reader_mapper/mdf_reader/write.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/write.py b/cdm_reader_mapper/mdf_reader/write.py index ef8209d5..3965a576 100755 --- a/cdm_reader_mapper/mdf_reader/write.py +++ b/cdm_reader_mapper/mdf_reader/write.py @@ -128,7 +128,7 @@ def _join(col): col_ = _join(col) header.append(col_) info["dtypes"] = _update_col_names(info["dtypes"], col, col_) - + info["parse_dates"] = [ parse_date for parse_date in info["parse_dates"] if parse_date in header ] @@ -144,7 +144,6 @@ def _join(col): if not mask.empty: mask.to_csv(os.path.join(out_dir, filename_mask), **kwargs) - if info: with open(os.path.join(out_dir, filename_info), "w") as fileObj: json.dump(info, fileObj, indent=4) From 06026abb6e012409983120d4f94175804936a64b Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 5 Feb 2025 13:58:25 +0100 Subject: [PATCH 63/68] data_df -> data --- cdm_reader_mapper/mdf_reader/write.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdm_reader_mapper/mdf_reader/write.py b/cdm_reader_mapper/mdf_reader/write.py index 3965a576..807d34bc 100755 --- a/cdm_reader_mapper/mdf_reader/write.py +++ b/cdm_reader_mapper/mdf_reader/write.py @@ -123,7 +123,7 @@ def _join(col): mask = mask[col_subset] header = [] - info["dtypes"] = _update_dtypes(info["dtypes"], data_df.columns) + info["dtypes"] = _update_dtypes(info["dtypes"], data.columns) for col in data.columns: col_ = _join(col) header.append(col_) From 6353df9456cd18cc7d9e68c46d2fbfe1c9fe7aa1 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Wed, 5 Feb 2025 13:58:43 +0100 Subject: [PATCH 64/68] remove unused import --- cdm_reader_mapper/mdf_reader/read.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdm_reader_mapper/mdf_reader/read.py b/cdm_reader_mapper/mdf_reader/read.py index 359a0a5c..63b04a9d 100755 --- a/cdm_reader_mapper/mdf_reader/read.py +++ b/cdm_reader_mapper/mdf_reader/read.py @@ -13,7 +13,7 @@ from . import properties from .utils.filereader import FileReader -from .utils.utilities import adjust_dtype, convert_str_boolean, validate_arg +from .utils.utilities import convert_str_boolean, validate_arg def _remove_boolean_values(x): From 82250f475a466100c41c6bd884162c1f68d111f1 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 6 Feb 2025 09:30:05 +0100 Subject: [PATCH 65/68] no need for utility function convert_str_boolean --- cdm_reader_mapper/mdf_reader/read.py | 20 +++++++++++-------- .../mdf_reader/utils/configurator.py | 1 - .../mdf_reader/utils/converters.py | 7 ++----- .../mdf_reader/utils/decoders.py | 2 -- .../mdf_reader/utils/utilities.py | 17 ++++------------ .../mdf_reader/utils/validators.py | 10 ++-------- 6 files changed, 20 insertions(+), 37 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/read.py b/cdm_reader_mapper/mdf_reader/read.py index 63b04a9d..684a7c8b 100755 --- a/cdm_reader_mapper/mdf_reader/read.py +++ b/cdm_reader_mapper/mdf_reader/read.py @@ -13,11 +13,10 @@ from . import properties from .utils.filereader import FileReader -from .utils.utilities import convert_str_boolean, validate_arg +from .utils.utilities import adjust_dtypes, validate_arg def _remove_boolean_values(x): - x = convert_str_boolean(x) if x is True: return if x is False: @@ -86,22 +85,27 @@ def convert_and_decode_entries( if decode is not True: decoder_dict = {} - data = self.convert_and_decode_df( - self.data, + return self.convert_and_decode_df( + data, converter_dict, converter_kwargs, decoder_dict, ) - return data def validate_entries(self, data, validate): """Validate data entries by using a pre-defined data model. Fill attribute `valid` with boolean mask. """ - if validate is not True: + if validate is True: return self.validate_df(data) - return pd.DataFrame() + return pd.DataFrame(columns=data.columns) + + def remove_boolean_values(self, data): + """DOCUMENTATION.""" + data = data.map(_remove_boolean_values) + dtypes = adjust_dtypes(self.dtypes, self.columns) + return data.astype(dtypes, errors="ignore") def read( self, @@ -171,11 +175,11 @@ def read( convert=convert, decode=decode, ) - mask = self.validate_entries(data, validate) # 3. Create output DataBundle object logging.info("Creata output DataBundle object") + data = self.remove_boolean_values(data) return DataBundle( data=data, columns=self.columns, diff --git a/cdm_reader_mapper/mdf_reader/utils/configurator.py b/cdm_reader_mapper/mdf_reader/utils/configurator.py index 7d7357aa..f8c0652d 100755 --- a/cdm_reader_mapper/mdf_reader/utils/configurator.py +++ b/cdm_reader_mapper/mdf_reader/utils/configurator.py @@ -128,7 +128,6 @@ def get_configuration(self): "converter_dict": converters, "converter_kwargs": kwargs, "decoder_dict": decoders, - # "dtype": dtypes, }, "self": { "dtypes": dtypes, diff --git a/cdm_reader_mapper/mdf_reader/utils/converters.py b/cdm_reader_mapper/mdf_reader/utils/converters.py index 2f350ca8..329d8d74 100755 --- a/cdm_reader_mapper/mdf_reader/utils/converters.py +++ b/cdm_reader_mapper/mdf_reader/utils/converters.py @@ -5,7 +5,6 @@ import pandas as pd from .. import properties -from .utilities import convert_str_boolean class df_converters: @@ -35,7 +34,6 @@ def to_numeric(self, data, offset, scale): """Convert object type elements of a pandas series to numeric type.""" def _to_numeric(x): - x = convert_str_boolean(x) if isinstance(x, bool): return x if isinstance(x, str): @@ -76,11 +74,10 @@ def object_to_numeric(self, data, scale=None, offset=None): Data series of type self.dtype """ - scale = scale if scale else self.numeric_scale - offset = offset if offset else self.numeric_offset if data.dtype == "object": + scale = scale if scale else self.numeric_scale + offset = offset if offset else self.numeric_offset data = self.to_numeric(data, offset, scale) - return data def object_to_object(self, data, disable_white_strip=False): diff --git a/cdm_reader_mapper/mdf_reader/utils/decoders.py b/cdm_reader_mapper/mdf_reader/utils/decoders.py index c8f5bbd6..a95b008b 100755 --- a/cdm_reader_mapper/mdf_reader/utils/decoders.py +++ b/cdm_reader_mapper/mdf_reader/utils/decoders.py @@ -8,7 +8,6 @@ import pandas as pd from .. import properties -from .utilities import convert_str_boolean def _get_overpunch_number(): @@ -99,7 +98,6 @@ def base36(self, data): """DOCUMENTATION.""" def _base36(x): - x = convert_str_boolean(x) if isinstance(x, bool): return x return str(int(str(x), 36)) diff --git a/cdm_reader_mapper/mdf_reader/utils/utilities.py b/cdm_reader_mapper/mdf_reader/utils/utilities.py index 7dc94914..691e5f81 100755 --- a/cdm_reader_mapper/mdf_reader/utils/utilities.py +++ b/cdm_reader_mapper/mdf_reader/utils/utilities.py @@ -72,17 +72,8 @@ def decode_entries(series, decoder_func): return decoder_func(series) -def adjust_dtype(dtype, df): +def adjust_dtypes(dtypes, columns): """DOCUMENTATION.""" - if not isinstance(dtype, dict): - return dtype - return {k: v for k, v in dtype.items() if k in df.columns} - - -def convert_str_boolean(x): - """DOCUMENTATION.""" - if x == "True": - x = True - if x == "False": - x = False - return x + if not isinstance(dtypes, dict): + return dtypes + return {k: v for k, v in dtypes.items() if k in columns} diff --git a/cdm_reader_mapper/mdf_reader/utils/validators.py b/cdm_reader_mapper/mdf_reader/utils/validators.py index 64504075..9a9f2880 100755 --- a/cdm_reader_mapper/mdf_reader/utils/validators.py +++ b/cdm_reader_mapper/mdf_reader/utils/validators.py @@ -10,7 +10,6 @@ from .. import properties from ..codes import codes from ..schemas import schemas -from .utilities import convert_str_boolean def validate_datetime(elements, data): @@ -34,7 +33,6 @@ def validate_numeric(elements, data, schema): def _to_numeric(x): if x is None: return np.nan - x = convert_str_boolean(x) if isinstance(x, bool): return x return float(x) @@ -87,11 +85,9 @@ def validate_codes(elements, data, schema, imodel, ext_table_path): if not table: continue - dtype = properties.pandas_dtypes.get(schema.get(element).get("column_type")) - table_keys = list(table.keys()) validation_df = data[element] - value = validation_df.astype(dtype).astype("str") + value = validation_df.astype(str) valid = validation_df.notna() mask_ = value.isin(table_keys) mask[element] = mask_.where(valid, True) @@ -118,7 +114,6 @@ def _element_tuples(numeric_elements, datetime_elements, coded_elements): def _mask_boolean(x, boolean): - x = convert_str_boolean(x) if x is boolean: return True return False @@ -159,8 +154,7 @@ def validate( filename=None, ) # Check input - if not isinstance(data, pd.DataFrame): # or not isinstance(mask0, pd.DataFrame): - # logging.error("Input data and mask must be a pandas data frame object") + if not isinstance(data, pd.DataFrame): logging.error("input data must be a pandas DataFrame.") return From 2f31be5d6c4bcb73c13d3e712a4c62b536c3588b Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Fri, 7 Feb 2025 12:07:32 +0100 Subject: [PATCH 66/68] update encoding --- cdm_reader_mapper/mdf_reader/write.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdm_reader_mapper/mdf_reader/write.py b/cdm_reader_mapper/mdf_reader/write.py index 60b12ed9..65ff2534 100755 --- a/cdm_reader_mapper/mdf_reader/write.py +++ b/cdm_reader_mapper/mdf_reader/write.py @@ -139,7 +139,7 @@ def _join(col): kwargs = { "header": header, "mode": "w", - "encoding": "utf-8", + "encoding": encoding, "index": False, "sep": delimiter, } From 68a3a23eea6175149fc8957362622d704dbca2be Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 13 Feb 2025 10:04:32 +0100 Subject: [PATCH 67/68] delete unused modules --- cdm_reader_mapper/cdm_mapper/_mappings.py | 310 ------------------ .../metmetpy/datetime/correct.py | 101 ------ .../metmetpy/datetime/validate.py | 68 ---- 3 files changed, 479 deletions(-) delete mode 100755 cdm_reader_mapper/cdm_mapper/_mappings.py delete mode 100755 cdm_reader_mapper/metmetpy/datetime/correct.py delete mode 100755 cdm_reader_mapper/metmetpy/datetime/validate.py diff --git a/cdm_reader_mapper/cdm_mapper/_mappings.py b/cdm_reader_mapper/cdm_mapper/_mappings.py deleted file mode 100755 index b45001b7..00000000 --- a/cdm_reader_mapper/cdm_mapper/_mappings.py +++ /dev/null @@ -1,310 +0,0 @@ -"""Map and convert functions.""" - -from __future__ import annotations - -import numpy as np -import pandas as pd - -from . import properties -from ._conversions import converters, iconverters_kwargs -from ._mapping_functions import mapping_functions -from .codes.codes import get_code_table -from .tables.tables import get_cdm_atts, get_imodel_maps - - -def drop_duplicates(df): - """Drop duplicates from list.""" - - def list_to_tuple(v): - if isinstance(v, list): - v = tuple(v) - return v - - def tuple_to_list(v): - if isinstance(v, tuple): - v = list(v) - return v - - dtypes = df.dtypes - df = df.map(list_to_tuple) - df = df.drop_duplicates(ignore_index=True) - df = df.map(tuple_to_list) - return df.astype(dtypes) - - -def _map_to_df(m, x): - if not isinstance(m, dict): - return - for x_ in x: - if x_ in m.keys(): - v = m[x_] - if isinstance(v, dict): - m = v - continue - else: - return v - return - - -def _decimal_places( - entry, - decimal_places, -): - if decimal_places is not None: - - if isinstance(decimal_places, int): - entry["decimal_places"] = decimal_places - else: - entry["decimal_places"] = properties.default_decimal_places - - return entry - - -def _transform( - to_map, - imodel_functions, - transform, - kwargs, - logger, -): - logger.debug(f"\ttransform: {transform}") - logger.debug("\tkwargs: {}".format(",".join(list(kwargs.keys())))) - trans = getattr(imodel_functions, transform) - return trans(to_map, **kwargs) - - -def _code_table( - to_map, - data_model, - code_table, - logger, -): - table_map = get_code_table(*data_model.split("_"), code_table=code_table) - try: - to_map = to_map.to_frame() - except Exception: - logger.warning(f"Could not convert {to_map} to frame.") - - to_map_str = to_map.astype(str) - - to_map_str.columns = ["_".join(col) for col in to_map_str.columns.values] - return to_map_str.apply(lambda x: _map_to_df(table_map, x), axis=1) - - -def _default( - default, - length, -): - if isinstance(default, list): - return [default] * length - return default - - -def _fill_value(data, fill_value): - if fill_value is None: - return data - if data is None: - return fill_value - return data.fillna(value=fill_value) - - -def _map_data( - to_map, - transform, - code_table, - elements, - default, - fill_value, - isEmpty, - imodel_functions, - kwargs, - length, - logger, -): - if transform and not isEmpty: - data = _transform( - to_map, - imodel_functions, - transform, - kwargs, - logger=logger, - ) - elif code_table and not isEmpty: - data = _code_table( - to_map, - imodel_functions.imodel, - code_table, - logger=logger, - ) - elif elements and not isEmpty: - data = to_map - elif default is not None: - data = _default( - default, - length, - ) - else: - data = _default( - None, - length, - ) - return _fill_value(data, fill_value) - - -def _mapping(idata, imapping, imodel_functions, atts, codes_subset, cols, logger): - isEmpty = False - elements = imapping.get("elements") - transform = imapping.get("transform") - kwargs = imapping.get("kwargs", {}) - code_table = imapping.get("code_table") - default = imapping.get("default") - fill_value = imapping.get("fill_value") - decimal_places = imapping.get("decimal_places") - - if codes_subset: - if code_table not in codes_subset: - code_table = None - - to_map = None - if elements: - logger.debug("\telements: {}".format(" ".join([str(x) for x in elements]))) - missing_els = [x for x in elements if x not in cols] - if len(missing_els) > 0: - logger.warning( - "Following elements from data model missing from input data: {} to map.".format( - ",".join([str(x) for x in missing_els]) - ) - ) - return _default(None, len(idata)), atts - - to_map = idata[elements] - if len(elements) == 1: - to_map = to_map.iloc[:, 0] - - if len(to_map) == 0: - isEmpty = True - - data = _map_data( - to_map, - transform, - code_table, - elements, - default, - fill_value, - isEmpty, - imodel_functions, - kwargs, - len(idata), - logger, - ) - atts = _decimal_places(atts, decimal_places) - return data, atts - - -def _convert_dtype(data, atts, logger): - if atts is None: - return np.nan - itype = atts.get("data_type") - if converters.get(itype): - iconverter_kwargs = iconverters_kwargs.get(itype) - if iconverter_kwargs: - kwargs = {x: atts.get(x) for x in iconverter_kwargs} - else: - kwargs = {} - return converters.get(itype)(data, np.nan, **kwargs) - return data - - -def _map_and_convert( - idata, - mapping, - table, - cols, - null_label, - imodel_functions, - codes_subset, - cdm_complete, - cdm_atts, - logger, -): - atts = cdm_atts.get(table) - columns = ( - [x for x in atts.keys() if x in idata.columns] - if not cdm_complete - else list(atts.keys()) - ) - table_df_i = pd.DataFrame(index=idata.index, columns=columns) - - logger.debug(f"Table: {table}") - for column in columns: - if column not in mapping.keys(): - continue - else: - logger.debug(f"\tElement: {column}") - table_df_i[column], atts[column] = _mapping( - idata, - mapping[column], - imodel_functions, - atts[column], - codes_subset, - cols, - logger, - ) - table_df_i[column] = _convert_dtype( - table_df_i[column], atts.get(column), logger - ) - - if "observation_value" in table_df_i: - table_df_i = table_df_i.dropna(subset=["observation_value"]) - - table_df_i.columns = pd.MultiIndex.from_product([[table], columns]) - table_df_i = drop_duplicates(table_df_i) - return table_df_i.fillna(null_label) - - -def map_and_convert( - data_model, - *sub_models, - data=pd.DataFrame(), - cdm_subset=None, - codes_subset=None, - cdm_complete=True, - null_label="null", - logger=None, -): - if not cdm_subset: - cdm_subset = properties.cdm_tables - - cdm_atts = get_cdm_atts(cdm_subset) - - imodel_maps = get_imodel_maps(data_model, *sub_models, cdm_tables=cdm_subset) - - imodel_functions = mapping_functions("_".join([data_model] + list(sub_models))) - - date_columns = {} - for table, values in imodel_maps.items(): - date_columns[table] = [ - i - for i, x in enumerate(list(values)) - if "timestamp" in cdm_atts.get(table, {}).get(x, {}).get("data_type") - ] - - table_list = [] - for table in cdm_subset: - mapping = imodel_maps[table] - table_df = _map_and_convert( - data, - mapping, - table, - data.columns, - null_label, - imodel_functions, - codes_subset, - cdm_complete, - cdm_atts, - logger, - ) - table_list.append(table_df) - - merged = pd.concat(table_list, axis=1, join="outer") - return merged.reset_index(drop=True) diff --git a/cdm_reader_mapper/metmetpy/datetime/correct.py b/cdm_reader_mapper/metmetpy/datetime/correct.py deleted file mode 100755 index 961bdc2a..00000000 --- a/cdm_reader_mapper/metmetpy/datetime/correct.py +++ /dev/null @@ -1,101 +0,0 @@ -""" -metmetpy correction package. - -Created on Tue Jun 25 09:00:19 2019 - -Corrects datetime fields from a given deck in a data model. - -To account for dataframes stored in TextParsers and for eventual use of data columns other -than those to be fixed in this or other metmetpy modules, -the input and output are the full data set. - -Correctionsare data model and deck specific and are registered -in ./lib/data_model.json: multiple decks in the same input data are not -supported. - -Reference names of different metadata fields used in the metmetpy modules -and its location column|(section,column) in a data model are -registered in ../properties.py in metadata_datamodels. - -If the data model is not available in ./lib it is assumed to no corrections are -needed. -If the data model is not available in metadata_models, the module -will return with no output (will break full processing downstream of its -invocation) logging an error. - - -@author: iregon -""" - -from __future__ import annotations - -from cdm_reader_mapper.common import logging_hdlr -from cdm_reader_mapper.common.json_dict import collect_json_files, combine_dicts - -from .. import properties -from . import correction_functions - -_base = f"{properties._base}.datetime" - - -def correct_it(data, data_model, dck, correction_method, log_level="INFO"): - """DOCUMENTATION.""" - logger = logging_hdlr.init_logger(__name__, level=log_level) - - # 1. Optional deck specific corrections - datetime_correction = correction_method.get(dck, {}).get("function") - if not datetime_correction: - logger.info( - f"No datetime correction to apply to deck {dck} data from data\ - model {data_model}" - ) - else: - logger.info(f'Applying "{datetime_correction}" datetime correction') - try: - trans = getattr(correction_functions, datetime_correction) - trans(data) - except Exception: - logger.error("Applying correction ", exc_info=True) - return - - return data - - -def correct(data, imodel, log_level="INFO"): - """Apply ICOADS deck specific datetime corrections. - - Parameters - ---------- - data: pandas.DataFrame or pandas.io.parsers.TextFileReader - Input dataset. - imodel: str - Name of internally available data model. - e.g. icoads_d300_704 - log_level: str - level of logging information to save. - Default: INFO - - Returns - ------- - pandas.DataFrame or pandas.io.parsers.TextFileReader - a pandas.DataFrame or pandas.io.parsers.TextFileReader - with the adjusted data - """ - logger = logging_hdlr.init_logger(__name__, level=log_level) - mrd = imodel.split("_") - if len(mrd) < 3: - logger.warning(f"Dataset {imodel} has to deck information.") - return data - dck = mrd[2] - - replacements_method_files = collect_json_files(*mrd, base=_base) - - if len(replacements_method_files) == 0: - logger.warning(f"Data model {imodel} has no replacements in library") - logger.warning("Module will proceed with no attempt to apply id replacements") - return data - - correction_method = combine_dicts(replacements_method_files, base=_base) - - data = correct_it(data, imodel, dck, correction_method, log_level="INFO") - return data diff --git a/cdm_reader_mapper/metmetpy/datetime/validate.py b/cdm_reader_mapper/metmetpy/datetime/validate.py deleted file mode 100755 index fb04b4dd..00000000 --- a/cdm_reader_mapper/metmetpy/datetime/validate.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -metmetpy validation package. - -Created on Tue Jun 25 09:00:19 2019 - -Validates the datetime fields of a data model: - -1. extracts or creates the datetime field of a data model as defined - in submodule model_datetimes. - -2. validates to False where NaT: no datetime or conversion to datetime failure - -Validation is data model specific. - -Output is a boolean series. - -Does not account for input dataframes/series stored in TextParsers: as opposed -to correction modules, the output is only a boolean series which is external -to the input data .... - -If the datetime conversion (or extraction) for a given data model is not -available in submodule model_datetimes, the module -will return with no output (will break full processing downstream of its -invocation) logging an error. - -Reference names of different metadata fields used in the metmetpy modules -and its location column|(section,column) in a data model are -registered in ../properties.py in metadata_datamodels. - -NaN, NaT: will validate to False. - -@author: iregon -""" - -from __future__ import annotations - -import pandas as pd - -from cdm_reader_mapper.common import logging_hdlr - -from . import model_datetimes - - -def validate(data, imodel, log_level="INFO"): - """DOCUMENTATiON.""" - # dck input only to be consistent with other validators in the metmetpy module - logger = logging_hdlr.init_logger(__name__, level=log_level) - model = imodel.split("_")[0] - - if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series): - logger.error( - f"Input data must be a pd.DataFrame or pd.Series.\ - Input data type is {type(data)}" - ) - return - - data_model_datetime = model_datetimes.to_datetime(data, model) - - if not isinstance(data_model_datetime, pd.Series): - logger.error( - f'Data model "{model}" datetime conversor not defined in model_datetimes module"' - ) - return - elif data_model_datetime.empty: - data_columns = list(data.columns) - logger.info( - f"No columns found for datetime conversion. Selected columns are {data_columns}" - ) - return - return data_model_datetime.notna() From 3350023e2a8affcf92cea9ad818a9ccf8dc9a680 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 13 Feb 2025 10:04:48 +0100 Subject: [PATCH 68/68] remove comments --- cdm_reader_mapper/common/select.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cdm_reader_mapper/common/select.py b/cdm_reader_mapper/common/select.py index eec12649..3cb5ff3a 100755 --- a/cdm_reader_mapper/common/select.py +++ b/cdm_reader_mapper/common/select.py @@ -8,15 +8,6 @@ """ from __future__ import annotations -# Need to define a general thing for the parser() functions, like we did with -# the dataframe_apply_index(), because they are all the same but for the -# selection applied!!!!! - -# The index of the resulting dataframe(s) is reinitialized here, it does not -# inherit from parent df -# -# data is a dataframe - def dataframe_apply_index( df,