diff --git a/cdm_reader_mapper/cdm_mapper/codes/marob/__init__.py b/cdm_reader_mapper/cdm_mapper/codes/marob/__init__.py new file mode 100755 index 00000000..d076a16a --- /dev/null +++ b/cdm_reader_mapper/cdm_mapper/codes/marob/__init__.py @@ -0,0 +1 @@ +"""Common Data Model (CDM) MAROB mapper code tables.""" diff --git a/cdm_reader_mapper/cdm_mapper/codes/marob/quality_flag.json b/cdm_reader_mapper/cdm_mapper/codes/marob/quality_flag.json new file mode 100755 index 00000000..69a88e3b --- /dev/null +++ b/cdm_reader_mapper/cdm_mapper/codes/marob/quality_flag.json @@ -0,0 +1 @@ +{} diff --git a/cdm_reader_mapper/cdm_mapper/codes/marob/report_time_quality.json b/cdm_reader_mapper/cdm_mapper/codes/marob/report_time_quality.json new file mode 100755 index 00000000..69a88e3b --- /dev/null +++ b/cdm_reader_mapper/cdm_mapper/codes/marob/report_time_quality.json @@ -0,0 +1 @@ +{} diff --git a/cdm_reader_mapper/cdm_mapper/codes/marob/sensor_automation_status.json b/cdm_reader_mapper/cdm_mapper/codes/marob/sensor_automation_status.json new file mode 100755 index 00000000..21be42a8 --- /dev/null +++ b/cdm_reader_mapper/cdm_mapper/codes/marob/sensor_automation_status.json @@ -0,0 +1,6 @@ +{ + "256": 4, + "10256": 4, + "384": 0, + "10384": 0 +} diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index ec5b09d8..16f390a4 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -155,6 +155,7 @@ def _transform( ) -> pd.Series: """Apply a transformation function from imodel_functions to a pandas Series.""" logger.debug(f"Applying transform: {transform}") + if kwargs: logger.debug(f"With kwargs: {', '.join(kwargs.keys())}") try: @@ -212,11 +213,15 @@ def _fill_value(series, fill_value) -> pd.Series: def _extract_input_data(idata, elements, default, logger): """Extract the relevant input data based on `elements`.""" - def _return_default(): - return pd.Series(_default(default, len(idata)), index=idata.index), True + def _return_default(bool): + return pd.Series(_default(default, len(idata)), index=idata.index), bool if not elements: - return _return_default() + if default is None: + bool = False + else: + bool = True + return _return_default(bool) logger.debug(f"\telements: {' '.join(map(str, elements))}") @@ -225,12 +230,12 @@ def _return_default(): for e in elements: if e not in cols: logger.warning(f"Missing element from input data: {e}") - return _return_default() + return _return_default(True) data = idata[elements[0]] if len(elements) == 1 else idata[elements] if _is_empty(data): - return _return_default() + return _return_default(True) return data, False @@ -532,6 +537,15 @@ def map_model( DataFrame with MultiIndex columns (cdm_table, column_name). """ logger = logging_hdlr.init_logger(__name__, level=log_level) + + if imodel is None: + logger.error("Input data model 'imodel' is not defined.") + return + + if not isinstance(imodel, str): + logger.error(f"Input data model type is not supported: {type(imodel)}") + return + imodel = imodel.split("_") if imodel[0] not in get_args(properties.SupportedDataModels): logger.error("Input data model " f"{imodel[0]}" " not supported") diff --git a/cdm_reader_mapper/cdm_mapper/tables/craid/observations-slp.json b/cdm_reader_mapper/cdm_mapper/tables/craid/observations-slp.json index 3f092412..3c9963e6 100755 --- a/cdm_reader_mapper/cdm_mapper/tables/craid/observations-slp.json +++ b/cdm_reader_mapper/cdm_mapper/tables/craid/observations-slp.json @@ -10,10 +10,7 @@ "observation_value": { "sections": "drifter_measurements", "elements": "ATMS_ADJUSTED", - "transform": "float_scale", - "kwargs": { - "factor": 100 - }, + "transform": "pressue_hpa_in_pa", "decimal_places": 0 }, "units": { diff --git a/cdm_reader_mapper/cdm_mapper/tables/gdac/observations-slp.json b/cdm_reader_mapper/cdm_mapper/tables/gdac/observations-slp.json index 8cc5179a..884319ca 100644 --- a/cdm_reader_mapper/cdm_mapper/tables/gdac/observations-slp.json +++ b/cdm_reader_mapper/cdm_mapper/tables/gdac/observations-slp.json @@ -10,10 +10,7 @@ }, "observation_value": { "elements": "PPPP", - "transform": "float_scale", - "kwargs": { - "factor": 100 - }, + "transform": "pressue_hpa_in_pa", "decimal_places": 0 }, "units": { diff --git a/cdm_reader_mapper/cdm_mapper/tables/icoads/observations-slp.json b/cdm_reader_mapper/cdm_mapper/tables/icoads/observations-slp.json index e541166c..2cb040c2 100755 --- a/cdm_reader_mapper/cdm_mapper/tables/icoads/observations-slp.json +++ b/cdm_reader_mapper/cdm_mapper/tables/icoads/observations-slp.json @@ -10,10 +10,7 @@ "observation_value": { "sections": "core", "elements": "SLP", - "transform": "float_scale", - "kwargs": { - "factor": 100 - }, + "transform": "pressue_hpa_in_pa", "decimal_places": 0 }, "units": { diff --git a/cdm_reader_mapper/cdm_mapper/tables/icoads/r300/d714/observations-slp.json b/cdm_reader_mapper/cdm_mapper/tables/icoads/r300/d714/observations-slp.json index d7416108..de9139f2 100755 --- a/cdm_reader_mapper/cdm_mapper/tables/icoads/r300/d714/observations-slp.json +++ b/cdm_reader_mapper/cdm_mapper/tables/icoads/r300/d714/observations-slp.json @@ -2,10 +2,7 @@ "observation_value": { "sections": "c99", "elements": "Pressure", - "transform": "float_scale", - "kwargs": { - "factor": 100 - }, + "transform": "pressue_hpa_in_pa", "decimal_places": 0 }, "original_value": { diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/__init__.py b/cdm_reader_mapper/cdm_mapper/tables/marob/__init__.py new file mode 100755 index 00000000..1cda077b --- /dev/null +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/__init__.py @@ -0,0 +1 @@ +"""Common Data Model (CDM) MAROB mapping tables.""" diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/header.json b/cdm_reader_mapper/cdm_mapper/tables/marob/header.json new file mode 100755 index 00000000..0e5f5677 --- /dev/null +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/header.json @@ -0,0 +1,106 @@ +{ + "report_id": { + "elements": "ID", + "transform": "string_add", + "kwargs": { + "separator": "-", + "prepend": "DWD_MAROBSHIP" + } + }, + "application_area": { + "default": [ + 1, + 7, + 10, + 11 + ] + }, + "observing_programme": { + "default": 56 + }, + "report_type": { + "default": 0 + }, + "station_type": { + "default": 2 + }, + "platform_type": { + "default": 2 + }, + "primary_station_id": { + "elements": "KENNUNG" + }, + "station_record_number": { + "default": 1 + }, + "primary_station_id_scheme": { + "fill_value": 5 + }, + "longitude": { + "elements": "GEOGR_LAENGE", + "decimal_places": 1 + }, + "latitude": { + "elements": "GEOGR_BREITE", + "decimal_places": 1 + }, + "location_quality": { + "default": 3 + }, + "crs": { + "default": 0 + }, + "station_speed": { + "elements": "FAHRTGESCHWINDIGKEIT", + "transform": "velocity_kn_in_ms", + "decimal_places": 2 + }, + "station_course": { + "elements": "FAHRTRICHTUNG", + "decimal_places": 0 + }, + "height_of_station_above_local_ground": { + "default": 0, + "decimal_places": 1 + }, + "height_of_station_above_sea_level": { + "elements": "STATIONSHOEHE_MSL", + "fill_value": 0, + "decimal_places": 1 + }, + "report_meaning_of_timestamp": { + "default": 2 + }, + "report_timestamp": { + "elements": "MESSZEIT", + "transform": "datetime_marob" + }, + "report_duration": { + "default": 15 + }, + "report_time_accuracy": { + "default": 1, + "decimal_places": 0 + }, + "report_time_quality": { + "default": 2 + }, + "report_quality": { + "default": 2 + }, + "duplicate_status": { + "default": 4 + }, + "record_timestamp": { + "transform": "datetime_utcnow" + }, + "history": { + "transform": "lineage" + }, + "source_id": { + "elements": "DATENQUELLE_ID" + }, + "source_record_id": { + "elements": "ID" + } +} diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-at.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-at.json new file mode 100755 index 00000000..aeeeddaf --- /dev/null +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-at.json @@ -0,0 +1,35 @@ +{ + "observation_id": { + "kwargs": { + "append": "AT" + } + }, + "observation_height_above_station_surface": { + "elements": "SENSORHOEHE_WAS_TT", + "decimal_places": 1 + }, + "observed_variable": { + "default": 85 + }, + "observation_value": { + "elements": "LUFTTEMPERATUR", + "transform": "temperature_celsius_to_kelvin", + "decimal_places": 2 + }, + "units": { + "default": 5 + }, + "quality_flag": { + "default": 2 + }, + "original_units": { + "default": 60 + }, + "original_value": { + "elements": "LUFTTEMPERATUR", + "decimal_places": 1 + }, + "conversion_method": { + "default": 1 + } +} diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-dpt.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-dpt.json new file mode 100755 index 00000000..98314ee0 --- /dev/null +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-dpt.json @@ -0,0 +1,35 @@ +{ + "observation_id": { + "kwargs": { + "append": "DPT" + } + }, + "observation_height_above_station_surface": { + "elements": "SENSORHOEHE_WAS_TT", + "decimal_places": 1 + }, + "observed_variable": { + "default": 36 + }, + "observation_value": { + "elements": "TAUPUNKTTEMPERATUR", + "transform": "temperature_celsius_to_kelvin", + "decimal_places": 2 + }, + "units": { + "default": 5 + }, + "quality_flag": { + "default": 2 + }, + "original_units": { + "default": 60 + }, + "original_value": { + "elements": "TAUPUNKTTEMPERATUR", + "decimal_places": 1 + }, + "conversion_method": { + "default": 1 + } +} diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-slp.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-slp.json new file mode 100755 index 00000000..954f9805 --- /dev/null +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-slp.json @@ -0,0 +1,35 @@ +{ + "observation_id": { + "kwargs": { + "append": "SLP" + } + }, + "observation_height_above_station_surface": { + "elements": "BAROMETERHOEHE_MSL", + "decimal_places": 1 + }, + "observed_variable": { + "default": 58 + }, + "observation_value": { + "elements": "LUFTDRUCK_STATIONSHOEHE", + "transform": "pressue_hpa_in_pa", + "decimal_places": 0 + }, + "units": { + "default": 32 + }, + "quality_flag": { + "default": 2 + }, + "original_units": { + "default": 530 + }, + "original_value": { + "elements": "LUFTDRUCK_STATIONSHOEHE", + "decimal_places": 1 + }, + "conversion_method": { + "default": 7 + } +} diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-sst.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-sst.json new file mode 100755 index 00000000..1cc5fa46 --- /dev/null +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-sst.json @@ -0,0 +1,36 @@ +{ + "observation_id": { + "kwargs": { + "append": "SST" + } + }, + "observation_height_above_station_surface": { + "elements": "MESSTIEFE", + "transform": "float_opposite", + "decimal_places": 1 + }, + "observed_variable": { + "default": 95 + }, + "observation_value": { + "elements": "WASSERTEMPERATUR", + "transform": "temperature_celsius_to_kelvin", + "decimal_places": 2 + }, + "units": { + "default": 5 + }, + "quality_flag": { + "default": 2 + }, + "original_units": { + "default": 60 + }, + "original_value": { + "elements": "WASSERTEMPERATUR", + "decimal_places": 1 + }, + "conversion_method": { + "default": 1 + } +} diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-wbt.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-wbt.json new file mode 100755 index 00000000..f79eb5e5 --- /dev/null +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-wbt.json @@ -0,0 +1,35 @@ +{ + "observation_id": { + "kwargs": { + "append": "WBT" + } + }, + "observation_height_above_station_surface": { + "elements": "SENSORHOEHE_WAS_TT", + "decimal_places": 1 + }, + "observed_variable": { + "default": 41 + }, + "observation_value": { + "elements": "FEUCHTTEMPERATUR", + "transform": "temperature_celsius_to_kelvin", + "decimal_places": 2 + }, + "units": { + "default": 5 + }, + "quality_flag": { + "default": 2 + }, + "original_units": { + "default": 60 + }, + "original_value": { + "elements": "FEUCHTTEMPERATUR", + "decimal_places": 1 + }, + "conversion_method": { + "default": 1 + } +} diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-wd.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-wd.json new file mode 100755 index 00000000..b8e48a4e --- /dev/null +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-wd.json @@ -0,0 +1,34 @@ +{ + "observation_id": { + "kwargs": { + "append": "WD" + } + }, + "observation_height_above_station_surface": { + "elements": "SENSORHOEHE_WAS_FF", + "decimal_places": 1 + }, + "observed_variable": { + "default": 106 + }, + "observation_value": { + "elements": "WINDRICHTUNG", + "decimal_places": 0 + }, + "units": { + "default": 320 + }, + "quality_flag": { + "default": 2 + }, + "original_units": { + "default": 320 + }, + "original_value": { + "elements": "WINDRICHTUNG", + "decimal_places": 0 + }, + "conversion_flag": { + "default": 2 + } +} diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-ws.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-ws.json new file mode 100755 index 00000000..0b83ebb2 --- /dev/null +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-ws.json @@ -0,0 +1,36 @@ +{ + "observation_id": { + "kwargs": { + "append": "WS" + } + }, + "observation_height_above_station_surface": { + "elements": "SENSORHOEHE_WAS_FF", + "decimal_places": 1 + }, + "observed_variable": { + "default": 107 + }, + "observation_value": { + "elements": "WINDGESCHWINDIGKEIT", + "transform": "velocity_kmh_in_ms", + "decimal_places": 1 + }, + "units": { + "default": 731 + }, + "quality_flag": { + "default": 2 + }, + "original_precision": {}, + "original_units": { + "default": 741 + }, + "original_value": { + "elements": "WINDGESCHWINDIGKEIT", + "decimal_places": 1 + }, + "conversion_flag": { + "default": 2 + } +} diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations.json new file mode 100755 index 00000000..e5594732 --- /dev/null +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations.json @@ -0,0 +1,81 @@ +{ + "observation_id": { + "elements": "ID", + "transform": "string_add", + "kwargs": { + "separator": "-", + "prepend": "DWD_MAROBSHIP" + } + }, + "report_id": { + "elements": "ID", + "transform": "string_add", + "kwargs": { + "separator": "-", + "prepend": "DWD_MAROBSHIP" + } + }, + "data_policy_licence": { + "default": 1 + }, + "date_time": { + "elements": "MESSZEIT", + "transform": "datetime_marob" + }, + "date_time_meaning": { + "default": 2 + }, + "observation_duration": { + "default": 8 + }, + "longitude": { + "elements": "GEOGR_LAENGE", + "decimal_places": 1 + }, + "latitude": { + "elements": "GEOGR_BREITE", + "decimal_places": 1 + }, + "crs": { + "default": 0 + }, + "z_coordinate_type": { + "default": 0 + }, + "value_significance": { + "default": 2 + }, + "conversion_flag": { + "default": 2 + }, + "spatial_representativeness": { + "default": 3 + }, + "numerical_precision": {}, + "sensor_automation_status": { + "elements": "DATENQUELLE_ID", + "code_table": "sensor_automation_status" + }, + "exposure_of_sensor": { + "default": 3 + }, + "original_precision": {}, + "processing_level": { + "default": 3 + }, + "traceability": { + "default": 2 + }, + "advanced_qc": { + "default": 0 + }, + "advanced_uncertainty": { + "default": 0 + }, + "advanced_homogenisation": { + "default": 0 + }, + "source_id": { + "elements": "DATENQUELLE_ID" + } +} diff --git a/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py b/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py index 4c01fadc..68583c9b 100755 --- a/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py +++ b/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py @@ -40,6 +40,7 @@ "icoads_r300_d714": icoads_lineage + " with supplemental data recovery", "icoads_r302": ". Initial conversion from ICOADS R3.0.2T NRT", "craid": ". Initial conversion from C-RAID", + "marob": ". Initial conversion from DWD MAROB data base", } c2k_methods = { @@ -283,8 +284,29 @@ def to_int(value: Any) -> int | pd.NA: return pd.NA +def series_strptime(series: pd.Series, format: str) -> pd.Series: + """ + Convert series with strings to series with datetime. + + Parameters + ---------- + series : pd.Series + Series with strings. + format : str + String time format. + + Returns + ------- + pd.Series + Series with datetime + """ + if series.empty: + return pd.Series([]) + return pd.to_datetime(series, format=format, errors="coerce") + + class mapping_functions: - """Class for mapping Common Data Model (CDM) elements from IMMA1, GDAC, ICOADS, C-RAID, and IMMT datasets.""" + """Class for mapping Common Data Model (CDM) elements from IMMA1, GDAC, ICOADS, C-RAID, MAROB, Pub47, and IMMT datasets.""" def __init__(self, imodel): self.imodel = imodel @@ -497,10 +519,27 @@ def datetime_craid( pd.DatetimeIndex DatetimeIndex of converted dates. """ - if series.empty: - return pd.DatetimeIndex([]) - data_1d = series.values.ravel() - return pd.to_datetime(data_1d, format=format, errors="coerce") + return series_strptime(series, format) + + def datetime_marob( + self, series: pd.Series, format: str = "%d.%m.%y %H:%M:%S,%f" + ) -> pd.Series: + """ + Convert MAROB date strings to pandas datetime. + + Parameters + ---------- + series : pd.Series + Series of date strings. + format : str, optional + Datetime format string (default: "%d.%m.%y %H:%M:%S,%f"). + + Returns + ------- + pd.Series + Series of converted dates. + """ + return series_strptime(series, format) def df_col_join(self, df: pd.DataFrame, sep: str) -> pd.Series: """ @@ -537,6 +576,7 @@ def float_opposite(self, series: pd.Series) -> pd.Series: pd.Series Series with negated values. """ + series = series.astype(float) return -series def select_column(self, df: pd.DataFrame) -> pd.Series: @@ -580,9 +620,9 @@ def float_scale(self, series: pd.Series, factor: float = 1) -> pd.Series: pd.Series Scaled Series, or empty float Series if input is non-numeric. """ - if pd.api.types.is_numeric_dtype(series): - return series * factor - return pd.Series(dtype=float, name=series.name) + scaled = pd.to_numeric(series, errors="coerce") * factor + scaled.name = series.name + return scaled def integer_to_float(self, s: pd.Series) -> pd.Series: """ @@ -843,6 +883,54 @@ def temperature_celsius_to_kelvin(self, df: pd.DataFrame) -> pd.Series: result = result.iloc[:, 0] return pd.Series(result, dtype=float) + def velocity_kmh_in_ms(self, series: pd.Series) -> pd.Series: + """ + Convert velocity from kilometers per hour in meters per second. + + Parameters + ---------- + series : pd.Series + Series of velocity in kilometers per hour. + + Returns + ------- + pd.Series + Series of velocity in meters per second. + """ + return self.float_scale(series, 1 / 3.6) + + def velocity_kn_in_ms(self, series: pd.Series) -> pd.Series: + """ + Convert velocity from knots in meters per second. + + Parameters + ---------- + series : pd.Series + Series of velocity in kilometers per hour. + + Returns + ------- + pd.Series + Series of velocity in meters per second. + """ + return self.float_scale(series, 1852.0 / 3600.0) + + def pressue_hpa_in_pa(self, series: pd.Series) -> pd.Series: + """ + Convert pressure from hPa in Pa. + + Parameters + ---------- + series : pd.Series + Series of presuure in hPa. + + Returns + ------- + pd.Series + Series of pressure in Pa. + """ + return self.float_scale(series, 100) + def time_accuracy(self, series: pd.Series) -> pd.Series: """ Map time accuracy codes to seconds. @@ -969,3 +1057,34 @@ def gdac_longitude(self, df: pd.DataFrame) -> pd.Series: lon = df["LoLoLoLo"].copy() lon[df["Qc"].isin([5, 7])] *= -1 return lon + + def marob_location_quality(self, df: pd.DataFrame) -> pd.Series: + """ + Get MAROB location quality. + + Parameters + ---------- + df : pd.DataFrame + Input DataFrame with columns 'GEOGR_BREITE_FLAG' and 'GEOGR_LAENGE_FLAG'. + + Returns + ------- + pd.Series + Series of location quality flags. + + Raises + ------ + KeyError + If required columns are missing. + """ + return np.nan + # if ( + # "GEOGR_BREITE_FLAG" not in df.columns + # or "GEOGR_LAENGE_FLAG" not in df.columns + # ): + # raise KeyError( + # "DataFrame must contain 'GEOGR_BREITE_FLAG' and 'GEOGR_LAENGE_FLAG' columns" + # ) + # lat_flag = df["GEOGR_BREITE_FLAG"] + # lon_flag = df["GEOGR_LAENGE_FLAG"] + # return pd.Series([None] * len(lat_flag), index=lat_flag.idx) diff --git a/cdm_reader_mapper/core/databundle.py b/cdm_reader_mapper/core/databundle.py index 42e26e6e..c6336dc0 100755 --- a/cdm_reader_mapper/core/databundle.py +++ b/cdm_reader_mapper/core/databundle.py @@ -638,11 +638,15 @@ def replace_columns( db_._columns = db_._data.columns return self._return_db(db_, inplace) - def correct_datetime(self, inplace=False) -> DataBundle | None: + def correct_datetime( + self, imodel=None, inplace=False, **kwargs + ) -> DataBundle | None: """Correct datetime information in :py:attr:`data`. Parameters ---------- + imodel: str, optional + Name of the MFD/CDM data model. inplace: bool If ``True`` overwrite :py:attr:`data` in :py:class:`~DataBundle` else return a copy of :py:class:`~DataBundle` with datetime-corrected values in :py:attr:`data`. @@ -667,13 +671,19 @@ def correct_datetime(self, inplace=False) -> DataBundle | None: ---- For more information see :py:func:`correct_datetime` """ + imodel = imodel or self._imodel db_ = self._get_db(inplace) - db_._data = correct_datetime(db_._data, db_._imodel) + db_._data = correct_datetime(db_._data, imodel, **kwargs) return self._return_db(db_, inplace) - def validate_datetime(self) -> pd.DataFrame: + def validate_datetime(self, imodel=None, **kwargs) -> pd.DataFrame: """Validate datetime information in :py:attr:`data`. + Parameters + ---------- + imodel: str, optional + Name of the MFD/CDM data model. + Returns ------- pandas.DataFrame @@ -695,13 +705,16 @@ def validate_datetime(self) -> pd.DataFrame: ---- For more information see :py:func:`validate_datetime` """ - return validate_datetime(self._data, self._imodel) + imodel = imodel or self._imodel + return validate_datetime(self._data, imodel, **kwargs) - def correct_pt(self, inplace=False) -> DataBundle | None: + def correct_pt(self, imodel=None, inplace=False, **kwargs) -> DataBundle | None: """Correct platform type information in :py:attr:`data`. Parameters ---------- + imodel: str, optional + Name of the MFD/CDM data model. inplace: bool If ``True`` overwrite :py:attr:`data` in :py:class:`~DataBundle` else return a copy of :py:class:`~DataBundle` with platform-corrected values in :py:attr:`data`. @@ -726,13 +739,19 @@ def correct_pt(self, inplace=False) -> DataBundle | None: ---- For more information see :py:func:`correct_pt` """ + imodel = imodel or self._imodel db_ = self._get_db(inplace) - db_._data = correct_pt(db_._data, db_._imodel) + db_._data = correct_pt(db_._data, imodel, **kwargs) return self._return_db(db_, inplace) - def validate_id(self, **kwargs) -> pd.DataFrame: + def validate_id(self, imodel=None, **kwargs) -> pd.DataFrame: """Validate station id information in :py:attr:`data`. + Parameters + ---------- + imodel: str, optional + Name of the MFD/CDM data model. + Returns ------- pandas.DataFrame @@ -754,13 +773,16 @@ def validate_id(self, **kwargs) -> pd.DataFrame: ---- For more information see :py:func:`validate_id` """ - return validate_id(self._data, self._imodel, **kwargs) + imodel = imodel or self._imodel + return validate_id(self._data, imodel, **kwargs) - def map_model(self, inplace=False, **kwargs) -> DataBundle | None: + def map_model(self, imodel=None, inplace=False, **kwargs) -> DataBundle | None: """Map :py:attr:`data` to the Common Data Model. Parameters ---------- + imodel: str, optional + Name of the MFD/CDM data model. inplace: bool If ``True`` overwrite :py:attr:`data` in :py:class:`~DataBundle` else return a copy of :py:class:`~DataBundle` with :py:attr:`data` as CDM tables. @@ -779,16 +801,31 @@ def map_model(self, inplace=False, **kwargs) -> DataBundle | None: ---- For more information see :py:func:`map_model` """ + imodel = imodel or self._imodel db_ = self._get_db(inplace) - _tables = map_model(db_._data, db_._imodel, **kwargs) + _tables = map_model(db_._data, imodel, **kwargs) db_._mode = "tables" db_._columns = _tables.columns db_._data = _tables return self._return_db(db_, inplace) - def write(self, **kwargs) -> None: + def write( + self, dtypes=None, parse_dates=None, encoding=None, mode=None, **kwargs + ) -> None: """Write :py:attr:`data` on disk. + Parameters + ---------- + dtypes: dict, optional + Data types of ``data``. + parse_dates: list, optional + Information how to parse dates on ``data`` + encoding: str, optional + The encoding of the input file. Overrides the value in the imodel schema file. + mode: str, optional + Data mode ("data" or "tables") + Default: "data" + Examples -------- >>> db.write() @@ -807,13 +844,17 @@ def write(self, **kwargs) -> None: If :py:attr:`mode` is "data" write data using :py:func:`write_data`. If :py:attr:`mode` is "tables" write data using :py:func:`write_tables`. """ + dtypes = dtypes or self._dtypes + parse_dates = parse_dates or self._parse_dates + encoding = encoding or self._encoding + mode = mode or self._mode write( data=self._data, mask=self._mask, - dtypes=self._dtypes, - parse_dates=self._parse_dates, - encoding=self._encoding, - mode=self._mode, + dtypes=dtypes, + parse_dates=parse_dates, + encoding=encoding, + mode=mode, **kwargs, ) @@ -959,7 +1000,7 @@ def remove_duplicates(self, inplace=False, **kwargs) -> DataBundle | None: Returns ------- :py:class:`~DataBundle` or None - DataBundle without duplictaed rows or None if ``inplace=True``. + DataBundle without duplicated rows or None if ``inplace=True``. Note ---- @@ -991,3 +1032,34 @@ def remove_duplicates(self, inplace=False, **kwargs) -> DataBundle | None: header_ = db_.DupDetect.result db_._data = db_._data[db_._data.index.isin(header_.index)] return self._return_db(db_, inplace) + + def convert_comma_as_decimal_float( + self, columns, inplace=False + ) -> DataBundle | None: + """Replace commas with dots and convert to floats. + + Parameters + ---------- + columns: list, pd.Index or pd.MultiIndex + List of commas to convert. + inplace: bool + If ``True`` overwrite :py:attr:`data` in :py:class:`~DataBundle` + else return a copy of :py:class:`~DataBundle` with :py:attr:`data` containing no duplicates. + Default: False + + Returns + ------- + :py:class:`~DataBundle` or None + DataBundle without converted ffloat entries or None if ``inplace=True``. + """ + if not isinstance(self._data, pd.DataFrame): + raise NotImplementedError( + f"This function is only implemented for pd.DataFrames, not {type(self._data)}." + ) + + db_ = self._get_db(inplace) + for column in columns: + db_._data[column] = ( + db_[column].astype(str).str.replace(",", ".", regex=False).astype(float) + ) + return self._return_db(db_, inplace) diff --git a/cdm_reader_mapper/mdf_reader/utils/utilities.py b/cdm_reader_mapper/mdf_reader/utils/utilities.py index bd211027..51cd5105 100755 --- a/cdm_reader_mapper/mdf_reader/utils/utilities.py +++ b/cdm_reader_mapper/mdf_reader/utils/utilities.py @@ -251,6 +251,7 @@ def _read_data_from_file( def read_csv( filepath: Path, + delimiter: str = ",", col_subset: str | list | None = None, column_names: pd.Index | pd.MultiIndex | None = None, **kwargs, @@ -280,7 +281,7 @@ def read_csv( reader=pd.read_csv, col_subset=col_subset, column_names=column_names, - reader_kwargs=kwargs, + reader_kwargs={"delimiter": delimiter, **kwargs}, iterator=True, ) diff --git a/cdm_reader_mapper/properties.py b/cdm_reader_mapper/properties.py index 9c82fade..28d7c329 100755 --- a/cdm_reader_mapper/properties.py +++ b/cdm_reader_mapper/properties.py @@ -8,7 +8,7 @@ ObjectTypes = Literal["str", "object", "key", "datetime"] -SupportedDataModels = Literal["craid", "gdac", "icoads", "pub47"] +SupportedDataModels = Literal["craid", "gdac", "icoads", "pub47", "marob"] SupportedFileTypes = Literal["csv", "parquet", "feather"] diff --git a/tests/test_cdm_mapper.py b/tests/test_cdm_mapper.py index 4969fff3..e831ed7d 100755 --- a/tests/test_cdm_mapper.py +++ b/tests/test_cdm_mapper.py @@ -347,9 +347,7 @@ def test_convert_dtype(value, atts, expected): ("location_quality", [("c1", "LZ")], None, False, "idata"), ], ) -def test_extract_input_data( - imodel_maps, data_header, column, elements, default, use_default, exp -): +def test_extract_input_data(data_header, column, elements, default, use_default, exp): logger = logging_hdlr.init_logger(__name__, level="INFO") result = _extract_input_data( data_header, @@ -361,6 +359,8 @@ def test_extract_input_data( assert result[1] is use_default + print(result) + if exp == "idata": exp = data_header[elements[0]] elif isinstance(exp, list): @@ -398,6 +398,22 @@ def test_column_mapping(imodel_maps, imodel_functions, data_header, column, expe pd.testing.assert_series_equal(result, pd.Series(expected, name=column)) +def test_history_column_mapping(imodel_maps, imodel_functions, data_header): + logger = logging_hdlr.init_logger(__name__, level="INFO") + mapping_column = imodel_maps["header"]["history"] + column_atts = get_cdm_atts("header")["header"]["history"] + result = _column_mapping( + data_header, + mapping_column, + imodel_functions, + column_atts, + None, + "history", + logger, + ) + assert result.str.contains("Initial conversion from ICOADS R3.0.0T").all() + + def test_table_mapping( imodel_maps, imodel_functions, data_header, data_header_expected ): diff --git a/tests/test_mapping_functions.py b/tests/test_mapping_functions.py index 1603508c..d1a88e20 100755 --- a/tests/test_mapping_functions.py +++ b/tests/test_mapping_functions.py @@ -443,26 +443,52 @@ def test_datetime_utcnow(): "df, expected", [ ( - pd.DataFrame([["2025-11-02 10:30:00.000"]]), - pd.DatetimeIndex([pd.Timestamp("2025-11-02 10:30:00")]), + pd.Series(["2025-11-02 10:30:00.000"]), + pd.Series(pd.Timestamp("2025-11-02 10:30:00")), ), ( - pd.DataFrame([["2025-11-02 10:30:00.000"], ["2025-12-03 15:45:00.123"]]), - pd.DatetimeIndex( + pd.Series(["2025-11-02 10:30:00.000", "2025-12-03 15:45:00.123"]), + pd.Series( [ pd.Timestamp("2025-11-02 10:30:00"), pd.Timestamp("2025-12-03 15:45:00.123"), ] ), ), - (pd.DataFrame([["invalid"]]), pd.DatetimeIndex([pd.NaT])), - (pd.DataFrame([]), pd.DatetimeIndex([])), + (pd.Series(["invalid"]), pd.Series([pd.NaT])), + (pd.Series([]), pd.Series([])), ], ) def test_datetime_craid(df, expected): obj = mapping_functions("dummy_model") result = obj.datetime_craid(df) - pd.testing.assert_index_equal(result, expected) + pd.testing.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "df, expected", + [ + ( + pd.Series(["02.11.25 10:30:00,000"]), + pd.Series([pd.Timestamp("2025-11-02 10:30:00")]), + ), + ( + pd.Series(["02.11.25 10:30:00,000", "03.12.25 15:45:00,123"]), + pd.Series( + [ + pd.Timestamp("2025-11-02 10:30:00"), + pd.Timestamp("2025-12-03 15:45:00.123"), + ] + ), + ), + (pd.Series(["invalid"]), pd.Series([pd.NaT])), + (pd.Series([]), pd.Series([])), + ], +) +def test_datetime_marob(df, expected): + obj = mapping_functions("dummy_model") + result = obj.datetime_marob(df) + pd.testing.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -487,10 +513,10 @@ def test_df_col_join_series(df, sep, expected): @pytest.mark.parametrize( "df, expected", [ - (5.0, -5.0), - (-3.2, 3.2), - (0.0, -0.0), - (123.456, -123.456), + (pd.Series([5.0]), pd.Series([-5.0])), + (pd.Series([-3.2]), pd.Series([3.2])), + (pd.Series([0.0]), pd.Series([-0.0])), + (pd.Series([123.456]), pd.Series([-123.456])), (pd.Series([1.0, -2.0, 3.5]), pd.Series([-1.0, 2.0, -3.5])), ], ) @@ -535,7 +561,11 @@ def test_select_column(df, expected): 10, pd.Series([], dtype=float, name="E"), ), - (pd.Series(["x", "y", "z"], name="F"), 3, pd.Series([], dtype=float, name="F")), + ( + pd.Series(["x", "y", "z"], name="F"), + 3, + pd.Series([np.nan, np.nan, np.nan], dtype=float, name="F"), + ), ], ) def test_float_scale(input_s, factor, expected): @@ -823,6 +853,52 @@ def test_temperature_celsius_to_kelvin(input_df, imodel, expected): pd.testing.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "input_values, expected_values", + [ + ([0, 36, 72], [0.0, 10.0, 20.0]), + ([3.6, 7.2], [1.0, 2.0]), + ([0], [0.0]), + ], +) +def test_velocity_kmh_in_ms(input_values, expected_values): + obj = mapping_functions("dummy_model") + series = pd.Series(input_values) + expected = pd.Series(expected_values) + result = obj.velocity_kmh_in_ms(series) + pd.testing.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "input_values, expected_values", + [ + ([0, 10, 20], [0.0, 5.14444444444, 10.2888888889]), + ([1, 2], [0.51444444444, 1.0288888889]), + ], +) +def test_velocity_kn_in_ms(input_values, expected_values): + obj = mapping_functions("dummy_model") + series = pd.Series(input_values) + expected = pd.Series(expected_values) + result = obj.velocity_kn_in_ms(series) + pd.testing.assert_series_equal(result, expected, atol=1e-8) + + +@pytest.mark.parametrize( + "input_values, expected_values", + [ + ([1013, 1000, 950], [101300, 100000, 95000]), + ([0], [0]), + ], +) +def test_pressure_hpa_in_pa(input_values, expected_values): + obj = mapping_functions("dummy_model") + series = pd.Series(input_values) + expected = pd.Series(expected_values) + result = obj.pressue_hpa_in_pa(series) + pd.testing.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "input_series, expected", [