diff --git a/cdm_reader_mapper/cdm_mapper/mapper.py b/cdm_reader_mapper/cdm_mapper/mapper.py index 16f390a4..e52aff72 100755 --- a/cdm_reader_mapper/cdm_mapper/mapper.py +++ b/cdm_reader_mapper/cdm_mapper/mapper.py @@ -399,8 +399,6 @@ def _process_chunk( logger=logger, ) - table_df.columns = pd.MultiIndex.from_product([[table], table_df.columns]) - if is_reader: table_df.to_csv( cdm_tables[table]["buffer"], @@ -433,6 +431,9 @@ def _finalize_output(cdm_tables, logger): else: df = meta.get("df", pd.DataFrame()) + df = df.set_index("report_id", drop=False) + df.columns = pd.MultiIndex.from_product([[table], df.columns]) + final_tables.append(df) if not final_tables: diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/header.json b/cdm_reader_mapper/cdm_mapper/tables/marob/header.json index 0e5f5677..4b7b29ce 100755 --- a/cdm_reader_mapper/cdm_mapper/tables/marob/header.json +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/header.json @@ -38,10 +38,12 @@ }, "longitude": { "elements": "GEOGR_LAENGE", + "transform": "convert_to_decimal", "decimal_places": 1 }, "latitude": { "elements": "GEOGR_BREITE", + "transform": "convert_to_decimal", "decimal_places": 1 }, "location_quality": { @@ -53,10 +55,14 @@ "station_speed": { "elements": "FAHRTGESCHWINDIGKEIT", "transform": "velocity_kn_in_ms", + "kwargs": { + "convert_to_decimal_float": true + }, "decimal_places": 2 }, "station_course": { "elements": "FAHRTRICHTUNG", + "transform": "convert_to_decimal", "decimal_places": 0 }, "height_of_station_above_local_ground": { @@ -65,6 +71,7 @@ }, "height_of_station_above_sea_level": { "elements": "STATIONSHOEHE_MSL", + "transform": "convert_to_decimal", "fill_value": 0, "decimal_places": 1 }, @@ -82,9 +89,6 @@ "default": 1, "decimal_places": 0 }, - "report_time_quality": { - "default": 2 - }, "report_quality": { "default": 2 }, diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-at.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-at.json index aeeeddaf..bf751b4e 100755 --- a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-at.json +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-at.json @@ -6,6 +6,7 @@ }, "observation_height_above_station_surface": { "elements": "SENSORHOEHE_WAS_TT", + "transform": "convert_to_decimal", "decimal_places": 1 }, "observed_variable": { @@ -14,19 +15,20 @@ "observation_value": { "elements": "LUFTTEMPERATUR", "transform": "temperature_celsius_to_kelvin", + "kwargs": { + "convert_to_decimal_float": true + }, "decimal_places": 2 }, "units": { "default": 5 }, - "quality_flag": { - "default": 2 - }, "original_units": { "default": 60 }, "original_value": { "elements": "LUFTTEMPERATUR", + "transform": "convert_to_decimal", "decimal_places": 1 }, "conversion_method": { diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-dpt.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-dpt.json index 98314ee0..23c81f0b 100755 --- a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-dpt.json +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-dpt.json @@ -6,6 +6,7 @@ }, "observation_height_above_station_surface": { "elements": "SENSORHOEHE_WAS_TT", + "transform": "convert_to_decimal", "decimal_places": 1 }, "observed_variable": { @@ -14,19 +15,20 @@ "observation_value": { "elements": "TAUPUNKTTEMPERATUR", "transform": "temperature_celsius_to_kelvin", + "kwargs": { + "convert_to_decimal_float": true + }, "decimal_places": 2 }, "units": { "default": 5 }, - "quality_flag": { - "default": 2 - }, "original_units": { "default": 60 }, "original_value": { "elements": "TAUPUNKTTEMPERATUR", + "transform": "convert_to_decimal", "decimal_places": 1 }, "conversion_method": { diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-slp.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-slp.json index 954f9805..7c7ae963 100755 --- a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-slp.json +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-slp.json @@ -6,6 +6,7 @@ }, "observation_height_above_station_surface": { "elements": "BAROMETERHOEHE_MSL", + "transform": "convert_to_decimal", "decimal_places": 1 }, "observed_variable": { @@ -14,19 +15,20 @@ "observation_value": { "elements": "LUFTDRUCK_STATIONSHOEHE", "transform": "pressue_hpa_in_pa", + "kwargs": { + "convert_to_decimal_float": true + }, "decimal_places": 0 }, "units": { "default": 32 }, - "quality_flag": { - "default": 2 - }, "original_units": { "default": 530 }, "original_value": { "elements": "LUFTDRUCK_STATIONSHOEHE", + "transform": "convert_to_decimal", "decimal_places": 1 }, "conversion_method": { diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-sst.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-sst.json index 1cc5fa46..7d02b78b 100755 --- a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-sst.json +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-sst.json @@ -7,6 +7,9 @@ "observation_height_above_station_surface": { "elements": "MESSTIEFE", "transform": "float_opposite", + "kwargs": { + "convert_to_decimal_float": true + }, "decimal_places": 1 }, "observed_variable": { @@ -15,19 +18,20 @@ "observation_value": { "elements": "WASSERTEMPERATUR", "transform": "temperature_celsius_to_kelvin", + "kwargs": { + "convert_to_decimal_float": true + }, "decimal_places": 2 }, "units": { "default": 5 }, - "quality_flag": { - "default": 2 - }, "original_units": { "default": 60 }, "original_value": { "elements": "WASSERTEMPERATUR", + "transform": "convert_to_decimal", "decimal_places": 1 }, "conversion_method": { diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-wbt.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-wbt.json index f79eb5e5..964eef9a 100755 --- a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-wbt.json +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-wbt.json @@ -6,6 +6,7 @@ }, "observation_height_above_station_surface": { "elements": "SENSORHOEHE_WAS_TT", + "transform": "convert_to_decimal", "decimal_places": 1 }, "observed_variable": { @@ -14,19 +15,20 @@ "observation_value": { "elements": "FEUCHTTEMPERATUR", "transform": "temperature_celsius_to_kelvin", + "kwargs": { + "convert_to_decimal_float": true + }, "decimal_places": 2 }, "units": { "default": 5 }, - "quality_flag": { - "default": 2 - }, "original_units": { "default": 60 }, "original_value": { "elements": "FEUCHTTEMPERATUR", + "transform": "convert_to_decimal", "decimal_places": 1 }, "conversion_method": { diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-wd.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-wd.json index b8e48a4e..c1d87f27 100755 --- a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-wd.json +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-wd.json @@ -6,6 +6,7 @@ }, "observation_height_above_station_surface": { "elements": "SENSORHOEHE_WAS_FF", + "transform": "convert_to_decimal", "decimal_places": 1 }, "observed_variable": { @@ -13,19 +14,18 @@ }, "observation_value": { "elements": "WINDRICHTUNG", + "transform": "convert_to_decimal", "decimal_places": 0 }, "units": { "default": 320 }, - "quality_flag": { - "default": 2 - }, "original_units": { "default": 320 }, "original_value": { "elements": "WINDRICHTUNG", + "transform": "convert_to_decimal", "decimal_places": 0 }, "conversion_flag": { diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-ws.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-ws.json index 0b83ebb2..7301e9c5 100755 --- a/cdm_reader_mapper/cdm_mapper/tables/marob/observations-ws.json +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations-ws.json @@ -6,6 +6,7 @@ }, "observation_height_above_station_surface": { "elements": "SENSORHOEHE_WAS_FF", + "transform": "convert_to_decimal", "decimal_places": 1 }, "observed_variable": { @@ -14,20 +15,21 @@ "observation_value": { "elements": "WINDGESCHWINDIGKEIT", "transform": "velocity_kmh_in_ms", + "kwargs": { + "convert_to_decimal_float": true + }, "decimal_places": 1 }, "units": { "default": 731 }, - "quality_flag": { - "default": 2 - }, "original_precision": {}, "original_units": { "default": 741 }, "original_value": { "elements": "WINDGESCHWINDIGKEIT", + "transform": "convert_to_decimal", "decimal_places": 1 }, "conversion_flag": { diff --git a/cdm_reader_mapper/cdm_mapper/tables/marob/observations.json b/cdm_reader_mapper/cdm_mapper/tables/marob/observations.json index e5594732..ba9496a8 100755 --- a/cdm_reader_mapper/cdm_mapper/tables/marob/observations.json +++ b/cdm_reader_mapper/cdm_mapper/tables/marob/observations.json @@ -30,10 +30,12 @@ }, "longitude": { "elements": "GEOGR_LAENGE", + "transform": "convert_to_decimal", "decimal_places": 1 }, "latitude": { "elements": "GEOGR_BREITE", + "transform": "convert_to_decimal", "decimal_places": 1 }, "crs": { @@ -51,6 +53,9 @@ "spatial_representativeness": { "default": 3 }, + "quality_flag": { + "default": 2 + }, "numerical_precision": {}, "sensor_automation_status": { "elements": "DATENQUELLE_ID", diff --git a/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py b/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py index 68583c9b..ac31d683 100755 --- a/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py +++ b/cdm_reader_mapper/cdm_mapper/utils/mapping_functions.py @@ -541,6 +541,22 @@ def datetime_marob( """ return series_strptime(series, format) + def convert_to_decimal(self, series): + """ + Convert a string series to a float series with decimals. + + Parameters + ---------- + series : pd.Series + Series of string values. + + Returns + ------- + pd.Series + Series of decimal floats. + """ + return series.astype(str).str.replace(",", ".", regex=False).astype(float) + def df_col_join(self, df: pd.DataFrame, sep: str) -> pd.Series: """ Join all columns of a pandas DataFrame into a single Series of strings. @@ -562,7 +578,9 @@ def df_col_join(self, df: pd.DataFrame, sep: str) -> pd.Series: return df.astype(str).agg(sep.join, axis=1) - def float_opposite(self, series: pd.Series) -> pd.Series: + def float_opposite( + self, series: pd.Series, convert_to_decimal_float=False + ) -> pd.Series: """ Return the opposite (negation) of a numeric Series. @@ -576,6 +594,8 @@ def float_opposite(self, series: pd.Series) -> pd.Series: pd.Series Series with negated values. """ + if convert_to_decimal_float is True: + series = self.convert_to_decimal(series) series = series.astype(float) return -series @@ -856,7 +876,9 @@ def string_join_add( ) return pd.Series(result, index=df.index, dtype="object") - def temperature_celsius_to_kelvin(self, df: pd.DataFrame) -> pd.Series: + def temperature_celsius_to_kelvin( + self, df: pd.DataFrame, convert_to_decimal_float=False + ) -> pd.Series: """ Convert temperatures from Celsius to Kelvin using the model-specific method. @@ -870,6 +892,9 @@ def temperature_celsius_to_kelvin(self, df: pd.DataFrame) -> pd.Series: pd.Series Series of temperatures in Kelvin. """ + if convert_to_decimal_float is True: + df = self.convert_to_decimal(df) + method = find_entry(self.imodel, c2k_methods) if not method: method = "method_a" @@ -883,7 +908,9 @@ def temperature_celsius_to_kelvin(self, df: pd.DataFrame) -> pd.Series: result = result.iloc[:, 0] return pd.Series(result, dtype=float) - def velocity_kmh_in_ms(self, series: pd.Series) -> pd.Series: + def velocity_kmh_in_ms( + self, series: pd.Series, convert_to_decimal_float=False + ) -> pd.Series: """ Convert velocity from kilometers per hour in meters per second. @@ -897,9 +924,13 @@ def velocity_kmh_in_ms(self, series: pd.Series) -> pd.Series: pd.Series Series of velocity in meters per second. """ + if convert_to_decimal_float is True: + series = self.convert_to_decimal(series) return self.float_scale(series, 1 / 3.6) - def velocity_kn_in_ms(self, series: pd.Series) -> pd.Series: + def velocity_kn_in_ms( + self, series: pd.Series, convert_to_decimal_float=False + ) -> pd.Series: """ Convert velocity from knots in meters per second. @@ -913,9 +944,13 @@ def velocity_kn_in_ms(self, series: pd.Series) -> pd.Series: pd.Series Series of velocity in meters per second. """ + if convert_to_decimal_float is True: + series = self.convert_to_decimal(series) return self.float_scale(series, 1852.0 / 3600.0) - def pressue_hpa_in_pa(self, series: pd.Series) -> pd.Series: + def pressue_hpa_in_pa( + self, series: pd.Series, convert_to_decimal_float=False + ) -> pd.Series: """ Convert pressure from hPa in Pa. @@ -929,6 +964,8 @@ def pressue_hpa_in_pa(self, series: pd.Series) -> pd.Series: pd.Series Series of pressure in Pa. """ + if convert_to_decimal_float is True: + series = self.convert_to_decimal(series) return self.float_scale(series, 100) def time_accuracy(self, series: pd.Series) -> pd.Series: diff --git a/cdm_reader_mapper/common/getting_files.py b/cdm_reader_mapper/common/getting_files.py index 0a35b1bf..3a951305 100755 --- a/cdm_reader_mapper/common/getting_files.py +++ b/cdm_reader_mapper/common/getting_files.py @@ -137,7 +137,7 @@ def _get_file( def load_file( name: str | os.PathLike, github_url: str = "https://github.com/glamod/cdm-testdata", - branch: str = "main", + branch: str = "maron_testdata", # "main", cache: bool = True, cache_dir: str | Path = _default_cache_dir_, clear_cache: bool = False, diff --git a/cdm_reader_mapper/core/databundle.py b/cdm_reader_mapper/core/databundle.py index c6336dc0..dbc14cae 100755 --- a/cdm_reader_mapper/core/databundle.py +++ b/cdm_reader_mapper/core/databundle.py @@ -1050,7 +1050,7 @@ def convert_comma_as_decimal_float( Returns ------- :py:class:`~DataBundle` or None - DataBundle without converted ffloat entries or None if ``inplace=True``. + DataBundle without converted float entries or None if ``inplace=True``. """ if not isinstance(self._data, pd.DataFrame): raise NotImplementedError( diff --git a/cdm_reader_mapper/data/__init__.py b/cdm_reader_mapper/data/__init__.py index c09823f7..92a17d12 100755 --- a/cdm_reader_mapper/data/__init__.py +++ b/cdm_reader_mapper/data/__init__.py @@ -4,7 +4,7 @@ import logging -from urllib.error import HTTPError +from requests.exceptions import HTTPError from cdm_reader_mapper.common import load_file from cdm_reader_mapper.cdm_mapper.properties import cdm_tables @@ -13,13 +13,16 @@ class LazyDataDict(dict): """Lazy data dict.""" - def __init__(self, loader, items): + def __init__(self, loader, items, attrs=None): super().__init__() self._loader = loader self._items = items + self._attrs = attrs or {} def __getitem__(self, key): """Make class subscriptable.""" + if key in self._attrs: + return self._attrs[key] if key not in self: path = self._items[key] self[key] = self._loader(path) @@ -208,6 +211,16 @@ def test_craid(self): """C-RAID 1260810 test data.""" return self._get_data_dict("2004-12-20_subset", "craid", "nc") + @property + def test_marob(self): + """IMMT test data.""" + return self._get_data_dict( + "2026-02-12_subset", + "marob", + "csv", + ";", + ) + @property def test_pub47(self): """Pub47 v202501 test data.""" @@ -225,10 +238,11 @@ def _load_file(self, ifile): return load_file(ifile) except HTTPError as err: logging.warning(err) + return None except OSError as err: raise err - def _get_data_dict(self, data_file, data_model, source_ext): + def _get_data_dict(self, data_file, data_model, source_ext, delimiter=","): drs = "/".join(data_model.split("_")) data_dict = { "source": f"{drs}/input/{data_model}_{data_file}.{source_ext}", @@ -243,8 +257,8 @@ def _get_data_dict(self, data_file, data_model, source_ext): f"{drs}/cdm_tables/{cdm_table}-{data_model}_{data_file}.psv" ) data_dict[f"cdm_{cdm_table}"] = cdm_table_file - # data_dict["cdm_path"] = cdm_table_file.parent - return LazyDataDict(self._load_file, data_dict) + + return LazyDataDict(self._load_file, data_dict, attrs={"delimiter": delimiter}) test_data = TestData() diff --git a/cdm_reader_mapper/mdf_reader/reader.py b/cdm_reader_mapper/mdf_reader/reader.py index ced76b50..264ace15 100755 --- a/cdm_reader_mapper/mdf_reader/reader.py +++ b/cdm_reader_mapper/mdf_reader/reader.py @@ -265,6 +265,7 @@ def read_data( imodel: str | None = None, col_subset: str | list | tuple | None = None, encoding: str | None = None, + delimiter: str | None = None, **kwargs, ) -> DataBundle: """Read MDF data which is already on a pre-defined data model. @@ -294,6 +295,8 @@ def read_data( Column labels could be both string or tuple. encoding : str, optional The encoding of the input file. Overrides the value in the imodel schema file. + delimiter : str, optional + The delimiter used in the input file. Overrides the value in the imodel schema file. Returns ------- @@ -321,13 +324,15 @@ def read_data( info_dict = open_json_file(info_file) if info_file else {} dtype = info_dict.get("dtypes", "object") parse_dates = info_dict.get("parse_dates", False) - encoding = encoding or info_dict.get("encoding", None) + encoding = encoding or info_dict.get("encoding") + delimiter = delimiter or info_dict.get("delimiter") data_kwargs.setdefault("dtype", dtype) data_kwargs.setdefault("parse_dates", parse_dates) data_kwargs.setdefault("encoding", encoding) mask_kwargs.setdefault("dtype", "boolean") + mask_kwargs.setdefault("delimiter", delimiter) data, mask, info = _read_data( data_file=data_file, diff --git a/cdm_reader_mapper/mdf_reader/utils/parser.py b/cdm_reader_mapper/mdf_reader/utils/parser.py index 3ba2e9ae..89daa9ba 100755 --- a/cdm_reader_mapper/mdf_reader/utils/parser.py +++ b/cdm_reader_mapper/mdf_reader/utils/parser.py @@ -407,8 +407,6 @@ def parse_netcdf( obj_cols = df.select_dtypes(include="object").columns for col in obj_cols: - print(df[col]) - print(df[col].str) s = df[col].str.decode("utf-8").str.strip() df[col] = s.map(lambda x: True if x == "" else x) diff --git a/tests/test_cdm_mapper.py b/tests/test_cdm_mapper.py index e831ed7d..383103d2 100755 --- a/tests/test_cdm_mapper.py +++ b/tests/test_cdm_mapper.py @@ -102,13 +102,26 @@ def data_header_expected(): def _map_model_test_data(data_model, encoding="utf-8", select=None, **kwargs): source = test_data[f"test_{data_model}"]["mdf_data"] - info = open_json_file(test_data[f"test_{data_model}"]["mdf_info"]) - df = pd.read_csv(source, dtype=info["dtypes"], encoding=encoding) + + mdf_info = test_data[f"test_{data_model}"]["mdf_info"] + if mdf_info is None: + dtypes = object + else: + info = open_json_file(test_data[f"test_{data_model}"]["mdf_info"]) + dtypes = info["dtypes"] + + delimiter = test_data[f"test_{data_model}"]["delimiter"] + + df = pd.read_csv(source, dtype=dtypes, delimiter=delimiter, encoding=encoding) + if ":" in df.columns[0]: df.columns = pd.MultiIndex.from_tuples(col.split(":") for col in df.columns) + result = map_model(df, data_model, **kwargs) + if not select: select = cdm_tables + for cdm_table in select: expected = pd.read_csv( test_data[f"test_{data_model}"][f"cdm_{cdm_table}"], @@ -119,6 +132,7 @@ def _map_model_test_data(data_model, encoding="utf-8", select=None, **kwargs): ) result_table = result[cdm_table].copy() result_table = result_table.dropna() + result_table = result_table.reset_index(drop=True) if "record_timestamp" in expected.columns: expected = expected.drop("record_timestamp", axis=1) @@ -359,8 +373,6 @@ def test_extract_input_data(data_header, column, elements, default, use_default, assert result[1] is use_default - print(result) - if exp == "idata": exp = data_header[elements[0]] elif isinstance(exp, list): @@ -588,6 +600,7 @@ def test_map_model_pub47(): "icoads_r302_d992", "craid", "gdac", + "marob", ], ) def test_map_model_test_data_basic(data_model): diff --git a/tests/test_reader_validator.py b/tests/test_reader_validator.py index d7c17b7a..bbdf6278 100755 --- a/tests/test_reader_validator.py +++ b/tests/test_reader_validator.py @@ -106,5 +106,4 @@ def test_validate_all_columns(sample_df, attributes): assert mask["DATE"].tolist() == expected_date expected_bool = [True, False, False, False, True] - print(mask["BOOL"]) assert mask["BOOL"].tolist() == expected_bool