glamod · ludwiglierhammer · Feb 21, 2025 · Feb 21, 2025 · Feb 21, 2025 · Feb 21, 2025
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,5 +1,5 @@
 default_language_version:
-  python: python3.12
+  python: python3.13
 
 repos:
   - repo: https://github.com/asottile/pyupgrade

diff --git a/cdm_reader_mapper/duplicates/duplicates.py b/cdm_reader_mapper/duplicates/duplicates.py
@@ -56,10 +56,15 @@ def _datetime_now():
 
         return now.strftime("%Y-%m-%d %H:%M:%S")
 
+    def _add_history(x):
+        """Append ``addition`` to ``x``; an empty ``x`` becomes ``addition``."""
+        # "; " separates entries, so it is only emitted when something precedes it.
+        return f"{x}; {addition}" if x else addition
+
     indexes = list(indexes)
     history_tstmp = _datetime_now()
-    addition = "".join([f"; {history_tstmp}. {add}" for add in _histories.items()])
-    df.loc[indexes, "history"] = df.loc[indexes, "history"] + addition
+    addition = "; ".join([f"{history_tstmp}. {add}" for add in _histories.items()])
+    df.loc[indexes, "history"] = df.loc[indexes, "history"].apply(_add_history)
     return df
 
 
@@ -400,6 +405,21 @@ def _count_nulls(row):
     return df.reindex(indexes_[1])
 
 
+def fill_columns(df):
+    """Add missing duplicate-check columns to ``df`` in place and return it."""
+    default_factories = {
+        "report_id": lambda: df.index.astype(str),
+        "report_quality": lambda: 2,
+        "history": lambda: "",
+        "duplicate_status": lambda: 4,
+        "duplicates": lambda: "",
+    }
+    for column, make_default in default_factories.items():
+        if column not in df.columns:
+            df[column] = make_default()
+    return df
+
+
 class Comparer:
     """Class to compare DataFrame with recordlinkage Comparer."""
 
@@ -472,6 +492,7 @@ def duplicate_check(
         cdm_reader_mapper.DupDetect
     """
     data = data.reset_index(drop=True)
+    data = fill_columns(data)
 
     if reindex_by_null is True:
         data = reindex_nulls(data)

diff --git a/mirakel_test/probe.sql b/mirakel_test/probe.sql
@@ -0,0 +1,25 @@
+select
+schiffsname as primary_station_id,
+tagebuchart,
+tagebuchnr,
+tagebuchzusatz,
+belegbogennr_num,
+mess_datum,
+mess_datum_moz as report_timestamp,
+geogr_breite as latitude,  -- "Breite" is German for latitude
+geogr_laenge as longitude,  -- "Laenge" is German for longitude
+lufttemp,
+wassertemp,
+fahrtrichtung as station_course,
+fahrtgeschwindigkeit as station_speed
+from
+prj_histor.marob_histor mar,
+prj_histor.tagebuecher tgb,
+schiffe_namen_rufzeichen namen
+where
+tgb.tagebuch_nummer = mar.tagebuchnr
+and tgb.schiffs_id = namen.schiffs_id
+and mar.tagebuchart='S'
+and tgb.tagebuch_typ_id = 4
+and tgb.tagebuch_nummer=5057
+;
diff --git a/mirakel_test/test_dupdetect.py b/mirakel_test/test_dupdetect.py
@@ -0,0 +1,20 @@
+"""Duplicate check for test data."""
+
+from cdm_reader_mapper.common.json_dict import open_json_file
+from cdm_reader_mapper import DataBundle
+
+import pandas as pd
+
+ifile = "test_in.json"  # input path, relative to the working directory
+
+idict = open_json_file(ifile)  # presumably the parsed JSON content; verify helper
+
+data = pd.DataFrame(idict)
+data.columns = [c.lower() for c in data.columns]  # lower-case the column labels
+
+db = DataBundle(tables=data)
+
+db.duplicate_check()  # return value is not used here
+df_flagged = db.flag_duplicates(overwrite=False)
+
+df_flagged.to_csv("test_out.json")  # NOTE(review): CSV written to a .json name