Skip to content
Draft
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
default_language_version:
python: python3.12
python: python3.13

repos:
- repo: https://github.com/asottile/pyupgrade
Expand Down
25 changes: 23 additions & 2 deletions cdm_reader_mapper/duplicates/duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,15 @@ def _datetime_now():

return now.strftime("%Y-%m-%d %H:%M:%S")

def _add_history(x):
    """Return the history value ``x`` extended by the enclosing ``addition``.

    Parameters
    ----------
    x : object
        Existing history value of a single row.

    Returns
    -------
    str
        ``"<x>; <addition>"`` when ``x`` is a non-empty string, otherwise
        ``addition`` on its own.
    """
    # Only a non-empty string counts as existing history. A plain truthiness
    # test is not enough: float NaN (a missing cell) is truthy and would be
    # rendered as the literal text "nan" in front of the new entry.
    if isinstance(x, str) and x:
        return f"{x}; {addition}"
    return addition

indexes = list(indexes)
history_tstmp = _datetime_now()
addition = "".join([f"; {history_tstmp}. {add}" for add in _histories.items()])
df.loc[indexes, "history"] = df.loc[indexes, "history"] + addition
addition = "".join([f"{history_tstmp}. {add}" for add in _histories.items()])
df.loc[indexes, "history"] = df.loc[indexes, "history"].apply(_add_history)
return df


Expand Down Expand Up @@ -400,6 +405,21 @@ def _count_nulls(row):
return df.reindex(indexes_[1])


def fill_columns(df):
    """Ensure ``df`` has every column the duplicate check relies on.

    Columns that already exist are left untouched. Missing ones are created
    in this order with these fill values:

    - ``report_id``: the row index converted to strings
    - ``report_quality``: ``2``
    - ``history``: ``""``
    - ``duplicate_status``: ``4``
    - ``duplicates``: ``""``

    Parameters
    ----------
    df : pandas.DataFrame
        Table to complete; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # report_id has no constant fill value: it is derived from the index.
    if "report_id" not in df.columns:
        df["report_id"] = df.index.astype(str)

    constant_fills = (
        ("report_quality", 2),
        ("history", ""),
        ("duplicate_status", 4),
        ("duplicates", ""),
    )
    for column, fill_value in constant_fills:
        if column not in df.columns:
            df[column] = fill_value

    return df


class Comparer:
"""Class to compare DataFrame with recordlinkage Comparer."""

Expand Down Expand Up @@ -472,6 +492,7 @@ def duplicate_check(
cdm_reader_mapper.DupDetect
"""
data = data.reset_index(drop=True)
data = fill_columns(data)

if reindex_by_null is True:
data = reindex_nulls(data)
Expand Down
25 changes: 25 additions & 0 deletions mirakel_test/probe.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- Probe query: select the marob_histor rows that belong to logbook number
-- 5057, joined to the logbook table (tagebuecher) and the ship-name table
-- (schiffe_namen_rufzeichen). Some columns are aliased to other names
-- (primary_station_id, report_timestamp, latitude, longitude,
-- station_course, station_speed).
select
    schiffsname as primary_station_id,
    tagebuchart,
    tagebuchnr,
    tagebuchzusatz,
    belegbogennr_num,
    mess_datum,
    mess_datum_moz as report_timestamp,
    -- German "Breite" is latitude and "Laenge" is longitude.
    geogr_breite as latitude,
    geogr_laenge as longitude,
    lufttemp,
    wassertemp,
    fahrtrichtung as station_course,
    fahrtgeschwindigkeit as station_speed
from
    prj_histor.marob_histor mar
    join prj_histor.tagebuecher tgb
        on tgb.tagebuch_nummer = mar.tagebuchnr
    join schiffe_namen_rufzeichen namen
        on namen.schiffs_id = tgb.schiffs_id
where
    mar.tagebuchart = 'S'
    and tgb.tagebuch_typ_id = 4
    and tgb.tagebuch_nummer = 5057
;
20 changes: 20 additions & 0 deletions mirakel_test/test_dupdetect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""Duplicate check for test data."""

from cdm_reader_mapper.common.json_dict import open_json_file
from cdm_reader_mapper import DataBundle

import pandas as pd

def main():
    """Run the duplicate check on ``test_in.json`` and write the result.

    Reads ``test_in.json`` from the current working directory, builds a
    DataFrame with lower-cased column labels, runs the ``DataBundle``
    duplicate check and writes the flagged result to ``test_out.json``.
    """
    ifile = "test_in.json"
    idict = open_json_file(ifile)

    data = pd.DataFrame(idict)
    data.columns = [c.lower() for c in data.columns]

    db = DataBundle(tables=data)
    db.duplicate_check()
    df_flagged = db.flag_duplicates(overwrite=False)

    # NOTE(review): the output is CSV-formatted although the file name ends
    # in ".json" -- confirm which of the two is intended.
    df_flagged.to_csv("test_out.json")


# This file name matches pytest's ``test_*.py`` collection pattern, so the
# work must not run at import time: collection would otherwise execute the
# whole job, or fail when ``test_in.json`` is missing.
if __name__ == "__main__":
    main()
Loading