From 1523cc62801074e0d14e8c52104d5d80dfb73aba Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Thu, 29 Jan 2026 17:07:47 +0100 Subject: [PATCH 1/7] style: ruff --- .github/workflows/ci-cd.yml | 7 +++++-- src/ms_blocking/ms_blocking.py | 4 ++-- src/ms_blocking/utils.py | 7 ++++--- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml index d674ed1..cb460dc 100644 --- a/.github/workflows/ci-cd.yml +++ b/.github/workflows/ci-cd.yml @@ -23,11 +23,14 @@ jobs: - name: Install package run: poetry install + #- name: Install ruff + # run: poetry install ruff + - name: Format with ruff - run: ruff format + run: poetry run ruff format - name: Test with pytest - run: poetry run pytest tests/ --cov=pycounts --cov-report=xml + run: poetry run pytest tests/ --cov=pycounts --cov-report=html - name: Build documentation run: poetry run make html --directory docs/ diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index d6ad2fb..0fdfae9 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -242,7 +242,7 @@ def __repr__(self): return f"AndNode{{{self.left}, {self.right}}}" def __eq__(self, other): - return self.left==other.left and self.right==other.right + return self.left == other.left and self.right == other.right def block(self, df, motives=False): # In order not to perform redundant computations, we first filter out the rows that were not considered by the first blocker before running the second blocker @@ -275,7 +275,7 @@ def __repr__(self): return f"OrNode{{{self.left}, {self.right}}}" def __eq__(self, other): - return self.left==other.left and self.right==other.right + return self.left == other.left and self.right == other.right def block(self, df, motives=False): coords_left = self.left.block(df, motives=motives) diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py index da01fa6..c32ebc1 100644 --- a/src/ms_blocking/utils.py +++ b/src/ms_blocking/utils.py @@ -546,7 +546,8 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: else: return [s for s in cleaned_items if len(s) > 0] -def scoring(data: pd.DataFrame, motives_column: str="motive") -> pd.Series: + +def scoring(data: pd.DataFrame, motives_column: str = "motive") -> pd.Series: """Add a score to a blocked DataFrame based on the number of motives Parameters @@ -565,11 +566,11 @@ def scoring(data: pd.DataFrame, motives_column: str="motive") -> pd.Series: # Check that we do have motives if motives_column not in data.columns: - raise ValueError(f"Specified motives column \"{motives_column}\" does not exist") + raise ValueError(f'Specified motives column "{motives_column}" does not exist') if "score" in data.columns: print("Renaming 'score' column to 'score_old'") data = data.rename(columns={"score": "score_old"}) scores = data[motives_column].apply(len) - return scores \ No newline at end of file + return scores From 4e14e6dfada8c41dc762c3401417eb23691d63cb Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Thu, 29 Jan 2026 17:58:33 +0100 Subject: [PATCH 2/7] refactor: rename Node to BlockerNode; fix: motives handling in AndNode --- .github/workflows/ci-cd.yml | 3 --- src/ms_blocking/ms_blocking.py | 29 +++++++++++++++++++---------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml index cb460dc..3b6e956 100644 --- a/.github/workflows/ci-cd.yml +++ b/.github/workflows/ci-cd.yml @@ -23,9 +23,6 @@ jobs: - name: Install package run: poetry install - #- name: Install ruff - # run: poetry install ruff - - name: Format with ruff run: poetry run ruff format diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index 0fdfae9..d9234b7 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -211,28 +211,36 @@ def add_motives_to_coords(coords, explanations): return {pair: explanations for pair in coords} -class Node: +class BlockerNode: """Abstract class from which derive all classes in the module""" def __init__(self, left=None, right=None): self.left = left self.right = right + self.equivalence_columns = None + self.overlap_columns = None self.overlap = None self.normalize = None self.must_not_be_different = None self.word_level = None def __and__(self, other): - return merge_blockers(self, other) + if self == other: + return self + else: + return merge_blockers(self, other) def __or__(self, other): - return OrNode(self, other) + if self == other: + return self + else: + return OrNode(self, other) def __repr__(self): return f"Node{{{self.left}, {self.right}}}" -class AndNode(Node): +class AndNode(BlockerNode): """Used to compute the intersection of the outputs of two Blockers.""" def __init__(self, left, right): @@ -258,14 +266,14 @@ def block(self, df, motives=False): if id_lists else pd.DataFrame(columns=df.columns) ) - - coords_right = self.right.block(df_shortened, motives=motives) + # Rows that are in no pairs following the first blocking step cannot be in any pair of the interection + coords_right = self.right.block(df_shortened, motives=self.right.motives) result = merge_blocks_and(coords_left, coords_right) return result -class OrNode(Node): +class OrNode(BlockerNode): """Used to compute the union of the outputs of two Blockers.""" def __init__(self, left, right): @@ -278,6 +286,7 @@ def __eq__(self, other): return self.left == other.left and self.right == other.right def block(self, df, motives=False): + # Note: for performance, it would be wise to remove rows that are already paired with all other rows, though this case should be pretty rare in real situations coords_left = self.left.block(df, motives=motives) coords_right = self.right.block(df, motives=motives) @@ -286,7 +295,7 @@ def block(self, df, motives=False): return result -class AttributeEquivalenceBlocker(Node): # Leaf +class AttributeEquivalenceBlocker(BlockerNode): # Leaf """To regroup rows based on equality across columns.""" def __init__( @@ -391,7 +400,7 @@ def block(self, data, motives=False): return set(coords) # set is unnnecessary -class OverlapBlocker(Node): # Leaf +class OverlapBlocker(BlockerNode): # Leaf """To regroup rows based on overlap of one or more columns.""" def __init__( @@ -482,7 +491,7 @@ def block(self, data, motives=False): return set(coords) -class MixedBlocker(Node): # Leaf; For ANDs and RAM +class MixedBlocker(BlockerNode): # Leaf; For ANDs and RAM """Represent the intersection of an AttributeEquivalenceBlocker and an OverlapBlocker. Designed for performance and RAM efficiency. """ From 187716836a6f48c6815025fef891edfc4e2117ab Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Fri, 30 Jan 2026 14:41:41 +0100 Subject: [PATCH 3/7] refactor: move helpers to utils; docs: add docstrings and typehints to utils --- src/ms_blocking/ms_blocking.py | 372 +++++++++++++++------------------ src/ms_blocking/utils.py | 148 ++++++++++++- 2 files changed, 304 insertions(+), 216 deletions(-) diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index d9234b7..a7903c0 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -1,214 +1,7 @@ -import random -from itertools import combinations -from collections import Counter - from ms_blocking.utils import * # noqa: F403 -def merge_blockers(left, right): - """ - Convert two blockers into a single one for performance purposes - """ - - if ( - type(left) is AttributeEquivalenceBlocker - and type(right) is AttributeEquivalenceBlocker - and left.normalize == right.normalize - and left.must_not_be_different == right.must_not_be_different - ): - return AttributeEquivalenceBlocker( - blocking_columns=left.blocking_columns + right.blocking_columns, - normalize_strings=left.normalize, - must_not_be_different=left.must_not_be_different, - ) - - elif ( - type(left) is OverlapBlocker - and type(right) is OverlapBlocker - and left.normalize == right.normalize - and left.overlap == right.overlap - and left.word_level == right.word_level - ): - return OverlapBlocker( - blocking_columns=left.blocking_columns + right.blocking_columns, - normalize_strings=left.normalize, - overlap=left.overlap, - word_level=left.word_level, - ) - - elif ( - type(left) is AttributeEquivalenceBlocker - and type(right) is OverlapBlocker - and left.normalize == right.normalize - ): - return MixedBlocker( - equivalence_columns=left.blocking_columns, - overlap_columns=right.blocking_columns, - normalize_strings=left.normalize, - overlap=right.overlap, - word_level=right.word_level, - ) - - elif ( - type(left) is OverlapBlocker - and type(right) is AttributeEquivalenceBlocker - and left.normalize == right.normalize - ): - return MixedBlocker( - equivalence_columns=right.blocking_columns, - overlap_columns=left.blocking_columns, - normalize_strings=left.normalize, - overlap=left.overlap, - word_level=left.word_level, - ) - - elif ( - type(left) is MixedBlocker - and type(right) is MixedBlocker - and left.normalize == right.normalize - and left.overlap == right.overlap - and left.word_level == right.word_level - ): - return MixedBlocker( - equivalence_columns=left.equivalence_columns + right.equivalence_columns, - overlap_columns=left.overlap_columns + right.overlap_columns, - must_not_be_different=list( - set(left.must_not_be_different + right.must_not_be_different) - ), - normalize_strings=left.normalize, - overlap=left.overlap, - word_level=left.word_level, - ) - - elif ( - type(left) is MixedBlocker - and type(right) is AttributeEquivalenceBlocker - and left.normalize == right.normalize - ): - return MixedBlocker( - equivalence_columns=left.equivalence_columns + right.blocking_columns, - overlap_columns=left.overlap_columns, - must_not_be_different=list( - set(left.must_not_be_different + right.must_not_be_different) - ), - normalize_strings=left.normalize, - overlap=left.overlap, - word_level=left.word_level, - ) - - elif ( - type(left) is AttributeEquivalenceBlocker - and type(right) is MixedBlocker - and left.normalize == right.normalize - ): - return MixedBlocker( - equivalence_columns=left.blocking_columns + right.equivalence_columns, - overlap_columns=right.overlap_columns, - must_not_be_different=list( - set(left.must_not_be_different + right.must_not_be_different) - ), - normalize_strings=left.normalize, - overlap=right.overlap, - word_level=right.word_level, - ) - - elif ( - type(left) is MixedBlocker - and type(right) is OverlapBlocker - and left.normalize == right.normalize - and left.overlap == right.overlap - and left.word_level == right.word_level - ): - return MixedBlocker( - equivalence_columns=left.equivalence_columns, - overlap_columns=left.overlap_columns + right.blocking_columns, - must_not_be_different=left.must_not_be_different, - normalize_strings=left.normalize, - overlap=left.overlap, - word_level=left.word_level, - ) - - elif ( - type(left) is OverlapBlocker - and type(right) is MixedBlocker - and left.normalize == right.normalize - and left.overlap == right.overlap - and left.word_level == right.word_level - ): - return MixedBlocker( - equivalence_columns=right.equivalence_columns, - overlap_columns=left.blocking_columns + right.overlap_columns, - must_not_be_different=right.must_not_be_different, - normalize_strings=left.normalize, - overlap=left.overlap, - word_level=left.word_level, - ) - else: - return AndNode(left, right) - - -def must_not_be_different_apply( - temp_data, blocking_columns, must_not_be_different_columns -): - """Re-block DataFrame on a second column, where we require non-difference rather than equality""" - - temp_data["block_id"] = temp_data.groupby(blocking_columns).ngroup() - temp_data = temp_data[temp_data["block_id"].duplicated(keep=False)] - - reconstructed_data = pd.DataFrame(columns=temp_data.columns) - for block in temp_data["block_id"].unique(): - # noinspection PyArgumentList - current_block = ( - temp_data[temp_data["block_id"] == block] - .sort_values(must_not_be_different_columns) - .copy() - ) - if ( - len(current_block[current_block[must_not_be_different_columns].notnull()]) - == 0 - ): # All nulls - random_string = "".join( - random.choices("ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", k=10) - ) # As long as the string is not already in the column... - # There must be a better way to do it... - current_block[must_not_be_different_columns] = ( - current_block[must_not_be_different_columns] - .astype(str) - .fillna(random_string) - ) - else: - current_block[must_not_be_different_columns] = ( - current_block[must_not_be_different_columns].astype(str).ffill() - ) - if len(reconstructed_data) == 0: - reconstructed_data = current_block - else: - reconstructed_data = pd.concat([reconstructed_data, current_block]) - return reconstructed_data - - -def block_overlap(groups, overlap): - coords = { - frozenset(pair) for group_list in groups for pair in combinations(group_list, 2) - } - - if overlap > 1: - coords = [ # In this specific case, we want to keep duplicates to track the number of occurences of a pair - frozenset(pair) - for group_list in groups - for pair in combinations(group_list, 2) - ] - # Filter pairs that fulfill the minimum overlap condition - occurences_dict = Counter(coords) - coords = { - p for p in occurences_dict if occurences_dict[p] >= overlap - } # The collection of pairs that fulfill the overlap condition - - return coords - - -def add_motives_to_coords(coords, explanations): - return {pair: explanations for pair in coords} +# TODO: "block_id" class BlockerNode: @@ -217,6 +10,7 @@ class BlockerNode: def __init__(self, left=None, right=None): self.left = left self.right = right + self.blocking_columns = None self.equivalence_columns = None self.overlap_columns = None self.overlap = None @@ -267,7 +61,7 @@ def block(self, df, motives=False): else pd.DataFrame(columns=df.columns) ) # Rows that are in no pairs following the first blocking step cannot be in any pair of the interection - coords_right = self.right.block(df_shortened, motives=self.right.motives) + coords_right = self.right.block(df_shortened, motives=motives) result = merge_blocks_and(coords_left, coords_right) return result @@ -382,6 +176,7 @@ def block(self, data, motives=False): return set() # Use the DataFrame index for grouping and forming pairs + # Using frozenset since they are ahshable and thus can be used as dictionary keys groups = temp_data.groupby( self.blocking_columns + self.must_not_be_different ).apply(lambda x: frozenset(x.index), include_groups=False) @@ -475,6 +270,7 @@ def block(self, data, motives=False): return set() # Use the DataFrame index for grouping and forming pairs + # Using frozenset since they are ahshable and thus can be used as dictionary keys groups = temp_data.groupby(self.blocking_columns).apply( lambda x: frozenset(x.index), include_groups=False ) @@ -613,6 +409,7 @@ def block(self, data, motives=False): must_not_be_different_columns=self.must_not_be_different, ) + # Using frozenset since they are ahshable and thus can be used as dictionary keys groups_equivalence = temp_data.groupby(self.equivalence_columns).apply( lambda x: frozenset(x.index), include_groups=False ) @@ -642,4 +439,161 @@ def block(self, data, motives=False): return set(coords) +def merge_blockers( + left: BlockerNode, right: BlockerNode +) -> AttributeEquivalenceBlocker | OverlapBlocker | MixedBlocker | AndNode: + """Convert two blockers into a single one for performance purposes + + This function outputs a new blocker that combines the functionalities of the two input blockers, to prevent redundant operations. + + Parameters + ---------- + left : BlockerNode + Blocker that represents the first condition + + right : BlockerNode + Blocker that represents the second condition + + Returns + ------- + AttributeEquivalenceBlocker|OverlapBlocker|MixedBlocker|AndNode + Blocker that represents both conditions + """ + if ( + type(left) is AttributeEquivalenceBlocker + and type(right) is AttributeEquivalenceBlocker + and left.normalize == right.normalize + and left.must_not_be_different == right.must_not_be_different + ): + return AttributeEquivalenceBlocker( + blocking_columns=left.blocking_columns + right.blocking_columns, + normalize_strings=left.normalize, + must_not_be_different=left.must_not_be_different, + ) + + elif ( + type(left) is OverlapBlocker + and type(right) is OverlapBlocker + and left.normalize == right.normalize + and left.overlap == right.overlap + and left.word_level == right.word_level + ): + return OverlapBlocker( + blocking_columns=left.blocking_columns + right.blocking_columns, + normalize_strings=left.normalize, + overlap=left.overlap, + word_level=left.word_level, + ) + + elif ( + type(left) is AttributeEquivalenceBlocker + and type(right) is OverlapBlocker + and left.normalize == right.normalize + ): + return MixedBlocker( + equivalence_columns=left.blocking_columns, + overlap_columns=right.blocking_columns, + normalize_strings=left.normalize, + overlap=right.overlap, + word_level=right.word_level, + ) + + elif ( + type(left) is OverlapBlocker + and type(right) is AttributeEquivalenceBlocker + and left.normalize == right.normalize + ): + return MixedBlocker( + equivalence_columns=right.blocking_columns, + overlap_columns=left.blocking_columns, + normalize_strings=left.normalize, + overlap=left.overlap, + word_level=left.word_level, + ) + + elif ( + type(left) is MixedBlocker + and type(right) is MixedBlocker + and left.normalize == right.normalize + and left.overlap == right.overlap + and left.word_level == right.word_level + ): + return MixedBlocker( + equivalence_columns=left.equivalence_columns + right.equivalence_columns, + overlap_columns=left.overlap_columns + right.overlap_columns, + must_not_be_different=list( + set(left.must_not_be_different + right.must_not_be_different) + ), + normalize_strings=left.normalize, + overlap=left.overlap, + word_level=left.word_level, + ) + + elif ( + type(left) is MixedBlocker + and type(right) is AttributeEquivalenceBlocker + and left.normalize == right.normalize + ): + return MixedBlocker( + equivalence_columns=left.equivalence_columns + right.blocking_columns, + overlap_columns=left.overlap_columns, + must_not_be_different=list( + set(left.must_not_be_different + right.must_not_be_different) + ), + normalize_strings=left.normalize, + overlap=left.overlap, + word_level=left.word_level, + ) + + elif ( + type(left) is AttributeEquivalenceBlocker + and type(right) is MixedBlocker + and left.normalize == right.normalize + ): + return MixedBlocker( + equivalence_columns=left.blocking_columns + right.equivalence_columns, + overlap_columns=right.overlap_columns, + must_not_be_different=list( + set(left.must_not_be_different + right.must_not_be_different) + ), + normalize_strings=left.normalize, + overlap=right.overlap, + word_level=right.word_level, + ) + + elif ( + type(left) is MixedBlocker + and type(right) is OverlapBlocker + and left.normalize == right.normalize + and left.overlap == right.overlap + and left.word_level == right.word_level + ): + return MixedBlocker( + equivalence_columns=left.equivalence_columns, + overlap_columns=left.overlap_columns + right.blocking_columns, + must_not_be_different=left.must_not_be_different, + normalize_strings=left.normalize, + overlap=left.overlap, + word_level=left.word_level, + ) + + elif ( + type(left) is OverlapBlocker + and type(right) is MixedBlocker + and left.normalize == right.normalize + and left.overlap == right.overlap + and left.word_level == right.word_level + ): + return MixedBlocker( + equivalence_columns=right.equivalence_columns, + overlap_columns=left.blocking_columns + right.overlap_columns, + must_not_be_different=right.must_not_be_different, + normalize_strings=left.normalize, + overlap=left.overlap, + word_level=left.word_level, + ) + else: + return AndNode(left, right) + + # /!\ TODO: make class for motives (+ pair, motive dict)? diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py index c32ebc1..596cee1 100644 --- a/src/ms_blocking/utils.py +++ b/src/ms_blocking/utils.py @@ -5,7 +5,10 @@ from scipy.sparse.csgraph import connected_components import pandas as pd import networkx as nx +import random +from collections import Counter +from itertools import combinations from typing import List, Set, Iterable, Dict, Collection, Any Columns = List[str] @@ -21,14 +24,14 @@ def remove_rows_if_value_appears_only_once( data: pd.DataFrame, cols: Columns ) -> pd.DataFrame: - """Drops rows of a Pandas DataFrame where a certain column's values appears only once. + """Drop rows of a Pandas DataFrame where a certain column's values appears only once. Ensures all elements of provided columns appear at least twice in their column Parameters ---------- data : DataFrame - The DataFrame to preprocess + DataFrame to preprocess cols : List[str] List of columns where rows that contain non-duplicated elements shall be discarded @@ -131,7 +134,7 @@ def normalize_function(string: Any) -> Any: Parameters ---------- string : Any - The text to preprocess + Text to preprocess Returns ------- @@ -160,7 +163,7 @@ def normalize(text: Any) -> Any: Parameters ---------- text : Any - The text(s) to preprocess + Text(s) to preprocess Returns ------- @@ -191,7 +194,7 @@ def flatten(list_of_iterables_: Collection[Iterable]) -> List[Any] | None: Parameters ---------- list_of_iterables_ : Collection[Iterable] - The list to flatten + List to flatten Returns ------- @@ -502,7 +505,7 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: Parameters ---------- s : str - The stringified representation of a list e.g. "['string 1', 'string 2', ...]" + Stringified representation of a list e.g. "['string 1', 'string 2', ...]" word_level : bool Whether to return a list of all words within s instead of a list of each comma-separated element @@ -556,7 +559,7 @@ def scoring(data: pd.DataFrame, motives_column: str = "motive") -> pd.Series: A DataFrame with motives motives_column : str - The name of the column containing the motives + Name of the column containing the motives Returns ------- @@ -574,3 +577,134 @@ def scoring(data: pd.DataFrame, motives_column: str = "motive") -> pd.Series: scores = data[motives_column].apply(len) return scores + + +def must_not_be_different_apply( # WIP + temp_data: pd.DataFrame, + blocking_columns: List[str], + must_not_be_different_columns: List[str], +): + """Re-block DataFrame on a second column, where we require non-difference rather than equality + + Parameters + ---------- + temp_data : DataFrame + Partially blocked DataFrame + + blocking_columns : List[str] + Columns where we check for equality + + must_not_be_different_columns : List[str] + Columns where we only check for non-difference + + Returns + ------- + pd.DataFrame + A column of scores + """ + temp_data["block_id"] = temp_data.groupby(blocking_columns).ngroup() + temp_data = temp_data[temp_data["block_id"].duplicated(keep=False)] + + reconstructed_data = pd.DataFrame(columns=temp_data.columns) + for block in temp_data["block_id"].unique(): + # noinspection PyArgumentList + current_block = ( + temp_data[temp_data["block_id"] == block] + .sort_values(must_not_be_different_columns) + .copy() + ) + if ( + len(current_block[current_block[must_not_be_different_columns].notnull()]) + == 0 + ): # All nulls + random_string = "".join( + random.choices("ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", k=10) + ) # As long as the string is not already in the column... + # There must be a better way to do it... + current_block[must_not_be_different_columns] = ( + current_block[must_not_be_different_columns] + .astype(str) + .fillna(random_string) + ) + else: + current_block[must_not_be_different_columns] = ( + current_block[must_not_be_different_columns].astype(str).ffill() + ) + if len(reconstructed_data) == 0: + reconstructed_data = current_block + else: + reconstructed_data = pd.concat([reconstructed_data, current_block]) + return reconstructed_data + + +def block_overlap(groups: Iterable, overlap: int = 1) -> Coords: + """Block a DataFrame based on overlap accross columns + + Parameters + ---------- + groups : Iterable + Output of a groupby + + overlap : int + Minimum passing overlap + + Returns + ------- + Coords + Pairs obtained by blocking + """ + coords = { + frozenset(pair) for group_list in groups for pair in combinations(group_list, 2) + } + + if overlap > 1: + coords = [ # In this specific case, we want to keep duplicates to track the number of occurences of each pair + frozenset(pair) + for group_list in groups + for pair in combinations(group_list, 2) + ] + # Filter pairs that fulfill the minimum overlap condition + occurences_dict = Counter(coords) + coords = { + p for p in occurences_dict if occurences_dict[p] >= overlap + } # The collection of pairs that fulfill the overlap condition + + return coords + + +def add_motives_to_coords(coords: Coords, explanations: Set[str]) -> CoordsMotives: + """Block a DataFrame based on overlap accross columns + + Parameters + ---------- + coords : Coords + Coords obtained by blocking + + explanations : Set[str] + Set of explanations + + Returns + ------- + CoordsMotives + Pairs obtained by blocking + + Examples + -------- + >>> add_motives_to_coords({ + frozenset({1, 4}), + frozenset({8, 11}), + frozenset({2, 5}), + frozenset({10, 13}), + frozenset({3, 8}), + frozenset({3, 11}), + }, {"Same 'City'"}') + { + frozenset({1, 4}): {"Same 'City'"}, + frozenset({8, 11}): {"Same 'City'"}, + frozenset({2, 5}): {"Same 'City'"}, + frozenset({10, 13}): {"Same 'City'"}, + frozenset({3, 8}): {"Same 'City'"}, + frozenset({3, 11}): {"Same 'City'"}, + } + """ + return {pair: explanations for pair in coords} From 043ac2deb0841844d7ef2966727d00e0d44b3d85 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Fri, 30 Jan 2026 15:30:54 +0100 Subject: [PATCH 4/7] refactor: add underscore in front of new column names; perf: do not add temp column to df in must_not_be_different_apply --- src/ms_blocking/utils.py | 80 ++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py index 596cee1..837645f 100644 --- a/src/ms_blocking/utils.py +++ b/src/ms_blocking/utils.py @@ -341,17 +341,6 @@ def add_blocks_to_dataset( id_l rank_l id_r rank_r block 0 0 first 2 first 0 """ - if output_columns is None: - output_columns = data.columns - data = data[output_columns].copy() - - if "motive" in data.columns: - print("Renaming 'motive' column to 'motive_old'") - data = data.rename(columns={"motive": "motive_old"}) - - if "block" in data.columns: - print("Renaming 'block' column to 'block_old'") - data = data.rename(columns={"block": "block_old"}) if show_as_pairs and keep_ungrouped_rows: raise ValueError("Cannot both return pairs and keep ungrouped rows") @@ -364,6 +353,19 @@ def add_blocks_to_dataset( if not data.index.is_unique: raise ValueError("DataFrame index must be unique to be used as an identifier.") + if "_motive" in data.columns: + if motives: + raise ValueError( + "Please rename existing '_motive' column OR do not pass 'motives=True'" + ) + + if "_block" in data.columns: + raise ValueError("Please rename existing '_block' column") + + if output_columns is None: + output_columns = data.columns + data = data[output_columns].copy() + if len(coords) == 0 and not keep_ungrouped_rows: # Empty graph if show_as_pairs: columns = [col + "_l" for col in data.columns] + [ @@ -414,16 +416,16 @@ def add_blocks_to_dataset( output_data = pd.concat([output_data, current_row]) # Assign blocks to rows based on their original index - output_data["block"] = output_data.index.map(matcher) + output_data["_block"] = output_data.index.map(matcher) if not merge_blocks: - output_data = output_data.explode("block") + output_data = output_data.explode("_block") if keep_ungrouped_rows: - output_data["block"] = output_data["block"].fillna(-1) + output_data["_block"] = output_data["_block"].fillna(-1) matcher_ungrouped_rows = {} block_temp = [] i = 0 # Track # of blocks processed - for b in output_data["block"]: + for b in output_data["_block"]: if b == -1: block_temp.append(i) i += 1 @@ -433,19 +435,19 @@ def add_blocks_to_dataset( i += 1 else: block_temp.append(matcher_ungrouped_rows[b]) - output_data["block"] = block_temp + output_data["_block"] = block_temp else: if not show_as_pairs: output_data = output_data[ - output_data["block"].duplicated(keep=False) - & output_data["block"].notna() + output_data["_block"].duplicated(keep=False) + & output_data["_block"].notna() ] - output_data.loc[:, ["block"]] = start_from_zero(output_data["block"]) + output_data.loc[:, ["_block"]] = start_from_zero(output_data["_block"]) if sort: # Sort by block, then by original index - sort_cols = ["block"] + sort_cols = ["_block"] if output_data.index.name: output_data = output_data.sort_values( sort_cols + [output_data.index.name] @@ -459,7 +461,7 @@ def add_blocks_to_dataset( output_data = output_data.set_index(output_data.columns[0]) if motives: - output_data["motive"] = "" + output_data["_motive"] = "" id_list = flatten(coords.keys()) motive_matcher = { row_id: frozenset( @@ -470,13 +472,14 @@ def add_blocks_to_dataset( ) for row_id in id_list } - output_data["motive"] = output_data.index.map(motive_matcher) + output_data["_motive"] = output_data.index.map(motive_matcher) - if "block" not in output_data.columns: # Empty coords - output_data["block"] = -1 + if "_block" not in output_data.columns: # Empty coords + output_data["_block"] = -1 output_data = output_data.reset_index(drop=True) - output_data["block"] = output_data["block"].astype(int) + output_data["_block"] = output_data["_block"].astype(int) + return output_data @@ -513,7 +516,7 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: Returns ------- List[str] - A python list based on s + s turned into a List Examples -------- @@ -550,13 +553,13 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: return [s for s in cleaned_items if len(s) > 0] -def scoring(data: pd.DataFrame, motives_column: str = "motive") -> pd.Series: +def scoring(data: pd.DataFrame, motives_column: str = "_motive") -> pd.Series: """Add a score to a blocked DataFrame based on the number of motives Parameters ---------- data : DataFrame - A DataFrame with motives + DataFrame with motives motives_column : str Name of the column containing the motives @@ -569,7 +572,12 @@ def scoring(data: pd.DataFrame, motives_column: str = "motive") -> pd.Series: # Check that we do have motives if motives_column not in data.columns: - raise ValueError(f'Specified motives column "{motives_column}" does not exist') + if motives_column == "_motive": + raise ValueError("No motives in DataFrame") + else: + raise ValueError( + f'Specified motives column "{motives_column}" does not exist' + ) if "score" in data.columns: print("Renaming 'score' column to 'score_old'") @@ -599,17 +607,18 @@ def must_not_be_different_apply( # WIP Returns ------- - pd.DataFrame - A column of scores + DataFrame + Column of scores """ - temp_data["block_id"] = temp_data.groupby(blocking_columns).ngroup() - temp_data = temp_data[temp_data["block_id"].duplicated(keep=False)] + + series_block_id = temp_data.groupby(blocking_columns).ngroup() + temp_data = temp_data[series_block_id.duplicated(keep=False)] reconstructed_data = pd.DataFrame(columns=temp_data.columns) - for block in temp_data["block_id"].unique(): + for block in series_block_id.unique(): # noinspection PyArgumentList current_block = ( - temp_data[temp_data["block_id"] == block] + temp_data[series_block_id == block] .sort_values(must_not_be_different_columns) .copy() ) @@ -634,6 +643,7 @@ def must_not_be_different_apply( # WIP reconstructed_data = current_block else: reconstructed_data = pd.concat([reconstructed_data, current_block]) + return reconstructed_data From c6f6fd6dff653dda93607ba59b4941b40e770905 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Fri, 30 Jan 2026 15:31:23 +0100 Subject: [PATCH 5/7] refactor: rename block and motive to new names --- tests/test_ms_blocking.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_ms_blocking.py b/tests/test_ms_blocking.py index cbf8284..d3f9ab2 100644 --- a/tests/test_ms_blocking.py +++ b/tests/test_ms_blocking.py @@ -105,7 +105,7 @@ def attribute_city_show_as_pairs_true_id(): @pytest.fixture def attribute_city_show_as_pairs_true_columns(): - return ["id_l", "Name_l", "id_r", "Name_r", "block"] + return ["id_l", "Name_l", "id_r", "Name_r", "_block"] @pytest.fixture @@ -183,7 +183,7 @@ def test_merge_blocks(overlap_websites_merge_blocks): websites_blocker = msb.OverlapBlocker(["websites"]) links = websites_blocker.block(get_users()) actual = msb.add_blocks_to_dataset(get_users(), links, merge_blocks=False)[ - "block" + "_block" ].to_list() assert actual == expected, ( "Blocking on websites should return [0, 0, 0, 1, 1, 2, 2, 2]" @@ -240,7 +240,7 @@ def test_sort_false(attribute_city_sort_false_blocks): city_blocker = msb.AttributeEquivalenceBlocker(["City"]) links = city_blocker.block(get_users()) actual = msb.add_blocks_to_dataset(get_users(), links, sort=False)[ - "block" + "_block" ].to_list() assert actual == expected, ( "Blocking on websites and adding blocks with sort=False should return [0, 1, 2, 0, 1, 2, 3, 2, 3]" @@ -253,7 +253,7 @@ def test_keep_ungrouped_rows_false(attribute_city_keep_ungrouped_rows_false): city_blocker = msb.AttributeEquivalenceBlocker(["City"]) links = city_blocker.block(get_users()) actual = msb.add_blocks_to_dataset(get_users(), links, keep_ungrouped_rows=True)[ - "block" + "_block" ].to_list() assert actual == expected, ( "Blocking on Name with normalize_strings=False should return [0, 1, 1, 2, 2, 3, 3, 3, 4, 5, 6, 7, 7, 8]" @@ -274,7 +274,7 @@ def test_motives_when_adding_to_dataframe(attribute_city_motives_true_add): city_blocker = msb.AttributeEquivalenceBlocker(["City"]) links = city_blocker.block(get_users(), motives=True) actual = msb.add_blocks_to_dataset(get_users(), links, motives=True)[ - "motive" + "_motive" ].to_list() assert actual == expected @@ -337,7 +337,7 @@ def test_pipelining_motives(city_age_websites_pipelining_motives): links = final_blocker.block(get_users(), motives=True) actual = msb.add_blocks_to_dataset( get_users(), links, show_as_pairs=True, motives=True, merge_blocks=False - )["motive"].to_list() + )["_motive"].to_list() assert actual == expected @@ -508,7 +508,7 @@ def test_no_links_m(): def test_no_links_add_blocks_to_dataframe(): """Test that add_blocks_to_dataframe gracefully outputs an empty DataFrame when no pairs were found""" - expected = pd.DataFrame(columns=["id", "Name", "City", "Age", "websites", "block"]) + expected = pd.DataFrame(columns=["id", "Name", "City", "Age", "websites", "_block"]) expected_show_as_pairs = pd.DataFrame( columns=[ "id_l", @@ -521,11 +521,11 @@ def test_no_links_add_blocks_to_dataframe(): "City_r", "Age_r", "websites_r", - "block", + "_block", ] ) expected_motives = pd.DataFrame( - columns=["id", "Name", "City", "Age", "websites", "motive", "block"] + columns=["id", "Name", "City", "Age", "websites", "_motive", "_block"] ) id_blocker = msb.AttributeEquivalenceBlocker(["id"]) links = id_blocker.block(get_users()) From 3793a13af7d0ddb606d2c34e00ee288f31fad234 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Fri, 30 Jan 2026 15:39:57 +0100 Subject: [PATCH 6/7] docs: run notebook --- docs/example.ipynb | 656 ++++++++++++++++++++++----------------------- 1 file changed, 328 insertions(+), 328 deletions(-) diff --git a/docs/example.ipynb b/docs/example.ipynb index 32bc8c4..6b82165 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -32,15 +32,15 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:26.824780400Z", - "start_time": "2026-01-29T15:07:26.781971700Z" + "end_time": "2026-01-30T14:21:14.010997600Z", + "start_time": "2026-01-30T14:21:13.420790Z" } }, "source": [ "import ms_blocking.ms_blocking as msb" ], "outputs": [], - "execution_count": 137 + "execution_count": 1 }, { "cell_type": "markdown", @@ -60,8 +60,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:27.000725500Z", - "start_time": "2026-01-29T15:07:26.849860500Z" + "end_time": "2026-01-30T14:21:14.049404600Z", + "start_time": "2026-01-30T14:21:14.010997600Z" } }, "source": [ @@ -250,12 +250,12 @@ "" ] }, - "execution_count": 138, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 138 + "execution_count": 2 }, { "cell_type": "markdown", @@ -282,15 +282,15 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:27.363604100Z", - "start_time": "2026-01-29T15:07:27.107402500Z" + "end_time": "2026-01-30T14:21:14.190107400Z", + "start_time": "2026-01-30T14:21:14.089762400Z" } }, "source": [ "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])" ], "outputs": [], - "execution_count": 139 + "execution_count": 3 }, { "cell_type": "markdown", @@ -310,8 +310,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:27.745120Z", - "start_time": "2026-01-29T15:07:27.573071300Z" + "end_time": "2026-01-30T14:21:14.309413300Z", + "start_time": "2026-01-30T14:21:14.278545600Z" } }, "source": [ @@ -326,7 +326,7 @@ ] } ], - "execution_count": 140 + "execution_count": 4 }, { "cell_type": "markdown", @@ -339,8 +339,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:27.933575500Z", - "start_time": "2026-01-29T15:07:27.819208200Z" + "end_time": "2026-01-30T14:21:14.378808Z", + "start_time": "2026-01-30T14:21:14.349508200Z" } }, "source": [ @@ -358,19 +358,19 @@ " frozenset({10, 13})}" ] }, - "execution_count": 141, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 141 + "execution_count": 5 }, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:28.123040400Z", - "start_time": "2026-01-29T15:07:27.999988Z" + "end_time": "2026-01-30T14:21:14.558644200Z", + "start_time": "2026-01-30T14:21:14.459573100Z" } }, "source": [ @@ -396,7 +396,7 @@ } } ], - "execution_count": 142 + "execution_count": 6 }, { "cell_type": "markdown", @@ -409,8 +409,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:28.188617600Z", - "start_time": "2026-01-29T15:07:28.160831900Z" + "end_time": "2026-01-30T14:21:14.635514Z", + "start_time": "2026-01-30T14:21:14.598913Z" } }, "source": [ @@ -431,16 +431,16 @@ "7 10 Caroline Dufour Lens 45 \n", "8 13 Benoît Benoît Lens 15 \n", "\n", - " websites block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['somewebsite.com/users/rpz59'] 1 \n", - "3 [] 1 \n", - "4 ['roubaixlove.fr'] 2 \n", - "5 [] 2 \n", - "6 [] 2 \n", - "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 \n", - "8 ['lensfans.fr'] 3 " + " websites _block \n", + "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", + "1 ['jacquesdupond.fr'] 0 \n", + "2 ['somewebsite.com/users/rpz59'] 1 \n", + "3 [] 1 \n", + "4 ['roubaixlove.fr'] 2 \n", + "5 [] 2 \n", + "6 [] 2 \n", + "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 \n", + "8 ['lensfans.fr'] 3 " ], "text/html": [ "
\n", @@ -466,7 +466,7 @@ " City\n", " Age\n", " websites\n", - " block\n", + " _block\n", " \n", " \n", " \n", @@ -556,12 +556,12 @@ "
" ] }, - "execution_count": 143, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 143 + "execution_count": 7 }, { "cell_type": "markdown", @@ -574,8 +574,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:28.391859600Z", - "start_time": "2026-01-29T15:07:28.233676700Z" + "end_time": "2026-01-30T14:21:14.829719100Z", + "start_time": "2026-01-30T14:21:14.676157200Z" } }, "source": [ @@ -590,12 +590,12 @@ "array([-1, 0, 1, 2, 0, 1, -1, -1, 2, -1, 3, 2, -1, 3])" ] }, - "execution_count": 144, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 144 + "execution_count": 8 }, { "cell_type": "markdown", @@ -622,8 +622,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:28.636881700Z", - "start_time": "2026-01-29T15:07:28.555420800Z" + "end_time": "2026-01-30T14:21:15.027923700Z", + "start_time": "2026-01-30T14:21:14.926401Z" } }, "source": [ @@ -649,12 +649,12 @@ "3 10 Caroline Dufour Lens 45 \n", "4 13 Benoît Benoît Lens 15 \n", "\n", - " websites block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['lensfans.fr', 'pythonensamusant.fr'] 0 \n", - "3 ['pythonensamusant.fr', 'lensfans.fr'] 0 \n", - "4 ['lensfans.fr'] 0 " + " websites _block \n", + "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", + "1 ['jacquesdupond.fr'] 0 \n", + "2 ['lensfans.fr', 'pythonensamusant.fr'] 0 \n", + "3 ['pythonensamusant.fr', 'lensfans.fr'] 0 \n", + "4 ['lensfans.fr'] 0 " ], "text/html": [ "
\n", @@ -680,7 +680,7 @@ " City\n", " Age\n", " websites\n", - " block\n", + " _block\n", " \n", " \n", " \n", @@ -734,12 +734,12 @@ "
" ] }, - "execution_count": 145, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 145 + "execution_count": 9 }, { "cell_type": "markdown", @@ -759,8 +759,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:28.897958600Z", - "start_time": "2026-01-29T15:07:28.814714900Z" + "end_time": "2026-01-30T14:21:15.403596500Z", + "start_time": "2026-01-30T14:21:15.279120300Z" } }, "source": [ @@ -783,7 +783,7 @@ } } ], - "execution_count": 146 + "execution_count": 10 }, { "cell_type": "markdown", @@ -796,8 +796,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:29.062518Z", - "start_time": "2026-01-29T15:07:29.011197700Z" + "end_time": "2026-01-30T14:21:15.686136800Z", + "start_time": "2026-01-30T14:21:15.608444400Z" } }, "source": [ @@ -817,15 +817,15 @@ "6 10 Caroline Dufour Lens 45 \n", "7 13 Benoît Benoît Lens 15 \n", "\n", - " websites block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['somewebsite.com/users/jacquesdupond', 'jacqu... 1 \n", - "3 ['lensfans.fr', 'pythonensamusant.fr'] 1 \n", - "4 ['pythonensamusant.fr', 'lensfans.fr'] 1 \n", - "5 ['lensfans.fr', 'pythonensamusant.fr'] 2 \n", - "6 ['pythonensamusant.fr', 'lensfans.fr'] 2 \n", - "7 ['lensfans.fr'] 2 " + " websites _block \n", + "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", + "1 ['jacquesdupond.fr'] 0 \n", + "2 ['somewebsite.com/users/jacquesdupond', 'jacqu... 1 \n", + "3 ['lensfans.fr', 'pythonensamusant.fr'] 1 \n", + "4 ['pythonensamusant.fr', 'lensfans.fr'] 1 \n", + "5 ['lensfans.fr', 'pythonensamusant.fr'] 2 \n", + "6 ['pythonensamusant.fr', 'lensfans.fr'] 2 \n", + "7 ['lensfans.fr'] 2 " ], "text/html": [ "
\n", @@ -851,7 +851,7 @@ " City\n", " Age\n", " websites\n", - " block\n", + " _block\n", " \n", " \n", " \n", @@ -932,12 +932,12 @@ "
" ] }, - "execution_count": 147, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 147 + "execution_count": 11 }, { "cell_type": "markdown", @@ -971,8 +971,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:29.409293600Z", - "start_time": "2026-01-29T15:07:29.374108Z" + "end_time": "2026-01-30T14:21:15.998425200Z", + "start_time": "2026-01-30T14:21:15.931370100Z" } }, "source": [ @@ -995,9 +995,9 @@ "0 6 Jean-Michel Python Douai 49 ['lensfans.fr', 'pythonensamusant.fr'] \n", "1 10 Caroline Dufour Lens 45 ['pythonensamusant.fr', 'lensfans.fr'] \n", "\n", - " block \n", - "0 0 \n", - "1 0 " + " _block \n", + "0 0 \n", + "1 0 " ], "text/html": [ "
\n", @@ -1023,7 +1023,7 @@ " City\n", " Age\n", " websites\n", - " block\n", + " _block\n", " \n", " \n", " \n", @@ -1050,12 +1050,12 @@ "
" ] }, - "execution_count": 148, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 148 + "execution_count": 12 }, { "cell_type": "markdown", @@ -1075,8 +1075,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:29.806693100Z", - "start_time": "2026-01-29T15:07:29.577252700Z" + "end_time": "2026-01-30T14:21:16.305679100Z", + "start_time": "2026-01-30T14:21:16.212470400Z" } }, "source": [ @@ -1089,7 +1089,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['City', 'Age'], [])\n" + "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n" ] }, { @@ -1103,13 +1103,13 @@ "4 8 Sophie Delarue Roubaix 33 \n", "5 11 sophie_delarue Roubaix 33 \n", "\n", - " websites block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['somewebsite.com/users/rpz59'] 1 \n", - "3 [] 1 \n", - "4 [] 2 \n", - "5 [] 2 " + " websites _block \n", + "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", + "1 ['jacquesdupond.fr'] 0 \n", + "2 ['somewebsite.com/users/rpz59'] 1 \n", + "3 [] 1 \n", + "4 [] 2 \n", + "5 [] 2 " ], "text/html": [ "
\n", @@ -1135,7 +1135,7 @@ " City\n", " Age\n", " websites\n", - " block\n", + " _block\n", " \n", " \n", " \n", @@ -1198,12 +1198,12 @@ "
" ] }, - "execution_count": 149, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 149 + "execution_count": 13 }, { "cell_type": "markdown", @@ -1223,8 +1223,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:30.174760Z", - "start_time": "2026-01-29T15:07:30.089742500Z" + "end_time": "2026-01-30T14:21:16.678653800Z", + "start_time": "2026-01-30T14:21:16.558976200Z" } }, "source": [ @@ -1249,11 +1249,11 @@ "2 8 Sophie Delarue Roubaix 33 [] \n", "3 11 sophie_delarue Roubaix 33 [] \n", "\n", - " block \n", - "0 0 \n", - "1 0 \n", - "2 1 \n", - "3 1 " + " _block \n", + "0 0 \n", + "1 0 \n", + "2 1 \n", + "3 1 " ], "text/html": [ "
\n", @@ -1279,7 +1279,7 @@ " City\n", " Age\n", " websites\n", - " block\n", + " _block\n", " \n", " \n", " \n", @@ -1324,12 +1324,12 @@ "
" ] }, - "execution_count": 150, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 150 + "execution_count": 14 }, { "cell_type": "markdown", @@ -1342,8 +1342,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:30.712714100Z", - "start_time": "2026-01-29T15:07:30.650914300Z" + "end_time": "2026-01-30T14:21:17.354294400Z", + "start_time": "2026-01-30T14:21:17.316050200Z" } }, "source": [ @@ -1365,7 +1365,7 @@ "data": { "text/plain": [ "Empty DataFrame\n", - "Columns: [id, Name, City, Age, websites, block]\n", + "Columns: [id, Name, City, Age, websites, _block]\n", "Index: []" ], "text/html": [ @@ -1392,7 +1392,7 @@ " City\n", " Age\n", " websites\n", - " block\n", + " _block\n", " \n", " \n", " \n", @@ -1401,12 +1401,12 @@ "" ] }, - "execution_count": 151, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 151 + "execution_count": 15 }, { "cell_type": "markdown", @@ -1440,8 +1440,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:31.007955500Z", - "start_time": "2026-01-29T15:07:30.942038900Z" + "end_time": "2026-01-30T14:21:17.537043700Z", + "start_time": "2026-01-30T14:21:17.392490700Z" } }, "source": [ @@ -1458,14 +1458,14 @@ ] } ], - "execution_count": 152 + "execution_count": 16 }, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:31.269933200Z", - "start_time": "2026-01-29T15:07:31.230646800Z" + "end_time": "2026-01-30T14:21:17.655177300Z", + "start_time": "2026-01-30T14:21:17.573776300Z" } }, "source": [ @@ -1489,11 +1489,11 @@ "2 10 Caroline Dufour Lens 45 \n", "3 13 Benoît Benoît Lens 15 \n", "\n", - " websites block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['pythonensamusant.fr', 'lensfans.fr'] 1 \n", - "3 ['lensfans.fr'] 1 " + " websites _block \n", + "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", + "1 ['jacquesdupond.fr'] 0 \n", + "2 ['pythonensamusant.fr', 'lensfans.fr'] 1 \n", + "3 ['lensfans.fr'] 1 " ], "text/html": [ "
\n", @@ -1519,7 +1519,7 @@ " City\n", " Age\n", " websites\n", - " block\n", + " _block\n", " \n", " \n", " \n", @@ -1564,12 +1564,12 @@ "
" ] }, - "execution_count": 153, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 153 + "execution_count": 17 }, { "cell_type": "markdown", @@ -1589,8 +1589,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:31.613284200Z", - "start_time": "2026-01-29T15:07:31.446107500Z" + "end_time": "2026-01-30T14:21:17.910335600Z", + "start_time": "2026-01-30T14:21:17.821453400Z" } }, "source": [ @@ -1621,17 +1621,17 @@ "8 8 Sophie Delarue Roubaix 33 \n", "9 11 sophie_delarue Roubaix 33 \n", "\n", - " websites block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['lensfans.fr', 'pythonensamusant.fr'] 0 \n", - "3 ['pythonensamusant.fr', 'lensfans.fr'] 0 \n", - "4 ['lensfans.fr'] 0 \n", - "5 ['somewebsite.com/users/rpz59'] 1 \n", - "6 [] 1 \n", - "7 ['roubaixlove.fr'] 2 \n", - "8 [] 2 \n", - "9 [] 2 " + " websites _block \n", + "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", + "1 ['jacquesdupond.fr'] 0 \n", + "2 ['lensfans.fr', 'pythonensamusant.fr'] 0 \n", + "3 ['pythonensamusant.fr', 'lensfans.fr'] 0 \n", + "4 ['lensfans.fr'] 0 \n", + "5 ['somewebsite.com/users/rpz59'] 1 \n", + "6 [] 1 \n", + "7 ['roubaixlove.fr'] 2 \n", + "8 [] 2 \n", + "9 [] 2 " ], "text/html": [ "
\n", @@ -1657,7 +1657,7 @@ " City\n", " Age\n", " websites\n", - " block\n", + " _block\n", " \n", " \n", " \n", @@ -1756,12 +1756,12 @@ "
" ] }, - "execution_count": 154, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 154 + "execution_count": 18 }, { "cell_type": "markdown", @@ -1804,8 +1804,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:32.031630100Z", - "start_time": "2026-01-29T15:07:32.013496100Z" + "end_time": "2026-01-30T14:21:18.279899900Z", + "start_time": "2026-01-30T14:21:18.250988900Z" } }, "source": [ @@ -1815,7 +1815,7 @@ "websites_blocker = msb.OverlapBlocker([\"websites\"])" ], "outputs": [], - "execution_count": 155 + "execution_count": 19 }, { "cell_type": "markdown", @@ -1828,15 +1828,15 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:32.189589400Z", - "start_time": "2026-01-29T15:07:32.172200Z" + "end_time": "2026-01-30T14:21:18.481263300Z", + "start_time": "2026-01-30T14:21:18.466284300Z" } }, "source": [ "final_blocker = (city_blocker & age_blocker) | (name_blocker & websites_blocker)" ], "outputs": [], - "execution_count": 156 + "execution_count": 20 }, { "cell_type": "markdown", @@ -1849,8 +1849,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:32.274232600Z", - "start_time": "2026-01-29T15:07:32.231839Z" + "end_time": "2026-01-30T14:21:18.562779600Z", + "start_time": "2026-01-30T14:21:18.520368200Z" } }, "source": [ @@ -1862,7 +1862,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['City', 'Age'], [])\n", + "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n", "Processing MixedBlocker(['Name'], ['websites'], 1)\n" ] }, @@ -1877,13 +1877,13 @@ "4 8 Sophie Delarue Roubaix 33 \n", "5 11 sophie_delarue Roubaix 33 \n", "\n", - " websites block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['somewebsite.com/users/rpz59'] 1 \n", - "3 [] 1 \n", - "4 [] 2 \n", - "5 [] 2 " + " websites _block \n", + "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", + "1 ['jacquesdupond.fr'] 0 \n", + "2 ['somewebsite.com/users/rpz59'] 1 \n", + "3 [] 1 \n", + "4 [] 2 \n", + "5 [] 2 " ], "text/html": [ "
\n", @@ -1909,7 +1909,7 @@ " City\n", " Age\n", " websites\n", - " block\n", + " _block\n", " \n", " \n", " \n", @@ -1972,12 +1972,12 @@ "
" ] }, - "execution_count": 157, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 157 + "execution_count": 21 }, { "cell_type": "markdown", @@ -1990,8 +1990,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:32.514343300Z", - "start_time": "2026-01-29T15:07:32.366139Z" + "end_time": "2026-01-30T14:21:18.843568700Z", + "start_time": "2026-01-30T14:21:18.686911500Z" } }, "source": [ @@ -2007,7 +2007,7 @@ ] } ], - "execution_count": 158 + "execution_count": 22 }, { "cell_type": "markdown", @@ -2034,8 +2034,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:32.653038500Z", - "start_time": "2026-01-29T15:07:32.610025600Z" + "end_time": "2026-01-30T14:21:18.967168700Z", + "start_time": "2026-01-30T14:21:18.928864500Z" } }, "source": [ @@ -2056,16 +2056,16 @@ "7 11 sophie_delarue Roubaix 33 \n", "8 13 Benoît Benoît Lens 15 \n", "\n", - " websites block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['somewebsite.com/users/rpz59'] 1 \n", - "2 ['roubaixlove.fr'] 2 \n", - "3 ['jacquesdupond.fr'] 0 \n", - "4 [] 1 \n", - "5 [] 2 \n", - "6 ['pythonensamusant.fr', 'lensfans.fr'] 3 \n", - "7 [] 2 \n", - "8 ['lensfans.fr'] 3 " + " websites _block \n", + "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", + "1 ['somewebsite.com/users/rpz59'] 1 \n", + "2 ['roubaixlove.fr'] 2 \n", + "3 ['jacquesdupond.fr'] 0 \n", + "4 [] 1 \n", + "5 [] 2 \n", + "6 ['pythonensamusant.fr', 'lensfans.fr'] 3 \n", + "7 [] 2 \n", + "8 ['lensfans.fr'] 3 " ], "text/html": [ "
\n", @@ -2091,7 +2091,7 @@ " City\n", " Age\n", " websites\n", - " block\n", + " _block\n", " \n", " \n", " \n", @@ -2181,12 +2181,12 @@ "
" ] }, - "execution_count": 159, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 159 + "execution_count": 23 }, { "cell_type": "markdown", @@ -2213,8 +2213,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:33.051456600Z", - "start_time": "2026-01-29T15:07:33.007723500Z" + "end_time": "2026-01-30T14:21:19.276047300Z", + "start_time": "2026-01-30T14:21:19.146886900Z" } }, "source": [ @@ -2240,21 +2240,21 @@ "12 13 Benoît Benoît Lens 15 \n", "13 12 Marcel Vandermersch Fourmies 48 \n", "\n", - " websites block \n", - "0 ['jeandaux.fr', 'lillefans.fr'] 0 \n", - "1 ['somewebsite.com/users/jacquesdupond', 'jacqu... 1 \n", - "2 ['jacquesdupond.fr'] 1 \n", - "3 ['somewebsite.com/users/rpz59'] 2 \n", - "4 [] 2 \n", - "5 ['roubaixlove.fr'] 3 \n", - "6 [] 3 \n", - "7 [] 3 \n", - "8 ['lensfans.fr', 'pythonensamusant.fr'] 4 \n", - "9 ['lorem.fr'] 5 \n", - "10 ['somewebsite.com/users/jajanne59'] 6 \n", - "11 ['pythonensamusant.fr', 'lensfans.fr'] 7 \n", - "12 ['lensfans.fr'] 7 \n", - "13 ['lesrecettesdemarcel.fr'] 8 " + " websites _block \n", + "0 ['jeandaux.fr', 'lillefans.fr'] 0 \n", + "1 ['somewebsite.com/users/jacquesdupond', 'jacqu... 1 \n", + "2 ['jacquesdupond.fr'] 1 \n", + "3 ['somewebsite.com/users/rpz59'] 2 \n", + "4 [] 2 \n", + "5 ['roubaixlove.fr'] 3 \n", + "6 [] 3 \n", + "7 [] 3 \n", + "8 ['lensfans.fr', 'pythonensamusant.fr'] 4 \n", + "9 ['lorem.fr'] 5 \n", + "10 ['somewebsite.com/users/jajanne59'] 6 \n", + "11 ['pythonensamusant.fr', 'lensfans.fr'] 7 \n", + "12 ['lensfans.fr'] 7 \n", + "13 ['lesrecettesdemarcel.fr'] 8 " ], "text/html": [ "
\n", @@ -2280,7 +2280,7 @@ " City\n", " Age\n", " websites\n", - " block\n", + " _block\n", " \n", " \n", " \n", @@ -2415,12 +2415,12 @@ "
" ] }, - "execution_count": 160, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 160 + "execution_count": 24 }, { "cell_type": "markdown", @@ -2443,8 +2443,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:33.425225900Z", - "start_time": "2026-01-29T15:07:33.285367100Z" + "end_time": "2026-01-30T14:21:19.820247800Z", + "start_time": "2026-01-30T14:21:19.653280100Z" } }, "source": [ @@ -2473,13 +2473,13 @@ "4 8 Sophie Delarue Roubaix 33 \n", "5 11 sophie_delarue Roubaix 33 \n", "\n", - " websites block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['somewebsite.com/users/rpz59'] 1 \n", - "3 [] 1 \n", - "4 [] 2 \n", - "5 [] 2 " + " websites _block \n", + "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", + "1 ['jacquesdupond.fr'] 0 \n", + "2 ['somewebsite.com/users/rpz59'] 1 \n", + "3 [] 1 \n", + "4 [] 2 \n", + "5 [] 2 " ], "text/html": [ "
\n", @@ -2505,7 +2505,7 @@ " City\n", " Age\n", " websites\n", - " block\n", + " _block\n", " \n", " \n", " \n", @@ -2568,12 +2568,12 @@ "
" ] }, - "execution_count": 161, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 161 + "execution_count": 25 }, { "cell_type": "markdown", @@ -2593,8 +2593,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:33.836934900Z", - "start_time": "2026-01-29T15:07:33.664956Z" + "end_time": "2026-01-30T14:21:20.335572Z", + "start_time": "2026-01-30T14:21:20.302358700Z" } }, "source": [ @@ -2621,12 +2621,12 @@ " frozenset({3, 11}): {\"Same 'City'\"}}" ] }, - "execution_count": 162, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 162 + "execution_count": 26 }, { "cell_type": "markdown", @@ -2646,8 +2646,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:34.005705200Z", - "start_time": "2026-01-29T15:07:33.958769500Z" + "end_time": "2026-01-30T14:21:20.409405100Z", + "start_time": "2026-01-30T14:21:20.374573700Z" } }, "source": [ @@ -2668,16 +2668,16 @@ "7 10 Caroline Dufour Lens 45 \n", "8 13 Benoît Benoît Lens 15 \n", "\n", - " websites block motive \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 (Same 'City') \n", - "1 ['jacquesdupond.fr'] 0 (Same 'City') \n", - "2 ['somewebsite.com/users/rpz59'] 1 (Same 'City') \n", - "3 [] 1 (Same 'City') \n", - "4 ['roubaixlove.fr'] 2 (Same 'City') \n", - "5 [] 2 (Same 'City') \n", - "6 [] 2 (Same 'City') \n", - "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 (Same 'City') \n", - "8 ['lensfans.fr'] 3 (Same 'City') " + " websites _block _motive \n", + "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 (Same 'City') \n", + "1 ['jacquesdupond.fr'] 0 (Same 'City') \n", + "2 ['somewebsite.com/users/rpz59'] 1 (Same 'City') \n", + "3 [] 1 (Same 'City') \n", + "4 ['roubaixlove.fr'] 2 (Same 'City') \n", + "5 [] 2 (Same 'City') \n", + "6 [] 2 (Same 'City') \n", + "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 (Same 'City') \n", + "8 ['lensfans.fr'] 3 (Same 'City') " ], "text/html": [ "
\n", @@ -2703,8 +2703,8 @@ " City\n", " Age\n", " websites\n", - " block\n", - " motive\n", + " _block\n", + " _motive\n", " \n", " \n", " \n", @@ -2803,12 +2803,12 @@ "
" ] }, - "execution_count": 163, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 163 + "execution_count": 27 }, { "cell_type": "markdown", @@ -2828,8 +2828,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:34.269628100Z", - "start_time": "2026-01-29T15:07:34.186432Z" + "end_time": "2026-01-30T14:21:20.612990700Z", + "start_time": "2026-01-30T14:21:20.483928200Z" } }, "source": [ @@ -2855,13 +2855,13 @@ "4 [] 3 Paul Delarue \n", "5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n", "\n", - " City_r Age_r websites_r block motive \n", - "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n", - "1 Phalempin 24 [] 1 (Same 'City') \n", - "2 Roubaix 33 [] 2 (Same 'City') \n", - "3 Roubaix 33 [] 2 (Same 'City') \n", - "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n", - "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') " + " City_r Age_r websites_r _block _motive \n", + "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n", + "1 Phalempin 24 [] 1 (Same 'City') \n", + "2 Roubaix 33 [] 2 (Same 'City') \n", + "3 Roubaix 33 [] 2 (Same 'City') \n", + "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n", + "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') " ], "text/html": [ "
\n", @@ -2892,8 +2892,8 @@ " City_r\n", " Age_r\n", " websites_r\n", - " block\n", - " motive\n", + " _block\n", + " _motive\n", " \n", " \n", " \n", @@ -2992,12 +2992,12 @@ "
" ] }, - "execution_count": 164, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 164 + "execution_count": 28 }, { "cell_type": "markdown", @@ -3010,8 +3010,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:34.787375Z", - "start_time": "2026-01-29T15:07:34.745314800Z" + "end_time": "2026-01-30T14:21:20.944670700Z", + "start_time": "2026-01-30T14:21:20.834495500Z" } }, "source": [ @@ -3023,13 +3023,13 @@ { "data": { "text/plain": [ - " id_l Name_l id_r Name_r block motive\n", - "0 1 Jacques Dupond 4 Jacques Dupont 0 (Same 'City')\n", - "1 2 Pierre Dusquesnes 5 pierre dusquesnes 1 (Same 'City')\n", - "2 3 Paul Delarue 11 sophie_delarue 2 (Same 'City')\n", - "3 8 Sophie Delarue 11 sophie_delarue 2 (Same 'City')\n", - "4 8 Sophie Delarue 3 Paul Delarue 2 (Same 'City')\n", - "5 10 Caroline Dufour 13 Benoît Benoît 3 (Same 'City')" + " id_l Name_l id_r Name_r _block _motive\n", + "0 1 Jacques Dupond 4 Jacques Dupont 0 (Same 'City')\n", + "1 2 Pierre Dusquesnes 5 pierre dusquesnes 1 (Same 'City')\n", + "2 3 Paul Delarue 11 sophie_delarue 2 (Same 'City')\n", + "3 8 Sophie Delarue 11 sophie_delarue 2 (Same 'City')\n", + "4 8 Sophie Delarue 3 Paul Delarue 2 (Same 'City')\n", + "5 10 Caroline Dufour 13 Benoît Benoît 3 (Same 'City')" ], "text/html": [ "
\n", @@ -3054,8 +3054,8 @@ " Name_l\n", " id_r\n", " Name_r\n", - " block\n", - " motive\n", + " _block\n", + " _motive\n", " \n", " \n", " \n", @@ -3118,12 +3118,12 @@ "
" ] }, - "execution_count": 165, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 165 + "execution_count": 29 }, { "cell_type": "markdown", @@ -3136,8 +3136,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:35.067637700Z", - "start_time": "2026-01-29T15:07:34.976540100Z" + "end_time": "2026-01-30T14:21:21.591044600Z", + "start_time": "2026-01-30T14:21:21.517777200Z" } }, "source": [ @@ -3163,13 +3163,13 @@ "4 [] 3 Paul Delarue \n", "5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n", "\n", - " City_r Age_r websites_r block motive \n", - "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n", - "1 Phalempin 24 [] 1 (Same 'City') \n", - "2 Roubaix 33 [] 2 (Same 'City') \n", - "3 Roubaix 33 [] 2 (Same 'City') \n", - "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n", - "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') " + " City_r Age_r websites_r _block _motive \n", + "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n", + "1 Phalempin 24 [] 1 (Same 'City') \n", + "2 Roubaix 33 [] 2 (Same 'City') \n", + "3 Roubaix 33 [] 2 (Same 'City') \n", + "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n", + "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') " ], "text/html": [ "
\n", @@ -3200,8 +3200,8 @@ " City_r\n", " Age_r\n", " websites_r\n", - " block\n", - " motive\n", + " _block\n", + " _motive\n", " \n", " \n", " \n", @@ -3300,20 +3300,20 @@ "
" ] }, - "execution_count": 166, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 166 + "execution_count": 30 }, { "cell_type": "code", "metadata": { "scrolled": true, "ExecuteTime": { - "end_time": "2026-01-29T15:07:35.517819300Z", - "start_time": "2026-01-29T15:07:35.345233200Z" + "end_time": "2026-01-30T14:21:21.867809800Z", + "start_time": "2026-01-30T14:21:21.674986800Z" } }, "source": [ @@ -3337,35 +3337,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['City', 'Age'], [])\n", + "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n", "Processing OverlapBlocker(['websites'], 1)\n" ] }, { "data": { "text/plain": [ - " id_l Name_l id_r Name_r block \\\n", - "0 1 Jacques Dupond 4 Jacques Dupont 0 \n", - "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n", - "2 1 Jacques Dupond 10 Caroline Dufour 0 \n", - "3 1 Jacques Dupond 4 Jacques Dupont 1 \n", - "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n", - "5 1 Jacques Dupond 10 Caroline Dufour 1 \n", - "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n", - "7 10 Caroline Dufour 13 Benoît Benoît 1 \n", - "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n", - "9 8 Sophie Delarue 11 sophie_delarue 3 \n", - "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n", - "11 10 Caroline Dufour 13 Benoît Benoît 4 \n", - "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n", + " id_l Name_l id_r Name_r _block \\\n", + "0 1 Jacques Dupond 4 Jacques Dupont 0 \n", + "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n", + "2 1 Jacques Dupond 10 Caroline Dufour 0 \n", + "3 1 Jacques Dupond 4 Jacques Dupont 1 \n", + "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n", + "5 1 Jacques Dupond 10 Caroline Dufour 1 \n", + "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n", + "7 10 Caroline Dufour 13 Benoît Benoît 1 \n", + "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n", + "9 8 Sophie Delarue 11 sophie_delarue 3 \n", + "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n", + "11 10 Caroline Dufour 13 Benoît Benoît 4 \n", + "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n", "\n", - " motive \n", - "0 (Same 'City', Same 'Age', >=1 overlap in 'webs... \n", - "1 (Same 'City', Same 'Age', >=1 overlap in 'webs... \n", - "2 (Same 'City', Same 'Age', >=1 overlap in 'webs... \n", - "3 (Same 'City', Same 'Age', >=1 overlap in 'webs... \n", - "4 (Same 'City', Same 'Age', >=1 overlap in 'webs... \n", - "5 (Same 'City', Same 'Age', >=1 overlap in 'webs... \n", + " _motive \n", + "0 (>=1 overlap in 'websites', Same 'City', Same ... \n", + "1 (>=1 overlap in 'websites', Same 'City', Same ... \n", + "2 (>=1 overlap in 'websites', Same 'City', Same ... \n", + "3 (>=1 overlap in 'websites', Same 'City', Same ... \n", + "4 (>=1 overlap in 'websites', Same 'City', Same ... \n", + "5 (>=1 overlap in 'websites', Same 'City', Same ... \n", "6 (>=1 overlap in 'websites') \n", "7 (>=1 overlap in 'websites') \n", "8 (Same 'City', Same 'Age') \n", @@ -3397,8 +3397,8 @@ " Name_l\n", " id_r\n", " Name_r\n", - " block\n", - " motive\n", + " _block\n", + " _motive\n", " \n", " \n", " \n", @@ -3409,7 +3409,7 @@ " 4\n", " Jacques Dupont\n", " 0\n", - " (Same 'City', Same 'Age', >=1 overlap in 'webs...\n", + " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 1\n", @@ -3418,7 +3418,7 @@ " 6\n", " Jean-Michel Python\n", " 0\n", - " (Same 'City', Same 'Age', >=1 overlap in 'webs...\n", + " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 2\n", @@ -3427,7 +3427,7 @@ " 10\n", " Caroline Dufour\n", " 0\n", - " (Same 'City', Same 'Age', >=1 overlap in 'webs...\n", + " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 3\n", @@ -3436,7 +3436,7 @@ " 4\n", " Jacques Dupont\n", " 1\n", - " (Same 'City', Same 'Age', >=1 overlap in 'webs...\n", + " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 4\n", @@ -3445,7 +3445,7 @@ " 6\n", " Jean-Michel Python\n", " 1\n", - " (Same 'City', Same 'Age', >=1 overlap in 'webs...\n", + " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 5\n", @@ -3454,7 +3454,7 @@ " 10\n", " Caroline Dufour\n", " 1\n", - " (Same 'City', Same 'Age', >=1 overlap in 'webs...\n", + " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 6\n", @@ -3524,12 +3524,12 @@ "" ] }, - "execution_count": 167, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 167 + "execution_count": 31 }, { "cell_type": "markdown", @@ -3545,8 +3545,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-29T15:07:35.899178900Z", - "start_time": "2026-01-29T15:07:35.837149Z" + "end_time": "2026-01-30T14:21:22.186415700Z", + "start_time": "2026-01-30T14:21:22.127304600Z" } }, "source": [ @@ -3557,28 +3557,28 @@ { "data": { "text/plain": [ - " id_l Name_l id_r Name_r block \\\n", - "0 1 Jacques Dupond 4 Jacques Dupont 0 \n", - "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n", - "2 1 Jacques Dupond 10 Caroline Dufour 0 \n", - "3 1 Jacques Dupond 4 Jacques Dupont 1 \n", - "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n", - "5 1 Jacques Dupond 10 Caroline Dufour 1 \n", - "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n", - "9 8 Sophie Delarue 11 sophie_delarue 3 \n", - "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n", - "7 10 Caroline Dufour 13 Benoît Benoît 1 \n", - "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n", - "11 10 Caroline Dufour 13 Benoît Benoît 4 \n", - "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n", + " id_l Name_l id_r Name_r _block \\\n", + "0 1 Jacques Dupond 4 Jacques Dupont 0 \n", + "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n", + "2 1 Jacques Dupond 10 Caroline Dufour 0 \n", + "3 1 Jacques Dupond 4 Jacques Dupont 1 \n", + "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n", + "5 1 Jacques Dupond 10 Caroline Dufour 1 \n", + "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n", + "9 8 Sophie Delarue 11 sophie_delarue 3 \n", + "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n", + "7 10 Caroline Dufour 13 Benoît Benoît 1 \n", + "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n", + "11 10 Caroline Dufour 13 Benoît Benoît 4 \n", + "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n", "\n", - " motive score \n", - "0 (Same 'City', Same 'Age', >=1 overlap in 'webs... 3 \n", - "1 (Same 'City', Same 'Age', >=1 overlap in 'webs... 3 \n", - "2 (Same 'City', Same 'Age', >=1 overlap in 'webs... 3 \n", - "3 (Same 'City', Same 'Age', >=1 overlap in 'webs... 3 \n", - "4 (Same 'City', Same 'Age', >=1 overlap in 'webs... 3 \n", - "5 (Same 'City', Same 'Age', >=1 overlap in 'webs... 3 \n", + " _motive score \n", + "0 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", + "1 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", + "2 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", + "3 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", + "4 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", + "5 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", "8 (Same 'City', Same 'Age') 2 \n", "9 (Same 'City', Same 'Age') 2 \n", "6 (>=1 overlap in 'websites') 1 \n", @@ -3610,8 +3610,8 @@ " Name_l\n", " id_r\n", " Name_r\n", - " block\n", - " motive\n", + " _block\n", + " _motive\n", " score\n", " \n", " \n", @@ -3623,7 +3623,7 @@ " 4\n", " Jacques Dupont\n", " 0\n", - " (Same 'City', Same 'Age', >=1 overlap in 'webs...\n", + " (>=1 overlap in 'websites', Same 'City', Same ...\n", " 3\n", " \n", " \n", @@ -3633,7 +3633,7 @@ " 6\n", " Jean-Michel Python\n", " 0\n", - " (Same 'City', Same 'Age', >=1 overlap in 'webs...\n", + " (>=1 overlap in 'websites', Same 'City', Same ...\n", " 3\n", " \n", " \n", @@ -3643,7 +3643,7 @@ " 10\n", " Caroline Dufour\n", " 0\n", - " (Same 'City', Same 'Age', >=1 overlap in 'webs...\n", + " (>=1 overlap in 'websites', Same 'City', Same ...\n", " 3\n", " \n", " \n", @@ -3653,7 +3653,7 @@ " 4\n", " Jacques Dupont\n", " 1\n", - " (Same 'City', Same 'Age', >=1 overlap in 'webs...\n", + " (>=1 overlap in 'websites', Same 'City', Same ...\n", " 3\n", " \n", " \n", @@ -3663,7 +3663,7 @@ " 6\n", " Jean-Michel Python\n", " 1\n", - " (Same 'City', Same 'Age', >=1 overlap in 'webs...\n", + " (>=1 overlap in 'websites', Same 'City', Same ...\n", " 3\n", " \n", " \n", @@ -3673,7 +3673,7 @@ " 10\n", " Caroline Dufour\n", " 1\n", - " (Same 'City', Same 'Age', >=1 overlap in 'webs...\n", + " (>=1 overlap in 'websites', Same 'City', Same ...\n", " 3\n", " \n", " \n", @@ -3751,12 +3751,12 @@ "" ] }, - "execution_count": 168, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 168 + "execution_count": 32 } ], "metadata": { From 1ec4d4d5c3b5fe7f11497f02a362fdd3e0437dfd Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Fri, 30 Jan 2026 15:55:49 +0100 Subject: [PATCH 7/7] style: formatting --- src/ms_blocking/ms_blocking.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index a7903c0..57ccd4c 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -1,9 +1,6 @@ from ms_blocking.utils import * # noqa: F403 -# TODO: "block_id" - - class BlockerNode: """Abstract class from which derive all classes in the module"""