\n",
@@ -3118,12 +3118,12 @@
""
]
},
- "execution_count": 165,
+ "execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
- "execution_count": 165
+ "execution_count": 29
},
{
"cell_type": "markdown",
@@ -3136,8 +3136,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-29T15:07:35.067637700Z",
- "start_time": "2026-01-29T15:07:34.976540100Z"
+ "end_time": "2026-01-30T14:21:21.591044600Z",
+ "start_time": "2026-01-30T14:21:21.517777200Z"
}
},
"source": [
@@ -3163,13 +3163,13 @@
"4 [] 3 Paul Delarue \n",
"5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n",
"\n",
- " City_r Age_r websites_r block motive \n",
- "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n",
- "1 Phalempin 24 [] 1 (Same 'City') \n",
- "2 Roubaix 33 [] 2 (Same 'City') \n",
- "3 Roubaix 33 [] 2 (Same 'City') \n",
- "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n",
- "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') "
+ " City_r Age_r websites_r _block _motive \n",
+ "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n",
+ "1 Phalempin 24 [] 1 (Same 'City') \n",
+ "2 Roubaix 33 [] 2 (Same 'City') \n",
+ "3 Roubaix 33 [] 2 (Same 'City') \n",
+ "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n",
+ "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') "
],
"text/html": [
"\n",
@@ -3200,8 +3200,8 @@
"
City_r | \n",
" Age_r | \n",
" websites_r | \n",
- " block | \n",
- " motive | \n",
+ " _block | \n",
+ " _motive | \n",
" \n",
" \n",
" \n",
@@ -3300,20 +3300,20 @@
""
]
},
- "execution_count": 166,
+ "execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
- "execution_count": 166
+ "execution_count": 30
},
{
"cell_type": "code",
"metadata": {
"scrolled": true,
"ExecuteTime": {
- "end_time": "2026-01-29T15:07:35.517819300Z",
- "start_time": "2026-01-29T15:07:35.345233200Z"
+ "end_time": "2026-01-30T14:21:21.867809800Z",
+ "start_time": "2026-01-30T14:21:21.674986800Z"
}
},
"source": [
@@ -3337,35 +3337,35 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processing AttributeEquivalenceBlocker(['City', 'Age'], [])\n",
+ "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n",
"Processing OverlapBlocker(['websites'], 1)\n"
]
},
{
"data": {
"text/plain": [
- " id_l Name_l id_r Name_r block \\\n",
- "0 1 Jacques Dupond 4 Jacques Dupont 0 \n",
- "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n",
- "2 1 Jacques Dupond 10 Caroline Dufour 0 \n",
- "3 1 Jacques Dupond 4 Jacques Dupont 1 \n",
- "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n",
- "5 1 Jacques Dupond 10 Caroline Dufour 1 \n",
- "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n",
- "7 10 Caroline Dufour 13 Benoît Benoît 1 \n",
- "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n",
- "9 8 Sophie Delarue 11 sophie_delarue 3 \n",
- "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n",
- "11 10 Caroline Dufour 13 Benoît Benoît 4 \n",
- "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n",
+ " id_l Name_l id_r Name_r _block \\\n",
+ "0 1 Jacques Dupond 4 Jacques Dupont 0 \n",
+ "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n",
+ "2 1 Jacques Dupond 10 Caroline Dufour 0 \n",
+ "3 1 Jacques Dupond 4 Jacques Dupont 1 \n",
+ "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n",
+ "5 1 Jacques Dupond 10 Caroline Dufour 1 \n",
+ "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n",
+ "7 10 Caroline Dufour 13 Benoît Benoît 1 \n",
+ "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n",
+ "9 8 Sophie Delarue 11 sophie_delarue 3 \n",
+ "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n",
+ "11 10 Caroline Dufour 13 Benoît Benoît 4 \n",
+ "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n",
"\n",
- " motive \n",
- "0 (Same 'City', Same 'Age', >=1 overlap in 'webs... \n",
- "1 (Same 'City', Same 'Age', >=1 overlap in 'webs... \n",
- "2 (Same 'City', Same 'Age', >=1 overlap in 'webs... \n",
- "3 (Same 'City', Same 'Age', >=1 overlap in 'webs... \n",
- "4 (Same 'City', Same 'Age', >=1 overlap in 'webs... \n",
- "5 (Same 'City', Same 'Age', >=1 overlap in 'webs... \n",
+ " _motive \n",
+ "0 (>=1 overlap in 'websites', Same 'City', Same ... \n",
+ "1 (>=1 overlap in 'websites', Same 'City', Same ... \n",
+ "2 (>=1 overlap in 'websites', Same 'City', Same ... \n",
+ "3 (>=1 overlap in 'websites', Same 'City', Same ... \n",
+ "4 (>=1 overlap in 'websites', Same 'City', Same ... \n",
+ "5 (>=1 overlap in 'websites', Same 'City', Same ... \n",
"6 (>=1 overlap in 'websites') \n",
"7 (>=1 overlap in 'websites') \n",
"8 (Same 'City', Same 'Age') \n",
@@ -3397,8 +3397,8 @@
" Name_l | \n",
" id_r | \n",
" Name_r | \n",
- " block | \n",
- " motive | \n",
+ " _block | \n",
+ " _motive | \n",
" \n",
" \n",
" \n",
@@ -3409,7 +3409,7 @@
" 4 | \n",
" Jacques Dupont | \n",
" 0 | \n",
- " (Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
+ " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
" \n",
" \n",
" | 1 | \n",
@@ -3418,7 +3418,7 @@
" 6 | \n",
" Jean-Michel Python | \n",
" 0 | \n",
- " (Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
+ " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
"
\n",
" \n",
" | 2 | \n",
@@ -3427,7 +3427,7 @@
" 10 | \n",
" Caroline Dufour | \n",
" 0 | \n",
- " (Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
+ " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
"
\n",
" \n",
" | 3 | \n",
@@ -3436,7 +3436,7 @@
" 4 | \n",
" Jacques Dupont | \n",
" 1 | \n",
- " (Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
+ " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
"
\n",
" \n",
" | 4 | \n",
@@ -3445,7 +3445,7 @@
" 6 | \n",
" Jean-Michel Python | \n",
" 1 | \n",
- " (Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
+ " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
"
\n",
" \n",
" | 5 | \n",
@@ -3454,7 +3454,7 @@
" 10 | \n",
" Caroline Dufour | \n",
" 1 | \n",
- " (Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
+ " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
"
\n",
" \n",
" | 6 | \n",
@@ -3524,12 +3524,12 @@
""
]
},
- "execution_count": 167,
+ "execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
- "execution_count": 167
+ "execution_count": 31
},
{
"cell_type": "markdown",
@@ -3545,8 +3545,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-29T15:07:35.899178900Z",
- "start_time": "2026-01-29T15:07:35.837149Z"
+ "end_time": "2026-01-30T14:21:22.186415700Z",
+ "start_time": "2026-01-30T14:21:22.127304600Z"
}
},
"source": [
@@ -3557,28 +3557,28 @@
{
"data": {
"text/plain": [
- " id_l Name_l id_r Name_r block \\\n",
- "0 1 Jacques Dupond 4 Jacques Dupont 0 \n",
- "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n",
- "2 1 Jacques Dupond 10 Caroline Dufour 0 \n",
- "3 1 Jacques Dupond 4 Jacques Dupont 1 \n",
- "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n",
- "5 1 Jacques Dupond 10 Caroline Dufour 1 \n",
- "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n",
- "9 8 Sophie Delarue 11 sophie_delarue 3 \n",
- "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n",
- "7 10 Caroline Dufour 13 Benoît Benoît 1 \n",
- "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n",
- "11 10 Caroline Dufour 13 Benoît Benoît 4 \n",
- "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n",
+ " id_l Name_l id_r Name_r _block \\\n",
+ "0 1 Jacques Dupond 4 Jacques Dupont 0 \n",
+ "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n",
+ "2 1 Jacques Dupond 10 Caroline Dufour 0 \n",
+ "3 1 Jacques Dupond 4 Jacques Dupont 1 \n",
+ "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n",
+ "5 1 Jacques Dupond 10 Caroline Dufour 1 \n",
+ "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n",
+ "9 8 Sophie Delarue 11 sophie_delarue 3 \n",
+ "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n",
+ "7 10 Caroline Dufour 13 Benoît Benoît 1 \n",
+ "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n",
+ "11 10 Caroline Dufour 13 Benoît Benoît 4 \n",
+ "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n",
"\n",
- " motive score \n",
- "0 (Same 'City', Same 'Age', >=1 overlap in 'webs... 3 \n",
- "1 (Same 'City', Same 'Age', >=1 overlap in 'webs... 3 \n",
- "2 (Same 'City', Same 'Age', >=1 overlap in 'webs... 3 \n",
- "3 (Same 'City', Same 'Age', >=1 overlap in 'webs... 3 \n",
- "4 (Same 'City', Same 'Age', >=1 overlap in 'webs... 3 \n",
- "5 (Same 'City', Same 'Age', >=1 overlap in 'webs... 3 \n",
+ " _motive score \n",
+ "0 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
+ "1 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
+ "2 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
+ "3 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
+ "4 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
+ "5 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
"8 (Same 'City', Same 'Age') 2 \n",
"9 (Same 'City', Same 'Age') 2 \n",
"6 (>=1 overlap in 'websites') 1 \n",
@@ -3610,8 +3610,8 @@
" Name_l | \n",
" id_r | \n",
" Name_r | \n",
- " block | \n",
- " motive | \n",
+ " _block | \n",
+ " _motive | \n",
" score | \n",
"
\n",
" \n",
@@ -3623,7 +3623,7 @@
" 4 | \n",
" Jacques Dupont | \n",
" 0 | \n",
- " (Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
+ " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
" 3 | \n",
" \n",
" \n",
@@ -3633,7 +3633,7 @@
" | 6 | \n",
" Jean-Michel Python | \n",
" 0 | \n",
- " (Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
+ " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
" 3 | \n",
"
\n",
" \n",
@@ -3643,7 +3643,7 @@
" | 10 | \n",
" Caroline Dufour | \n",
" 0 | \n",
- " (Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
+ " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
" 3 | \n",
"
\n",
" \n",
@@ -3653,7 +3653,7 @@
" | 4 | \n",
" Jacques Dupont | \n",
" 1 | \n",
- " (Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
+ " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
" 3 | \n",
"
\n",
" \n",
@@ -3663,7 +3663,7 @@
" | 6 | \n",
" Jean-Michel Python | \n",
" 1 | \n",
- " (Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
+ " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
" 3 | \n",
"
\n",
" \n",
@@ -3673,7 +3673,7 @@
" | 10 | \n",
" Caroline Dufour | \n",
" 1 | \n",
- " (Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
+ " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
" 3 | \n",
"
\n",
" \n",
@@ -3751,12 +3751,12 @@
""
]
},
- "execution_count": 168,
+ "execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
- "execution_count": 168
+ "execution_count": 32
}
],
"metadata": {
diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py
index d6ad2fb..57ccd4c 100644
--- a/src/ms_blocking/ms_blocking.py
+++ b/src/ms_blocking/ms_blocking.py
@@ -1,238 +1,37 @@
-import random
-from itertools import combinations
-from collections import Counter
-
from ms_blocking.utils import * # noqa: F403
-def merge_blockers(left, right):
- """
- Convert two blockers into a single one for performance purposes
- """
-
- if (
- type(left) is AttributeEquivalenceBlocker
- and type(right) is AttributeEquivalenceBlocker
- and left.normalize == right.normalize
- and left.must_not_be_different == right.must_not_be_different
- ):
- return AttributeEquivalenceBlocker(
- blocking_columns=left.blocking_columns + right.blocking_columns,
- normalize_strings=left.normalize,
- must_not_be_different=left.must_not_be_different,
- )
-
- elif (
- type(left) is OverlapBlocker
- and type(right) is OverlapBlocker
- and left.normalize == right.normalize
- and left.overlap == right.overlap
- and left.word_level == right.word_level
- ):
- return OverlapBlocker(
- blocking_columns=left.blocking_columns + right.blocking_columns,
- normalize_strings=left.normalize,
- overlap=left.overlap,
- word_level=left.word_level,
- )
-
- elif (
- type(left) is AttributeEquivalenceBlocker
- and type(right) is OverlapBlocker
- and left.normalize == right.normalize
- ):
- return MixedBlocker(
- equivalence_columns=left.blocking_columns,
- overlap_columns=right.blocking_columns,
- normalize_strings=left.normalize,
- overlap=right.overlap,
- word_level=right.word_level,
- )
-
- elif (
- type(left) is OverlapBlocker
- and type(right) is AttributeEquivalenceBlocker
- and left.normalize == right.normalize
- ):
- return MixedBlocker(
- equivalence_columns=right.blocking_columns,
- overlap_columns=left.blocking_columns,
- normalize_strings=left.normalize,
- overlap=left.overlap,
- word_level=left.word_level,
- )
-
- elif (
- type(left) is MixedBlocker
- and type(right) is MixedBlocker
- and left.normalize == right.normalize
- and left.overlap == right.overlap
- and left.word_level == right.word_level
- ):
- return MixedBlocker(
- equivalence_columns=left.equivalence_columns + right.equivalence_columns,
- overlap_columns=left.overlap_columns + right.overlap_columns,
- must_not_be_different=list(
- set(left.must_not_be_different + right.must_not_be_different)
- ),
- normalize_strings=left.normalize,
- overlap=left.overlap,
- word_level=left.word_level,
- )
-
- elif (
- type(left) is MixedBlocker
- and type(right) is AttributeEquivalenceBlocker
- and left.normalize == right.normalize
- ):
- return MixedBlocker(
- equivalence_columns=left.equivalence_columns + right.blocking_columns,
- overlap_columns=left.overlap_columns,
- must_not_be_different=list(
- set(left.must_not_be_different + right.must_not_be_different)
- ),
- normalize_strings=left.normalize,
- overlap=left.overlap,
- word_level=left.word_level,
- )
-
- elif (
- type(left) is AttributeEquivalenceBlocker
- and type(right) is MixedBlocker
- and left.normalize == right.normalize
- ):
- return MixedBlocker(
- equivalence_columns=left.blocking_columns + right.equivalence_columns,
- overlap_columns=right.overlap_columns,
- must_not_be_different=list(
- set(left.must_not_be_different + right.must_not_be_different)
- ),
- normalize_strings=left.normalize,
- overlap=right.overlap,
- word_level=right.word_level,
- )
-
- elif (
- type(left) is MixedBlocker
- and type(right) is OverlapBlocker
- and left.normalize == right.normalize
- and left.overlap == right.overlap
- and left.word_level == right.word_level
- ):
- return MixedBlocker(
- equivalence_columns=left.equivalence_columns,
- overlap_columns=left.overlap_columns + right.blocking_columns,
- must_not_be_different=left.must_not_be_different,
- normalize_strings=left.normalize,
- overlap=left.overlap,
- word_level=left.word_level,
- )
-
- elif (
- type(left) is OverlapBlocker
- and type(right) is MixedBlocker
- and left.normalize == right.normalize
- and left.overlap == right.overlap
- and left.word_level == right.word_level
- ):
- return MixedBlocker(
- equivalence_columns=right.equivalence_columns,
- overlap_columns=left.blocking_columns + right.overlap_columns,
- must_not_be_different=right.must_not_be_different,
- normalize_strings=left.normalize,
- overlap=left.overlap,
- word_level=left.word_level,
- )
- else:
- return AndNode(left, right)
-
-
-def must_not_be_different_apply(
- temp_data, blocking_columns, must_not_be_different_columns
-):
- """Re-block DataFrame on a second column, where we require non-difference rather than equality"""
-
- temp_data["block_id"] = temp_data.groupby(blocking_columns).ngroup()
- temp_data = temp_data[temp_data["block_id"].duplicated(keep=False)]
-
- reconstructed_data = pd.DataFrame(columns=temp_data.columns)
- for block in temp_data["block_id"].unique():
- # noinspection PyArgumentList
- current_block = (
- temp_data[temp_data["block_id"] == block]
- .sort_values(must_not_be_different_columns)
- .copy()
- )
- if (
- len(current_block[current_block[must_not_be_different_columns].notnull()])
- == 0
- ): # All nulls
- random_string = "".join(
- random.choices("ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", k=10)
- ) # As long as the string is not already in the column...
- # There must be a better way to do it...
- current_block[must_not_be_different_columns] = (
- current_block[must_not_be_different_columns]
- .astype(str)
- .fillna(random_string)
- )
- else:
- current_block[must_not_be_different_columns] = (
- current_block[must_not_be_different_columns].astype(str).ffill()
- )
- if len(reconstructed_data) == 0:
- reconstructed_data = current_block
- else:
- reconstructed_data = pd.concat([reconstructed_data, current_block])
- return reconstructed_data
-
-
-def block_overlap(groups, overlap):
- coords = {
- frozenset(pair) for group_list in groups for pair in combinations(group_list, 2)
- }
-
- if overlap > 1:
- coords = [ # In this specific case, we want to keep duplicates to track the number of occurences of a pair
- frozenset(pair)
- for group_list in groups
- for pair in combinations(group_list, 2)
- ]
- # Filter pairs that fulfill the minimum overlap condition
- occurences_dict = Counter(coords)
- coords = {
- p for p in occurences_dict if occurences_dict[p] >= overlap
- } # The collection of pairs that fulfill the overlap condition
-
- return coords
-
-
-def add_motives_to_coords(coords, explanations):
- return {pair: explanations for pair in coords}
-
-
-class Node:
+class BlockerNode:
"""Abstract class from which derive all classes in the module"""
def __init__(self, left=None, right=None):
self.left = left
self.right = right
+ self.blocking_columns = None
+ self.equivalence_columns = None
+ self.overlap_columns = None
self.overlap = None
self.normalize = None
self.must_not_be_different = None
self.word_level = None
def __and__(self, other):
- return merge_blockers(self, other)
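+ # b & b is just b: skip merging when both operands are the same blocker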
+ if self == other:
+ return self
+ else:
+ return merge_blockers(self, other)
def __or__(self, other):
- return OrNode(self, other)
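+ # b | b is just b: skip building an OrNode over identical operands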
+ if self == other:
+ return self
+ else:
+ return OrNode(self, other)
def __repr__(self):
return f"Node{{{self.left}, {self.right}}}"
-class AndNode(Node):
+class AndNode(BlockerNode):
"""Used to compute the intersection of the outputs of two Blockers."""
def __init__(self, left, right):
@@ -242,7 +41,7 @@ def __repr__(self):
return f"AndNode{{{self.left}, {self.right}}}"
def __eq__(self, other):
- return self.left==other.left and self.right==other.right
+ return self.left == other.left and self.right == other.right
def block(self, df, motives=False):
# In order not to perform redundant computations, we first filter out the rows that were not considered by the first blocker before running the second blocker
@@ -258,14 +57,14 @@ def block(self, df, motives=False):
if id_lists
else pd.DataFrame(columns=df.columns)
)
-
+ # Rows that appear in no pair after the first blocking step cannot appear in any pair of the intersection
coords_right = self.right.block(df_shortened, motives=motives)
result = merge_blocks_and(coords_left, coords_right)
return result
-class OrNode(Node):
+class OrNode(BlockerNode):
"""Used to compute the union of the outputs of two Blockers."""
def __init__(self, left, right):
@@ -275,9 +74,10 @@ def __repr__(self):
return f"OrNode{{{self.left}, {self.right}}}"
def __eq__(self, other):
- return self.left==other.left and self.right==other.right
+ return self.left == other.left and self.right == other.right
def block(self, df, motives=False):
+ # Note: for performance, it would be wise to remove rows that are already paired with all other rows, though this case should be pretty rare in real situations
coords_left = self.left.block(df, motives=motives)
coords_right = self.right.block(df, motives=motives)
@@ -286,7 +86,7 @@ def block(self, df, motives=False):
return result
-class AttributeEquivalenceBlocker(Node): # Leaf
+class AttributeEquivalenceBlocker(BlockerNode): # Leaf
"""To regroup rows based on equality across columns."""
def __init__(
@@ -373,6 +173,7 @@ def block(self, data, motives=False):
return set()
# Use the DataFrame index for grouping and forming pairs
+ # Using frozensets since they are hashable and can thus be used as dictionary keys
groups = temp_data.groupby(
self.blocking_columns + self.must_not_be_different
).apply(lambda x: frozenset(x.index), include_groups=False)
@@ -391,7 +192,7 @@ def block(self, data, motives=False):
return set(coords) # set is unnecessary
-class OverlapBlocker(Node): # Leaf
+class OverlapBlocker(BlockerNode): # Leaf
"""To regroup rows based on overlap of one or more columns."""
def __init__(
@@ -466,6 +267,7 @@ def block(self, data, motives=False):
return set()
# Use the DataFrame index for grouping and forming pairs
+ # Using frozensets since they are hashable and can thus be used as dictionary keys
groups = temp_data.groupby(self.blocking_columns).apply(
lambda x: frozenset(x.index), include_groups=False
)
@@ -482,7 +284,7 @@ def block(self, data, motives=False):
return set(coords)
-class MixedBlocker(Node): # Leaf; For ANDs and RAM
+class MixedBlocker(BlockerNode): # Leaf; For ANDs and RAM
"""Represent the intersection of an AttributeEquivalenceBlocker and an OverlapBlocker.
Designed for performance and RAM efficiency.
"""
@@ -604,6 +406,7 @@ def block(self, data, motives=False):
must_not_be_different_columns=self.must_not_be_different,
)
+ # Using frozensets since they are hashable and can thus be used as dictionary keys
groups_equivalence = temp_data.groupby(self.equivalence_columns).apply(
lambda x: frozenset(x.index), include_groups=False
)
@@ -633,4 +436,161 @@ def block(self, data, motives=False):
return set(coords)
+def merge_blockers(
+ left: BlockerNode, right: BlockerNode
+) -> AttributeEquivalenceBlocker | OverlapBlocker | MixedBlocker | AndNode:
+ """Convert two blockers into a single one for performance purposes
+
+ This function outputs a new blocker that combines the behaviour of the two input blockers, avoiding redundant passes over the data.
+
+ Parameters
+ ----------
+ left : BlockerNode
+ Blocker that represents the first condition
+
+ right : BlockerNode
+ Blocker that represents the second condition
+
+ Returns
+ -------
+ AttributeEquivalenceBlocker|OverlapBlocker|MixedBlocker|AndNode
+ Blocker that represents both conditions
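+
+ Examples
+ --------
+ A minimal sketch, assuming default constructor arguments on both sides:
+
+ >>> merged = merge_blockers(
+ ... AttributeEquivalenceBlocker(["City"]),
+ ... OverlapBlocker(["websites"]),
+ ... )
+ >>> type(merged).__name__
+ 'MixedBlocker'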
+ """
+ if (
+ type(left) is AttributeEquivalenceBlocker
+ and type(right) is AttributeEquivalenceBlocker
+ and left.normalize == right.normalize
+ and left.must_not_be_different == right.must_not_be_different
+ ):
+ return AttributeEquivalenceBlocker(
+ blocking_columns=left.blocking_columns + right.blocking_columns,
+ normalize_strings=left.normalize,
+ must_not_be_different=left.must_not_be_different,
+ )
+
+ elif (
+ type(left) is OverlapBlocker
+ and type(right) is OverlapBlocker
+ and left.normalize == right.normalize
+ and left.overlap == right.overlap
+ and left.word_level == right.word_level
+ ):
+ return OverlapBlocker(
+ blocking_columns=left.blocking_columns + right.blocking_columns,
+ normalize_strings=left.normalize,
+ overlap=left.overlap,
+ word_level=left.word_level,
+ )
+
+ elif (
+ type(left) is AttributeEquivalenceBlocker
+ and type(right) is OverlapBlocker
+ and left.normalize == right.normalize
+ ):
+ return MixedBlocker(
+ equivalence_columns=left.blocking_columns,
+ overlap_columns=right.blocking_columns,
+ normalize_strings=left.normalize,
+ overlap=right.overlap,
+ word_level=right.word_level,
+ )
+
+ elif (
+ type(left) is OverlapBlocker
+ and type(right) is AttributeEquivalenceBlocker
+ and left.normalize == right.normalize
+ ):
+ return MixedBlocker(
+ equivalence_columns=right.blocking_columns,
+ overlap_columns=left.blocking_columns,
+ normalize_strings=left.normalize,
+ overlap=left.overlap,
+ word_level=left.word_level,
+ )
+
+ elif (
+ type(left) is MixedBlocker
+ and type(right) is MixedBlocker
+ and left.normalize == right.normalize
+ and left.overlap == right.overlap
+ and left.word_level == right.word_level
+ ):
+ return MixedBlocker(
+ equivalence_columns=left.equivalence_columns + right.equivalence_columns,
+ overlap_columns=left.overlap_columns + right.overlap_columns,
+ must_not_be_different=list(
+ set(left.must_not_be_different + right.must_not_be_different)
+ ),
+ normalize_strings=left.normalize,
+ overlap=left.overlap,
+ word_level=left.word_level,
+ )
+
+ elif (
+ type(left) is MixedBlocker
+ and type(right) is AttributeEquivalenceBlocker
+ and left.normalize == right.normalize
+ ):
+ return MixedBlocker(
+ equivalence_columns=left.equivalence_columns + right.blocking_columns,
+ overlap_columns=left.overlap_columns,
+ must_not_be_different=list(
+ set(left.must_not_be_different + right.must_not_be_different)
+ ),
+ normalize_strings=left.normalize,
+ overlap=left.overlap,
+ word_level=left.word_level,
+ )
+
+ elif (
+ type(left) is AttributeEquivalenceBlocker
+ and type(right) is MixedBlocker
+ and left.normalize == right.normalize
+ ):
+ return MixedBlocker(
+ equivalence_columns=left.blocking_columns + right.equivalence_columns,
+ overlap_columns=right.overlap_columns,
+ must_not_be_different=list(
+ set(left.must_not_be_different + right.must_not_be_different)
+ ),
+ normalize_strings=left.normalize,
+ overlap=right.overlap,
+ word_level=right.word_level,
+ )
+
+ elif (
+ type(left) is MixedBlocker
+ and type(right) is OverlapBlocker
+ and left.normalize == right.normalize
+ and left.overlap == right.overlap
+ and left.word_level == right.word_level
+ ):
+ return MixedBlocker(
+ equivalence_columns=left.equivalence_columns,
+ overlap_columns=left.overlap_columns + right.blocking_columns,
+ must_not_be_different=left.must_not_be_different,
+ normalize_strings=left.normalize,
+ overlap=left.overlap,
+ word_level=left.word_level,
+ )
+
+ elif (
+ type(left) is OverlapBlocker
+ and type(right) is MixedBlocker
+ and left.normalize == right.normalize
+ and left.overlap == right.overlap
+ and left.word_level == right.word_level
+ ):
+ return MixedBlocker(
+ equivalence_columns=right.equivalence_columns,
+ overlap_columns=left.blocking_columns + right.overlap_columns,
+ must_not_be_different=right.must_not_be_different,
+ normalize_strings=left.normalize,
+ overlap=left.overlap,
+ word_level=left.word_level,
+ )
+ else:
+ return AndNode(left, right)
+
+
# /!\ TODO: make class for motives (+ pair, motive dict)?
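
A minimal sketch of the blocker algebra this module implements (the `msb` alias and `get_users` follow the test suite below; exact reprs may differ): `&` merges compatible leaf blockers into a single pass via `merge_blockers`, `|` builds an `OrNode`, and `b & b` / `b | b` now collapse to `b`.

    # Two equivalence conditions merge into one AttributeEquivalenceBlocker over both columns
    city_age = msb.AttributeEquivalenceBlocker(["City"]) & msb.AttributeEquivalenceBlocker(["Age"])
    # Union with an overlap condition stays an OrNode; each side is processed separately
    combined = city_age | msb.OverlapBlocker(["websites"])
    links = combined.block(get_users(), motives=True)
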
diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py
index da01fa6..837645f 100644
--- a/src/ms_blocking/utils.py
+++ b/src/ms_blocking/utils.py
@@ -5,7 +5,10 @@
from scipy.sparse.csgraph import connected_components
import pandas as pd
import networkx as nx
+import random
+from collections import Counter
+from itertools import combinations
from typing import List, Set, Iterable, Dict, Collection, Any
Columns = List[str]
@@ -21,14 +24,14 @@
def remove_rows_if_value_appears_only_once(
data: pd.DataFrame, cols: Columns
) -> pd.DataFrame:
- """Drops rows of a Pandas DataFrame where a certain column's values appears only once.
+ """Drop rows of a Pandas DataFrame where a certain column's values appears only once.
Ensures all elements of provided columns appear at least twice in their column
Parameters
----------
data : DataFrame
- The DataFrame to preprocess
+ DataFrame to preprocess
cols : List[str]
List of columns where rows that contain non-duplicated elements shall be discarded
@@ -131,7 +134,7 @@ def normalize_function(string: Any) -> Any:
Parameters
----------
string : Any
- The text to preprocess
+ Text to preprocess
Returns
-------
@@ -160,7 +163,7 @@ def normalize(text: Any) -> Any:
Parameters
----------
text : Any
- The text(s) to preprocess
+ Text(s) to preprocess
Returns
-------
@@ -191,7 +194,7 @@ def flatten(list_of_iterables_: Collection[Iterable]) -> List[Any] | None:
Parameters
----------
list_of_iterables_ : Collection[Iterable]
- The list to flatten
+ List to flatten
Returns
-------
@@ -338,17 +341,6 @@ def add_blocks_to_dataset(
id_l rank_l id_r rank_r block
0 0 first 2 first 0
"""
- if output_columns is None:
- output_columns = data.columns
- data = data[output_columns].copy()
-
- if "motive" in data.columns:
- print("Renaming 'motive' column to 'motive_old'")
- data = data.rename(columns={"motive": "motive_old"})
-
- if "block" in data.columns:
- print("Renaming 'block' column to 'block_old'")
- data = data.rename(columns={"block": "block_old"})
if show_as_pairs and keep_ungrouped_rows:
raise ValueError("Cannot both return pairs and keep ungrouped rows")
@@ -361,6 +353,19 @@ def add_blocks_to_dataset(
if not data.index.is_unique:
raise ValueError("DataFrame index must be unique to be used as an identifier.")
+ if "_motive" in data.columns:
+ if motives:
+ raise ValueError(
+ "Please rename existing '_motive' column OR do not pass 'motives=True'"
+ )
+
+ if "_block" in data.columns:
+ raise ValueError("Please rename existing '_block' column")
+
+ if output_columns is None:
+ output_columns = data.columns
+ data = data[output_columns].copy()
+
if len(coords) == 0 and not keep_ungrouped_rows: # Empty graph
if show_as_pairs:
columns = [col + "_l" for col in data.columns] + [
@@ -411,16 +416,16 @@ def add_blocks_to_dataset(
output_data = pd.concat([output_data, current_row])
# Assign blocks to rows based on their original index
- output_data["block"] = output_data.index.map(matcher)
+ output_data["_block"] = output_data.index.map(matcher)
if not merge_blocks:
- output_data = output_data.explode("block")
+ output_data = output_data.explode("_block")
if keep_ungrouped_rows:
- output_data["block"] = output_data["block"].fillna(-1)
+ output_data["_block"] = output_data["_block"].fillna(-1)
matcher_ungrouped_rows = {}
block_temp = []
i = 0 # Track # of blocks processed
- for b in output_data["block"]:
+ for b in output_data["_block"]:
if b == -1:
block_temp.append(i)
i += 1
@@ -430,19 +435,19 @@ def add_blocks_to_dataset(
i += 1
else:
block_temp.append(matcher_ungrouped_rows[b])
- output_data["block"] = block_temp
+ output_data["_block"] = block_temp
else:
if not show_as_pairs:
output_data = output_data[
- output_data["block"].duplicated(keep=False)
- & output_data["block"].notna()
+ output_data["_block"].duplicated(keep=False)
+ & output_data["_block"].notna()
]
- output_data.loc[:, ["block"]] = start_from_zero(output_data["block"])
+ output_data.loc[:, ["_block"]] = start_from_zero(output_data["_block"])
if sort:
# Sort by block, then by original index
- sort_cols = ["block"]
+ sort_cols = ["_block"]
if output_data.index.name:
output_data = output_data.sort_values(
sort_cols + [output_data.index.name]
@@ -456,7 +461,7 @@ def add_blocks_to_dataset(
output_data = output_data.set_index(output_data.columns[0])
if motives:
- output_data["motive"] = ""
+ output_data["_motive"] = ""
id_list = flatten(coords.keys())
motive_matcher = {
row_id: frozenset(
@@ -467,13 +472,14 @@ def add_blocks_to_dataset(
)
for row_id in id_list
}
- output_data["motive"] = output_data.index.map(motive_matcher)
+ output_data["_motive"] = output_data.index.map(motive_matcher)
- if "block" not in output_data.columns: # Empty coords
- output_data["block"] = -1
+ if "_block" not in output_data.columns: # Empty coords
+ output_data["_block"] = -1
output_data = output_data.reset_index(drop=True)
- output_data["block"] = output_data["block"].astype(int)
+ output_data["_block"] = output_data["_block"].astype(int)
+
return output_data
@@ -502,7 +508,7 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]:
Parameters
----------
s : str
- The stringified representation of a list e.g. "['string 1', 'string 2', ...]"
+ Stringified representation of a list e.g. "['string 1', 'string 2', ...]"
word_level : bool
Whether to return a list of all words within s instead of a list of each comma-separated element
@@ -510,7 +516,7 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]:
Returns
-------
List[str]
- A python list based on s
+ s parsed into a list of strings
Examples
--------
@@ -546,16 +552,17 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]:
else:
return [s for s in cleaned_items if len(s) > 0]
-def scoring(data: pd.DataFrame, motives_column: str="motive") -> pd.Series:
+
+def scoring(data: pd.DataFrame, motives_column: str = "_motive") -> pd.Series:
"""Add a score to a blocked DataFrame based on the number of motives
Parameters
----------
data : DataFrame
- A DataFrame with motives
+ DataFrame with motives
motives_column : str
- The name of the column containing the motives
+ Name of the column containing the motives
Returns
-------
@@ -565,11 +572,149 @@ def scoring(data: pd.DataFrame, motives_column: str="motive") -> pd.Series:
# Check that we do have motives
if motives_column not in data.columns:
- raise ValueError(f"Specified motives column \"{motives_column}\" does not exist")
+ if motives_column == "_motive":
+ raise ValueError("No motives in DataFrame")
+ else:
+ raise ValueError(
+ f'Specified motives column "{motives_column}" does not exist'
+ )
if "score" in data.columns:
print("Renaming 'score' column to 'score_old'")
data = data.rename(columns={"score": "score_old"})
scores = data[motives_column].apply(len)
- return scores
\ No newline at end of file
+ return scores
+
+
+def must_not_be_different_apply( # WIP
+ temp_data: pd.DataFrame,
+ blocking_columns: List[str],
+ must_not_be_different_columns: List[str],
+):
+ """Re-block DataFrame on a second column, where we require non-difference rather than equality
+
+ Parameters
+ ----------
+ temp_data : DataFrame
+ Partially blocked DataFrame
+
+ blocking_columns : List[str]
+ Columns where we check for equality
+
+ must_not_be_different_columns : List[str]
+ Columns where we only check for non-difference
+
+ Returns
+ -------
+ DataFrame
+ Re-blocked DataFrame in which the must-not-be-different columns have been filled, ready for plain equality blocking
+ """
+
+ series_block_id = temp_data.groupby(blocking_columns).ngroup()
+ # Keep only rows (and block ids) whose equality block has at least two members
+ series_block_id = series_block_id[series_block_id.duplicated(keep=False)]
+ temp_data = temp_data.loc[series_block_id.index]
+
+ reconstructed_data = pd.DataFrame(columns=temp_data.columns)
+ for block in series_block_id.unique():
+ # noinspection PyArgumentList
+ current_block = (
+ temp_data[series_block_id == block]
+ .sort_values(must_not_be_different_columns)
+ .copy()
+ )
+ if (
+ len(current_block[current_block[must_not_be_different_columns].notnull()])
+ == 0
+ ): # All nulls
+ random_string = "".join(
+ random.choices("ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", k=10)
+ ) # As long as the string is not already in the column...
+ # There must be a better way to do it...
+ current_block[must_not_be_different_columns] = (
+ current_block[must_not_be_different_columns]
+ .fillna(random_string) # Fill before astype(str), which would turn NaN into the string 'nan'
+ .astype(str)
+ )
+ else:
+ current_block[must_not_be_different_columns] = (
+ current_block[must_not_be_different_columns].astype(str).ffill()
+ )
+ if len(reconstructed_data) == 0:
+ reconstructed_data = current_block
+ else:
+ reconstructed_data = pd.concat([reconstructed_data, current_block])
+
+ return reconstructed_data
+
+
+def block_overlap(groups: Iterable, overlap: int = 1) -> Coords:
+ """Block a DataFrame based on overlap accross columns
+
+ Parameters
+ ----------
+ groups : Iterable
+ Output of a groupby
+
+ overlap : int
+ Minimum number of groups a pair must share to be kept
+
+ Returns
+ -------
+ Coords
+ Pairs obtained by blocking
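+
+ Examples
+ --------
+ With the default overlap of 1 every within-group pair is kept; a higher overlap requires a pair to co-occur in several groups:
+
+ >>> block_overlap([[1, 2]]) == {frozenset({1, 2})}
+ True
+ >>> block_overlap([[1, 2, 3], [2, 3]], overlap=2) == {frozenset({2, 3})}
+ True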
+ """
+ coords = {
+ frozenset(pair) for group_list in groups for pair in combinations(group_list, 2)
+ }
+
+ if overlap > 1:
+ coords = [ # In this specific case, we want to keep duplicates to track the number of occurrences of each pair
+ frozenset(pair)
+ for group_list in groups
+ for pair in combinations(group_list, 2)
+ ]
+ # Filter pairs that fulfill the minimum overlap condition
+ occurrences_dict = Counter(coords)
+ coords = {
+ p for p in occurrences_dict if occurrences_dict[p] >= overlap
+ } # The collection of pairs that fulfill the overlap condition
+
+ return coords
+
+
+def add_motives_to_coords(coords: Coords, explanations: Set[str]) -> CoordsMotives:
+ """Block a DataFrame based on overlap accross columns
+
+ Parameters
+ ----------
+ coords : Coords
+ Coords obtained by blocking
+
+ explanations : Set[str]
+ Set of explanations
+
+ Returns
+ -------
+ CoordsMotives
+ Mapping from each pair to its set of explanations
+
+ Examples
+ --------
+ >>> add_motives_to_coords({
+ ... frozenset({1, 4}),
+ ... frozenset({8, 11}),
+ ... frozenset({2, 5}),
+ ... frozenset({10, 13}),
+ ... frozenset({3, 8}),
+ ... frozenset({3, 11}),
+ ... }, {"Same 'City'"})
+ {
+ frozenset({1, 4}): {"Same 'City'"},
+ frozenset({8, 11}): {"Same 'City'"},
+ frozenset({2, 5}): {"Same 'City'"},
+ frozenset({10, 13}): {"Same 'City'"},
+ frozenset({3, 8}): {"Same 'City'"},
+ frozenset({3, 11}): {"Same 'City'"},
+ }
+ """
+ return {pair: explanations for pair in coords}
diff --git a/tests/test_ms_blocking.py b/tests/test_ms_blocking.py
index cbf8284..d3f9ab2 100644
--- a/tests/test_ms_blocking.py
+++ b/tests/test_ms_blocking.py
@@ -105,7 +105,7 @@ def attribute_city_show_as_pairs_true_id():
@pytest.fixture
def attribute_city_show_as_pairs_true_columns():
- return ["id_l", "Name_l", "id_r", "Name_r", "block"]
+ return ["id_l", "Name_l", "id_r", "Name_r", "_block"]
@pytest.fixture
@@ -183,7 +183,7 @@ def test_merge_blocks(overlap_websites_merge_blocks):
websites_blocker = msb.OverlapBlocker(["websites"])
links = websites_blocker.block(get_users())
actual = msb.add_blocks_to_dataset(get_users(), links, merge_blocks=False)[
- "block"
+ "_block"
].to_list()
assert actual == expected, (
"Blocking on websites should return [0, 0, 0, 1, 1, 2, 2, 2]"
@@ -240,7 +240,7 @@ def test_sort_false(attribute_city_sort_false_blocks):
city_blocker = msb.AttributeEquivalenceBlocker(["City"])
links = city_blocker.block(get_users())
actual = msb.add_blocks_to_dataset(get_users(), links, sort=False)[
- "block"
+ "_block"
].to_list()
assert actual == expected, (
"Blocking on websites and adding blocks with sort=False should return [0, 1, 2, 0, 1, 2, 3, 2, 3]"
@@ -253,7 +253,7 @@ def test_keep_ungrouped_rows_false(attribute_city_keep_ungrouped_rows_false):
city_blocker = msb.AttributeEquivalenceBlocker(["City"])
links = city_blocker.block(get_users())
actual = msb.add_blocks_to_dataset(get_users(), links, keep_ungrouped_rows=True)[
- "block"
+ "_block"
].to_list()
assert actual == expected, (
"Blocking on Name with normalize_strings=False should return [0, 1, 1, 2, 2, 3, 3, 3, 4, 5, 6, 7, 7, 8]"
@@ -274,7 +274,7 @@ def test_motives_when_adding_to_dataframe(attribute_city_motives_true_add):
city_blocker = msb.AttributeEquivalenceBlocker(["City"])
links = city_blocker.block(get_users(), motives=True)
actual = msb.add_blocks_to_dataset(get_users(), links, motives=True)[
- "motive"
+ "_motive"
].to_list()
assert actual == expected
@@ -337,7 +337,7 @@ def test_pipelining_motives(city_age_websites_pipelining_motives):
links = final_blocker.block(get_users(), motives=True)
actual = msb.add_blocks_to_dataset(
get_users(), links, show_as_pairs=True, motives=True, merge_blocks=False
- )["motive"].to_list()
+ )["_motive"].to_list()
assert actual == expected
@@ -508,7 +508,7 @@ def test_no_links_m():
def test_no_links_add_blocks_to_dataframe():
"""Test that add_blocks_to_dataframe gracefully outputs an empty DataFrame when no pairs were found"""
- expected = pd.DataFrame(columns=["id", "Name", "City", "Age", "websites", "block"])
+ expected = pd.DataFrame(columns=["id", "Name", "City", "Age", "websites", "_block"])
expected_show_as_pairs = pd.DataFrame(
columns=[
"id_l",
@@ -521,11 +521,11 @@ def test_no_links_add_blocks_to_dataframe():
"City_r",
"Age_r",
"websites_r",
- "block",
+ "_block",
]
)
expected_motives = pd.DataFrame(
- columns=["id", "Name", "City", "Age", "websites", "motive", "block"]
+ columns=["id", "Name", "City", "Age", "websites", "_motive", "_block"]
)
id_blocker = msb.AttributeEquivalenceBlocker(["id"])
links = id_blocker.block(get_users())