-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpre_processing.py
More file actions
528 lines (414 loc) · 23.7 KB
/
pre_processing.py
File metadata and controls
528 lines (414 loc) · 23.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
import pandas as pd
import re
from collections import Counter
from functools import reduce
import fuzzywuzzy
from fuzzywuzzy import fuzz
import openpyxl
import xlrd
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, regexp_replace, upper, col, when, length, split, regexp_extract, trim, concat_ws, lit
from pyspark.sql import DataFrame
from pyspark.sql.types import StringType, IntegerType, StructType, StructField, ArrayType
from address_functions.config.settings import town_list
####################################################################################
def clean_punctuation(df: DataFrame, input_col="supplied_query_address", create_flag=True):
    """
    Cleans up punctuation in address strings: collapses runs of commas/periods/hyphens/whitespace around
    commas, strips leading and trailing punctuation, and preserves hyphens and periods that sit between
    digits or word characters (e.g. number ranges and block names).

    Parameters:
    - df (DataFrame): The input DataFrame containing address data.
    - input_col (str): The name of the column containing the address strings to be cleaned.
    - create_flag (bool): If True, 'punctuation_cleaned_flag' is (re)computed for every row.
      If False, an existing non-null 'punctuation_cleaned_flag' value is kept and only null values are
      computed; the column must already exist on `df` in that case.

    Returns:
    - df (DataFrame): The DataFrame with the cleaned string in 'final_cleaned_address' and the flag
      'punctuation_cleaned_flag' (0 when the cleaned string equals the input, 1 otherwise).
      The intermediate columns 'cleaned_address', 'address_parts' and 'cleaned_parts' are dropped.

    Examples:
    - Input: "BLOCK A-1-2, 14 - 16, SOMEWHERE ROAD"
      Output: "BLOCK A-1-2, 14-16, SOMEWHERE ROAD"
      (Hyphens between word characters are preserved; spaces around a hyphen between numbers are removed.)
    - Input: ",,,, UNIT 9,, BLOCK-5,, RANDOM ROAD,,"
      Output: "UNIT 9, BLOCK-5, RANDOM ROAD"
      (Repeated, leading and trailing commas are removed.)
    """

    def clean_part(part):
        # Null or empty parts have nothing to clean; returning early also avoids
        # calling string methods on None.
        if not part:
            return part

        # Replace hyphen between room numbers with a comma (e.g., "Room 7 - 1"), account for varying spaces
        part = re.sub(r'(Room\s+\d+)\s*-\s*(\d+)', r'\1, \2', part)
        # Preserve hyphens between numbers, even if there are spaces around the hyphen (e.g., "14 - 16")
        part = re.sub(r'(?<=\d)\s*-\s*(?=\d)', ' TEMP_HYPHEN ', part)
        # Preserve periods (.) between numbers (e.g., "14.16")
        part = re.sub(r'(?<=\d)\s*\.\s*(?=\d)', ' TEMP_DOT ', part)
        # Preserve hyphens in cases like "II-2" or "Gp2-4-B-7"
        part = re.sub(r'(?<=\w)-(?=\w)', ' TEMP_HYPHEN ', part)
        part = re.sub(r'(?<=BLOCK\s\w)-(?=\d)', ' TEMP_HYPHEN ', part)  # Preserve hyphen in block names
        part = re.sub(r'(?<=\w)-(?=\d\w)', ' TEMP_HYPHEN ', part)  # Preserve hyphens like "C-11E"
        # Remove leading hyphens before numbers, except in preserved cases
        part = re.sub(r'-\s*(?=\d)', '', part)
        # Remove punctuation at the beginning and end
        part = re.sub(r"^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$", "", part)
        # Normalise whitespace (this must happen before the placeholders are restored,
        # because the placeholders are matched with exactly one space on each side)
        part = re.sub(r"\s+", " ", part)
        # Restore preserved hyphens and periods
        part = part.replace(' TEMP_HYPHEN ', '-')
        part = part.replace(' TEMP_DOT ', '.')
        return part.strip()

    # Apply cleaning logic to each address part
    @udf(ArrayType(StringType()))
    def clean_parts_udf(parts):
        # A null address produces a null array from split(); pass it through
        # instead of failing the task by iterating over None.
        if parts is None:
            return None
        return [clean_part(part) for part in parts]

    # Step 1: Temporarily preserve hyphens between numbers or in special cases
    df = df.withColumn("cleaned_address", regexp_replace(col(input_col), r'(?<=\d)-(?=\d)', ' TEMP_HYPHEN '))
    df = df.withColumn("cleaned_address", regexp_replace(col("cleaned_address"), r'(?<=\d)\.\s*(?=\d)', ' TEMP_DOT '))
    df = df.withColumn("cleaned_address", regexp_replace(col("cleaned_address"), r'(?<=\w)-(?=\d\w)', ' TEMP_HYPHEN '))

    # Step 2: Handle punctuation marks and spaces, excluding preserved hyphens and periods
    df = df.withColumn("cleaned_address",
                       regexp_replace(col("cleaned_address"),
                                      r"[\s,.-]*\.,[\s,.-]*|[\s,.-]+\,|,\s*[\s,.-]+",
                                      ", "))
    df = df.withColumn("cleaned_address",
                       regexp_replace(col("cleaned_address"),
                                      r",\s*,|(^[\s,.-]+)|([\s,.-]+$)",
                                      ", "))

    # Step 3: Remove leading punctuation or commas at the start of the string (again after transformations)
    df = df.withColumn("cleaned_address", regexp_replace(col("cleaned_address"), r"^[,.\s]+", ""))

    # Step 4: Split the address into parts to handle each part separately
    df = df.withColumn("address_parts", split(col("cleaned_address"), ",\\s*"))

    # Step 5: Clean each part and join back into a single address string
    df = df.withColumn("cleaned_parts", clean_parts_udf(col("address_parts")))
    df = df.withColumn("final_cleaned_address", concat_ws(", ", col("cleaned_parts")))

    # Step 6: Restore preserved hyphens and periods in the final cleaned address
    df = df.withColumn("final_cleaned_address", regexp_replace(col("final_cleaned_address"), ' TEMP_HYPHEN ', '-'))
    df = df.withColumn("final_cleaned_address", regexp_replace(col("final_cleaned_address"), ' TEMP_DOT ', '.'))

    # Step 7: Remove any trailing commas and spaces in the final cleaned address
    df = df.withColumn("final_cleaned_address", regexp_replace(col("final_cleaned_address"), r",\s*$", ""))

    # Step 8: Create a flag indicating whether punctuation was cleaned
    if create_flag:
        df = df.withColumn("punctuation_cleaned_flag",
                           when(col(input_col) == col("final_cleaned_address"), 0).otherwise(1))
    else:
        # Keep any flag value that is already set; only fill in rows where it is null.
        df = df.withColumn("punctuation_cleaned_flag",
                           when(col("punctuation_cleaned_flag").isNotNull(), col("punctuation_cleaned_flag"))
                           .otherwise(when(col(input_col) == col("final_cleaned_address"), 0).otherwise(1)))

    # Drop intermediate columns
    df = df.drop("cleaned_address", "address_parts", "cleaned_parts")
    return df
##############################################################################
def remove_noise_words_with_flag(df, input_col="final_cleaned_address"):
    """
    Removes noise words from the input address column and flags any rows where noise words were removed.

    A noise word is a standalone token made of the same uppercase letter repeated four or more times
    (e.g., "AAAA"). Three-letter repeats such as "III" are deliberately left alone by the pattern.

    Parameters:
    - df (DataFrame): The input DataFrame containing address data.
    - input_col (str): The name of the column containing the address strings to be cleaned
      (default is "final_cleaned_address"). The column is overwritten in place.

    Returns:
    - df (DataFrame): The updated DataFrame with noise words removed and a flag ('noise_removed_flag')
      set to 1 for rows where a noise word was removed and 0 otherwise.

    Example:
    - Input: "123 MAIN ROAD, AAAA, LONDON"
    - Output: "123 MAIN ROAD, LONDON"
      (The noise word "AAAA" is removed, the empty segment it leaves behind is collapsed,
      and the 'noise_removed_flag' is set to 1 for this row.)
    """
    # One letter followed by at least three repeats of itself => four or more identical letters.
    noise_pattern = r"\b([A-Z])\1{3,}\b"

    original = col(input_col)
    stripped = regexp_replace(original, noise_pattern, "")
    noise_removed = stripped != original

    # Removing a token leaves debris such as ", ," or double spaces; tidy that up.
    tidied = regexp_replace(stripped, r"\s*,(?:\s*,)+\s*", ", ")  # collapse empty comma segments
    tidied = regexp_replace(tidied, r"^[\s,]+|[\s,]+$", "")        # trim leading/trailing separators
    tidied = regexp_replace(tidied, r"\s{2,}", " ")                # collapse repeated whitespace

    # The flag is derived from the noise removal alone, and is written before the address column is
    # overwritten so that both expressions still see the original value.
    df = df.withColumn("noise_removed_flag", when(noise_removed, 1).otherwise(0))

    # Only rows that actually lost a noise word are tidied; all other rows are left exactly as they
    # were. Working with column expressions (rather than a temporary "cleaned_address" column)
    # avoids clobbering a caller's column of the same name.
    df = df.withColumn(input_col, when(noise_removed, tidied).otherwise(col(input_col)))
    return df
##############################################################################
def get_process_and_deduplicate_address_udf(column_name="final_cleaned_address"):
    """
    Builds a UDF that deduplicates the comma-separated parts of an address.

    The UDF compares each part with the part that follows it and, if the two are near-identical
    (fuzzy ratio at or above a threshold and lengths differing by fewer than 3 characters), keeps only
    one of them. Afterwards any part that exactly repeats an earlier part is dropped as well.

    Parameters:
    - column_name (str): Not used by the implementation; retained so existing callers keep working.

    Returns:
    - UDF: A User-Defined Function (UDF) that takes an address string and returns a struct with:
        - cleaned_address: The address with deduplicated parts, re-joined with ", ".
        - words_deduplicated_flag: 1 if any part was removed, 0 otherwise.
      A null address yields (null, 0).

    Example:
    - Input: "123 MAIN ROAD, 123 MAIN ROAD, LONDON"
    - Output: ("123 MAIN ROAD, LONDON", 1)
      (The repeated part is removed, and the 'words_deduplicated_flag' is set to 1.)
    """

    def process_and_deduplicate_address(address, threshold=95):
        """
        Deduplicates parts of the address by comparing consecutive parts with fuzz.ratio.

        Parameters:
        - address (str): The address string to process.
        - threshold (int): The similarity threshold (default is 95). Consecutive parts whose ratio is
          greater than or equal to this value are treated as duplicates.

        Returns:
        - tuple: (cleaned_address, words_deduplicated_flag).
        """
        # Spark passes SQL NULL as None; there is nothing to split in that case.
        if address is None:
            return (None, 0)

        def contains_numbers(s):
            # Check if the string contains any numbers
            return bool(re.search(r'\d', s))

        parts = [part.strip() for part in address.split(',')]
        processed = []
        changes_made = False

        # Walk the parts pairwise; when a pair is merged the index advances by two so the
        # second member of the pair is not compared again.
        index = 0
        while index < len(parts):
            current_part = parts[index]
            next_part = parts[index + 1] if index + 1 < len(parts) else None

            is_near_duplicate = (
                next_part is not None
                and fuzz.ratio(current_part, next_part) >= threshold
                and abs(len(current_part) - len(next_part)) < 3
            )
            if is_near_duplicate:
                # Prefer the current part unless only the next part carries a number.
                if contains_numbers(current_part) or not contains_numbers(next_part):
                    processed.append(current_part)
                else:
                    processed.append(next_part)
                changes_made = True
                index += 2
            else:
                processed.append(current_part)
                index += 1

        # Second pass: drop exact repeats anywhere in the address, keeping the first occurrence.
        seen = set()
        final_parts = []
        for part in processed:
            if part in seen:
                changes_made = True
                continue
            seen.add(part)
            final_parts.append(part)

        # Flag is set to 1 if any changes were made, 0 otherwise
        return (', '.join(final_parts), 1 if changes_made else 0)

    # Return a UDF to process and deduplicate addresses
    return udf(process_and_deduplicate_address, StructType([
        StructField("cleaned_address", StringType(), True),
        StructField("words_deduplicated_flag", IntegerType(), True)
    ]))
###################################################################################
def deduplicate_postcodes_udf():
    """
    Builds a UDF that leaves a single, consistently formatted UK postcode at the end of an address.

    The UDF finds every substring matching the UK postcode pattern, removes all of them from the
    address (whatever their spacing or letter case), and appends the last one found, upper-cased and
    with a single space before its final three characters.

    Returns:
    - UDF: takes an address string and returns a struct with:
        - final_cleaned_address: the rewritten address (unchanged when no postcode is found).
        - changes_flag: 1 if the returned string differs from the input, 0 otherwise.
      A null address yields (null, 0).

    Example:
    - Input: "AB1 2CD, 1 ROAD, ab12cd"
    - Output: ("1 ROAD, AB1 2CD", 1)
    """
    import re
    from pyspark.sql.functions import udf
    from pyspark.sql.types import StructType, StructField, StringType, IntegerType

    # UK postcode regex pattern (with or without space)
    postcode_regex = r"([Gg][Ii][Rr] 0[Aa]{2})|((([A-Za-z]\d{1,2})|(([A-Za-z][A-Ha-hJ-Yj-y]\d{1,2})|(([A-Za-z]\d[A-Za-z])|([A-Za-z][A-Ha-hJ-Yj-y]\d[A-Za-z]?))))\s?\d[A-Za-z]{2})"

    def normalise_postcode(postcode):
        """Normalize postcode by removing spaces and dashes."""
        return re.sub(r'[\s-]', '', postcode.upper())

    def ensure_postcode_format(postcode):
        """Ensure the postcode has the correct format (with a space)."""
        postcode = normalise_postcode(postcode)
        if len(postcode) > 3:
            return postcode[:-3] + " " + postcode[-3:]
        return postcode

    def extract_postcodes(address):
        """Extract all valid UK postcodes from an address string."""
        # findall returns one tuple of groups per match; the first non-empty group is the
        # full postcode (group 1 for "GIR 0AA", group 2 for every other form).
        return [next(group for group in groups if group)
                for groups in re.findall(postcode_regex, address)]

    def occurrence_pattern(formatted_postcode):
        """Build a pattern matching a postcode in any letter case, with or without inner whitespace."""
        compact = normalise_postcode(formatted_postcode)
        outward, inward = compact[:-3], compact[-3:]
        return re.compile(re.escape(outward) + r"\s*" + re.escape(inward), re.IGNORECASE)

    def remove_all_postcodes(address, formatted_postcodes):
        """Remove every occurrence of the given postcodes, however they were written."""
        # De-duplicate while keeping a deterministic order, and remove longer postcodes first so a
        # shorter one cannot bite a piece out of a longer one.
        unique_postcodes = sorted(dict.fromkeys(formatted_postcodes), key=len, reverse=True)
        for postcode in unique_postcodes:
            address = occurrence_pattern(postcode).sub('', address)
        return address

    def tidy_separators(address):
        """Collapse whitespace and empty comma segments left behind by the removals."""
        address = re.sub(r'\s+', ' ', address)
        address = re.sub(r'\s*,(?:\s*,)*\s*', ', ', address)
        return address.strip(', ')

    def deduplicate_postcodes(address):
        """Main function to deduplicate postcodes in an address."""
        # Spark passes SQL NULL as None; re.findall would raise on it.
        if address is None:
            return None, 0

        postcodes = extract_postcodes(address)
        if not postcodes:
            return address, 0  # No valid postcodes found

        formatted_postcodes = [ensure_postcode_format(pc) for pc in postcodes]
        last_postcode = formatted_postcodes[-1]

        # Remove every spelling of every postcode, then re-attach the last one at the end. Matching
        # on the normalised form (not just the spaced, upper-cased form) is what prevents an unspaced
        # or lower-case postcode from being left in place and duplicated.
        remainder = tidy_separators(remove_all_postcodes(address, formatted_postcodes))
        new_address = f"{remainder}, {last_postcode}" if remainder else last_postcode

        changes_flag = 1 if new_address != address else 0
        return new_address, changes_flag

    return udf(deduplicate_postcodes, StructType([
        StructField("final_cleaned_address", StringType()),
        StructField("changes_flag", IntegerType())
    ]))
###################################################################################
def map_and_check_postcode(address=None):
    """
    Builds a UDF that repairs mistyped UK postcodes inside an address string by swapping letters
    that are commonly confused with digits (e.g., 'I' for '1') until the part is a valid postcode.

    The UDF performs the following steps:
    1. Splits the address on commas and strips each part.
    2. If any part is already a valid UK postcode (full match against the postcode regex),
       nothing is corrected.
    3. Otherwise, every part that could plausibly be a postcode (5-8 characters, containing at least
       one letter and one digit) is searched for the smallest set of letter-to-digit substitutions
       that turns it into a valid postcode. The substitution table includes, among others:
       - 'I' -> '1'
       - 'O' -> '0'
       - 'S' -> '5'
       - 'Z' -> '2'
    4. A corrected part is replaced by the upper-cased postcode with a single space before its last
       three characters.
    5. The parts are re-joined with ", ".

    NOTE(review): choosing the smallest substitution set is a heuristic; when several corrections are
    possible the first one found (fewest substitutions, leftmost positions first) is used.

    Parameters:
    ----------
    address : unused
        Ignored. Retained (now optional) so existing calls that pass an argument keep working.

    Returns:
    --------
    dict
        {"map_and_check_postcode_udf": <PySpark UDF>}.

        UDF Output Schema:
        - final_cleaned_address (StringType): The address with any corrected postcode.
        - changes_flag (IntegerType): 1 if a postcode was corrected, 0 otherwise.
        A null address yields (null, 0).

    Example:
    -------
    Input: "123 MAIN STREET, LNI 2QB, LONDON"
    Output: "123 MAIN STREET, LN1 2QB, LONDON", 1  # 'I' corrected to '1' in the postcode

    Input: "FLAT 4, 1-3 FAKE STREET, AB1 2CD, CITY"
    Output: "FLAT 4, 1-3 FAKE STREET, AB1 2CD, CITY", 0  # Postcode already valid, no changes
    """
    import re
    from itertools import combinations
    from pyspark.sql.functions import udf
    from pyspark.sql.types import StructType, StructField, StringType, IntegerType

    # The preferred postcode regex
    postcode_regex = r"([Gg][Ii][Rr] 0[Aa]{2})|((([A-Za-z]\d{1,2})|(([A-Za-z][A-Ha-hJ-Yj-y]\d{1,2})|(([A-Za-z]\d[A-Za-z])|([A-Za-z][A-Ha-hJ-Yj-y]\d[A-Za-z]?))))\s?\d[A-Za-z]{2})"

    # Character mapping for commonly misinterpreted characters
    char_map = {
        'I': '1', 'O': '0', 'S': '5', 'Z': '2',
        'B': '8', 'D': '0', 'G': '6', 'J': '1',
        'A': '4', 'E': '3', 'H': '4', 'L': '1',
        'U': '0', 'Y': '4', 'C': '0', 'K': '1', 'M': '1'
    }

    # Shortest ("A9 9AA" without the space) and longest ("AA9A 9AA") strings the regex can match.
    min_postcode_length = 5
    max_postcode_length = 8

    def normalise_postcode(postcode):
        return postcode.replace(" ", "").replace("-", "").upper()

    def format_postcode(postcode):
        postcode = normalise_postcode(postcode)
        if len(postcode) > 3:
            return postcode[:-3] + " " + postcode[-3:]
        return postcode

    def is_valid_postcode(postcode):
        # Use the preferred postcode regex
        return re.fullmatch(postcode_regex, postcode) is not None

    def looks_like_postcode(part):
        # A candidate must be postcode-sized and mix letters with digits; this keeps ordinary
        # words and street names out of the correction step.
        if not min_postcode_length <= len(part) <= max_postcode_length:
            return False
        return any(ch.isdigit() for ch in part) and any(ch.isalpha() for ch in part)

    def correct_postcode(part):
        """Return the valid postcode reachable with the fewest substitutions, or None."""
        # Mapping every letter at once (as a blanket translation would) also rewrites the letters a
        # postcode must keep, so substitutions are tried position by position instead.
        swappable = [i for i, ch in enumerate(part) if ch.upper() in char_map]
        for size in range(1, len(swappable) + 1):
            for positions in combinations(swappable, size):
                chars = list(part)
                for i in positions:
                    chars[i] = char_map[chars[i].upper()]
                candidate = ''.join(chars)
                if is_valid_postcode(candidate):
                    return candidate
        return None

    def map_and_check_postcode(address):
        # Spark passes SQL NULL as None; there is nothing to split in that case.
        if address is None:
            return (None, 0)

        parts = [part.strip() for part in address.split(',')]
        changes_flag = 0

        # First, check if a valid UK postcode already exists in the string
        if any(is_valid_postcode(part) for part in parts):
            return (', '.join(parts), changes_flag)

        # No valid postcode was found: try to repair parts that look like postcodes
        for i, part in enumerate(parts):
            if not looks_like_postcode(part):
                continue
            corrected = correct_postcode(part)
            if corrected is not None:
                parts[i] = format_postcode(corrected)
                changes_flag = 1  # Set the flag if a valid postcode is corrected

        return (', '.join(parts), changes_flag)

    map_and_check_postcode_udf = udf(map_and_check_postcode, StructType([
        StructField("final_cleaned_address", StringType()),
        StructField("changes_flag", IntegerType())
    ]))
    return {
        "map_and_check_postcode_udf": map_and_check_postcode_udf
    }
#############################################################################################
def standardise_street_types(df, address_col="final_cleaned_address"):
    """
    Standardises street type abbreviations and common misspellings within an address column and
    adds a flag indicating whether anything was changed.

    The following regex-based replacements are applied, in this order:
        - 'STR' or 'STRT' -> 'STREET'
        - 'ST' -> 'STREET'
        - 'RD' or 'RAOD' -> 'ROAD'
        - 'AVE', 'AVE.' or 'AVENEU' -> 'AVENUE'
        - 'CRT', 'CRT.', or 'CT' -> 'COURT'
        - 'CRESENT' or 'CRSNT' -> 'CRESCENT'
        - 'DRV' or 'DR' -> 'DRIVE'
        - 'GRDN' or 'GDN' -> 'GARDEN' (and 'GRDNS' / 'GDNS' -> 'GARDENS')
        - 'PK' -> 'PARK'
        - 'CL' -> 'CLOSE'

    NOTE(review): 'ST' and 'DR' are expanded wherever they appear as whole words, so names such as
    "ST JOHNS" or "DR SMITH" are rewritten too; confirm this is acceptable for the data.

    Parameters:
    ----------
    df : pyspark.sql.DataFrame
        The input DataFrame containing address data.
    address_col : str, optional
        The name of the address column to apply standardisation to. Default is "final_cleaned_address".
        The column is overwritten in place.

    Returns:
    --------
    pyspark.sql.DataFrame
        A DataFrame with the following additions:
        - The standardised address column.
        - 'street_type_standardised_flag' (IntegerType): 1 if the standardised value differs from the
          original value, 0 otherwise.

    Example:
    -------
    Input: "123 MAIN ST, LONDON"
    Output: "123 MAIN STREET, LONDON", 1  # 'ST' expanded to 'STREET'

    Input: "456 PARK AVE., CITY"
    Output: "456 PARK AVENUE, CITY", 1  # 'AVE.' corrected to 'AVENUE'

    Input: "789 GARDEN CRT, TOWN"
    Output: "789 GARDEN COURT, TOWN", 1  # 'CRT' corrected to 'COURT'

    Input: "321 DRIVE LANE"
    Output: "321 DRIVE LANE", 0  # No changes, street type already standardised
    """
    # (pattern, replacement) pairs, applied in order.
    street_type_rules = [
        (r'\bSTR\b|\bSTRT\b', 'STREET'),
        (r'\bST\b(?!REET\b)', 'STREET'),
        (r'\bRD\b|RAOD', 'ROAD'),
        # The optional trailing period is consumed so "AVE." does not become "AVENUE.".
        (r'\bAVE\b\.?|\bAVENEU\b', 'AVENUE'),
        (r'\bCRT\b\.?|\bCT\b', 'COURT'),
        (r'\bCRESENT\b|\bCRSNT\b', 'CRESCENT'),
        (r'\bDRV\b|\bDR\b', 'DRIVE'),
        # The lookahead leaves a plural "S" in place, so "GDNS" becomes "GARDENS".
        (r'\b(?:GRDN|GDN)(?=S?\b)', 'GARDEN'),
        (r'\bPK\b', 'PARK'),
        (r'\bCL\b', 'CLOSE'),
    ]

    # Build the standardised value as one expression over the ORIGINAL column. A Column object is a
    # lazy reference, not a snapshot, so the flag has to be computed before the address column is
    # overwritten; otherwise the comparison would be the new value against itself.
    standardised = col(address_col)
    for pattern, replacement in street_type_rules:
        standardised = regexp_replace(standardised, pattern, replacement)

    # Add a flag to indicate if any standardisation has occurred
    df = df.withColumn(
        'street_type_standardised_flag',
        when(standardised != col(address_col), lit(1)).otherwise(lit(0))
    )
    # Only now replace the address with its standardised form.
    df = df.withColumn(address_col, standardised)
    return df