-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreproc2.py
More file actions
118 lines (99 loc) · 3.95 KB
/
preproc2.py
File metadata and controls
118 lines (99 loc) · 3.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import argparse
import os
import re
from glob import glob
import spacy
# fmt: off
# Person names that process_file leaves untouched; any other entity tagged
# as a person is rewritten to "XXX".  Matching is exact and case-sensitive.
PERSONS_WHITELIST = [
    # --- Russia ---
    "Vladimir V. Putin", "Vladimir Putin", "Putin",
    "Sergey V. Lavrov", "Lavrov",
    "Aleksei A. Navalny", "Navalny",
    # --- Ukraine ---
    "Volodymyr Zelensky", "Volodymyr Zelenskyy", "Zelensky",
    # --- United States ---
    "Biden",
    "Antony J. Blinken", "Blinken",
    "Mark A. Milley", "Milley",
    "Kamala Harris", "Harris",
    "Barack Obama", "Obama",
    "Donald J. Trump", "Trump",
    # --- Other leaders / officials ---
    "Xi Jinping", "Xi",
    "Olaf Scholz", "Scholz",
    "Boris Johnson", "Johnson",
    "Emmanuel Macron", "Macron",
    "Jens Stoltenberg", "Stoltenberg",
    # --- Not people at all: terms spaCy mislabels as PERSON ---
    "Stinger", "Javelin", "Brexit", "C.I.A.", "Twitter", "Mykolaiv",
]
# Notable NER:Places that should not be converted to "YYY".
# main() lower-cases these entries before comparing, so matching against
# entity text is case-insensitive.
PLACES_WHITELIST = [
    'U.S.', 'United States', 'Washington',
    'Russia', 'Soviet Union', 'Moscow', 'Crimea', 'Belarus', 'Chechnya',
    'Ukraine', 'Kyiv', 'Kharkiv', 'Lviv', 'Kherson', 'Odessa', 'Mariupol', 'Donetsk', 'Irpin', 'Mykolaiv',
    'China', 'Beijing',
    'Germany', 'Berlin',
    'U.K.', 'Britain', 'London',
    'France', 'Paris',
    'Poland', 'Warsaw',
    'Brussels', 'Netherlands',
    'Lithuania', 'Romania', 'Latvia', 'Estonia', 'Moldova', 'Slovakia',
    # 'China' is already listed above; the duplicate entry was removed here.
    'Canada', 'Israel', 'Syria', 'Afghanistan', 'Iran', 'Iraq', 'North Korea',
    ### Misclassified as PLACE (by Spacy) ###
    'Ukrainian',
]
# fmt: on
def argparser() -> argparse.Namespace:
    """Parse and validate the command-line arguments.

    Two positional arguments are expected, ``input_dir`` and ``output_dir``.
    Each must name an existing directory, and the two must not refer to the
    same directory.

    Returns:
        The parsed namespace, with ``input_dir`` and ``output_dir`` holding
        the paths exactly as given on the command line.

    Raises:
        SystemExit: with status 1 when both arguments point at the same
            directory. argparse itself terminates the program when an
            argument is missing or is not a directory.
    """

    def dir_path(path: str) -> str:
        """argparse ``type`` callback: accept *path* only if it is a directory."""
        if os.path.isdir(path):
            return path
        raise argparse.ArgumentTypeError(f"{path} is not a valid directory path")

    parser = argparse.ArgumentParser(
        description="Preprocess Text Files - Step #2",
    )
    parser.add_argument("input_dir", type=dir_path, help="Input Files Directory")
    parser.add_argument("output_dir", type=dir_path, help="Output Files Directory")
    args = parser.parse_args()

    # Compare the directories themselves rather than their spellings, so that
    # "data", "data/" and "./data" are all recognised as the same place.
    # Both paths are known to exist here (dir_path validated them), which is
    # what os.path.samefile requires.
    if os.path.samefile(args.input_dir, args.output_dir):
        print("<input_dir> and <output_dir> must be different!")
        # Raise SystemExit directly: the bare `exit` name is installed by the
        # `site` module and is not guaranteed to exist in every interpreter.
        raise SystemExit(1)
    return args
def main() -> int:
    """Rewrite every ``*.txt`` file of the input directory with names normalized.

    Each file is run through the spaCy ``en_core_web_md`` pipeline. Person
    entities that are not in ``PERSONS_WHITELIST`` are replaced by ``"XXX"``
    and place entities that are not in ``PLACES_WHITELIST`` by ``"YYY"``.
    The result is written under the same file name in the output directory.
    One ``.`` is printed per finished file as a progress indicator.

    Returns:
        0 after all files have been processed.
    """
    args = argparser()
    files = glob(os.path.join(args.input_dir, "*.txt"))
    nlp = spacy.load("en_core_web_md")

    # Sets give constant-time membership tests. Places are compared
    # lower-cased, persons with their exact spelling.
    persons_whitelist = frozenset(PERSONS_WHITELIST)
    places_whitelist = frozenset(place.lower() for place in PLACES_WHITELIST)

    pa_apostr = re.compile(r"(.+)'s")  # Spacy doesn't strip off "'s" when returning entities
    pa_the = re.compile(r"the (.+)")  # Spacy includes "the" (e.g. "the U.S.") with entities
    # Two or more space-separated "XXX" in a row: left behind when one name
    # is recognised as several entities (e.g. "Ursula von der Leyen").
    pa_xxx_run = re.compile(r"XXX(?: XXX)+")

    def process_file(text: str) -> str:
        """Return *text* with non-whitelisted persons/places replaced."""
        doc = nlp(text)
        # Walk the entities back to front so that replacing one span does not
        # shift the character offsets of the spans still to be handled.
        for ent in reversed(doc.ents):
            # Compare the string label instead of spaCy's internal integer
            # ids (the original code used 380 for persons and 384 for places).
            if ent.label_ == "PERSON":
                ent_text = ent.text
                if m := pa_apostr.match(ent_text):  # e.g. "Biden's"
                    ent_text = m.group(1)
                if ent_text not in persons_whitelist:
                    # Normalize non-significant Persons to "XXX"
                    text = text[: ent.start_char] + "XXX" + text[ent.end_char :]
            elif ent.label_ == "GPE":
                ent_text = ent.text.lower()
                if m := pa_the.match(ent_text):  # e.g. "the Soviet Union"
                    ent_text = m.group(1)
                if ent_text not in places_whitelist:
                    # Normalize non-significant Places to "YYY"
                    text = text[: ent.start_char] + "YYY" + text[ent.end_char :]
        # Collapse a whole run at once; a single str.replace("XXX XXX", "XXX")
        # pass would turn "XXX XXX XXX" into "XXX XXX" rather than "XXX".
        return pa_xxx_run.sub("XXX", text)

    for file in files:
        fname = os.path.basename(file)
        # Explicit encoding: the platform default differs between systems.
        # NOTE(review): assumes the step-1 output is UTF-8 - confirm.
        with open(file, encoding="utf-8") as fin:
            text_in = fin.read()
        text_out = process_file(text_in)
        # Open the output only once the text is ready, so that a failure
        # during processing does not leave an empty output file behind.
        with open(os.path.join(args.output_dir, fname), "w", encoding="utf-8") as fout:
            fout.write(text_out)
        print(".", end="", flush=True)
    print()
    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    # SystemExit is raised directly because the bare `exit` name is added by
    # the `site` module for interactive use and may be absent (e.g. with
    # `python -S`).
    raise SystemExit(main())