-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_data.py
More file actions
32 lines (27 loc) · 930 Bytes
/
clean_data.py
File metadata and controls
32 lines (27 loc) · 930 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import json
import re
def reformat_title(title):
    """Normalise a title into space-separated tokens on a single line.

    Every character that is neither a word character nor whitespace is
    padded with spaces so that it becomes a token of its own. The text is
    then split on whitespace, and each token for which ``str.isnumeric()``
    is true is replaced by the placeholder ``_num_``.

    Returns:
        The tokens, each followed by a single space, terminated by a
        newline. An empty or all-whitespace title yields just ``"\\n"``.
    """
    # Isolate punctuation so that split() yields it as separate tokens.
    spaced = re.sub(r'([^\w\s])', r' \1 ', title)
    tokens = ("_num_" if tok.isnumeric() else tok for tok in spaced.split())
    # Each token keeps its trailing space, including the last one.
    return "".join(tok + " " for tok in tokens) + "\n"
def main():
    """Extract titles from ``test.json`` into ``titles.txt``.

    ``test.json`` is read line by line and each line is parsed as its own
    JSON document. Every raw line is echoed to stdout. For each parsed JSON
    object that has a string ``title`` entry, the output of
    ``reformat_title`` is written to ``titles.txt``.

    Lines that are not valid JSON, that parse to something other than a
    JSON object, or whose ``title`` is not a string are skipped.
    """
    # Both files are managed by one ``with`` so the output file is flushed
    # and closed even if processing fails part-way. The output is opened
    # first, matching the original order of side effects.
    with open("titles.txt", "w") as titles_file, open('test.json', 'r') as file:
        for line in file:
            print(line)
            # Keep the try body minimal: only parsing can raise here.
            try:
                data = json.loads(line)  # Parse the JSON object in each line
            except json.JSONDecodeError:
                continue  # Skip lines that are not valid JSON
            # A line such as `"title"` or `[1, 2]` is valid JSON but not an
            # object; indexing it by key would raise TypeError.
            if not isinstance(data, dict):
                continue
            title = data.get('title')
            # reformat_title runs a regex over the title, so only strings
            # are accepted (a null or numeric title is skipped).
            if isinstance(title, str):
                titles_file.write(reformat_title(title))
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()