-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtest_sent_splitting.py
More file actions
37 lines (28 loc) · 842 Bytes
/
test_sent_splitting.py
File metadata and controls
37 lines (28 loc) · 842 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import pandas as pd
from ast import literal_eval
# This script checks for extremely long or short sentences in the CopCo data.
# A sentence with more rows than this is reported as suspiciously long.
LONG_SENTENCE_THRESHOLD = 80
# A sentence with fewer rows than this is reported as suspiciously short.
SHORT_SENTENCE_THRESHOLD = 3


def sentence_counts_per_speech(mapping):
    """Return ``(speech_id, n_sentences)`` pairs, one per ``speechId`` group.

    ``n_sentences`` is the number of distinct ``sentenceId`` values found in
    the rows of that speech. Pairs come back in the order ``groupby`` yields
    the groups.
    """
    return [
        (speech_id, len(speech["sentenceId"].unique()))
        for speech_id, speech in mapping.groupby("speechId")
    ]


def find_outlier_sentences(sentences, is_outlier):
    """Return ``(n_rows, text)`` for every sentence group flagged as an outlier.

    Args:
        sentences: The mapping grouped by ``sentenceId`` (a pandas groupby).
        is_outlier: Callable taking the number of rows in a sentence group
            and returning True when the sentence should be reported.

    Returns:
        A list of ``(n_rows, text)`` tuples in group order, where ``text`` is
        the group's ``word`` cells joined by single spaces.
    """
    flagged = []
    for _, sentence in sentences:
        n_rows = len(sentence)
        if not is_outlier(n_rows):
            continue
        # Cells are stringified first: pandas may hand back non-str values
        # (numbers, NaN for NA-like tokens), and str.join would raise
        # TypeError on them.
        text = " ".join(map(str, sentence["word"]))
        flagged.append((n_rows, text))
    return flagged


def main():
    """Print per-speech sentence counts, then the too-long and too-short sentences.

    Reads ``word2char_IA_mapping.csv`` from the current working directory.
    Each report section lists the flagged sentences as ``n_rows text`` and
    ends with the number of flagged sentences, followed by two blank lines
    between sections.
    """
    word2char_mapping = pd.read_csv(
        "word2char_IA_mapping.csv",
        converters={"characters": literal_eval, "char_IA_ids": literal_eval},
    )

    for speech_id, n_sentences in sentence_counts_per_speech(word2char_mapping):
        print(speech_id, n_sentences)
    print()
    print()

    # Group once and reuse the grouping for both scans.
    sents = word2char_mapping.groupby("sentenceId")

    long_sentences = find_outlier_sentences(
        sents, lambda n_rows: n_rows > LONG_SENTENCE_THRESHOLD
    )
    for n_rows, text in long_sentences:
        print(n_rows, text)
    print(len(long_sentences))
    print()
    print()

    short_sentences = find_outlier_sentences(
        sents, lambda n_rows: n_rows < SHORT_SENTENCE_THRESHOLD
    )
    for n_rows, text in short_sentences:
        print(n_rows, text)
    print(len(short_sentences))


# Guarded so that importing this module does not trigger the CSV read.
if __name__ == "__main__":
    main()