-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscore_extract.py
More file actions
37 lines (25 loc) · 1.07 KB
/
score_extract.py
File metadata and controls
37 lines (25 loc) · 1.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# The script reads the top-N (if specified) or all the line numbers from the score file
# and then goes through the actual corpus (monolingual or bilingual), extracting those lines to the output file
__author__ = 'Angelos Constantinides'
import sys
import argparse
def main():
    """Copy the corpus lines whose numbers are listed in a score file.

    Command-line arguments (parsed from ``sys.argv`` with argparse):
        score:   text file with one entry per line; the first tab-separated
                 field of each entry is a 1-based line number of the corpus.
        corpus:  text file the lines are extracted from.
        output:  text file the selected lines are written to (overwritten).
        -n/--top N (optional): use only the first N non-blank entries of the
                 score file; when omitted, every entry is used.

    Selected lines are written in corpus order, each at most once, because
    the line numbers are collected into a set and the corpus is scanned in a
    single pass. Line numbers that do not exist in the corpus are ignored.

    Raises:
        ValueError: if the first field of a non-blank score line is not an
            integer.
        SystemExit: raised by argparse on invalid command-line arguments,
            including a negative ``--top`` value.
    """
    # Imported locally so this function does not need a new file-level import.
    from itertools import islice

    parser = argparse.ArgumentParser()
    parser.add_argument("score", help="The name of the file containing the score of each sentence")
    parser.add_argument("corpus", help="The name of the file that contains the corpus")
    parser.add_argument("output", help="The name of the file that the extracted sentences will be written to")
    parser.add_argument("-n", "--top", type=int, default=None,
                        help="Only use the first N entries of the score file (default: use all of them)")
    args = parser.parse_args()
    if args.top is not None and args.top < 0:
        parser.error("--top must be a non-negative integer")

    # Read Scores
    with open(args.score, 'r') as file_score:
        # Blank lines (e.g. a trailing empty line) carry no line number and
        # would make int() fail, so they are skipped and do not count toward N.
        entries = (line for line in file_score if line.strip())
        # islice(..., None) yields every entry, so the default keeps all of them.
        lines_set = {int(line.split('\t')[0]) for line in islice(entries, args.top)}

    # Number of requested lines not yet found; lets the scan stop early.
    remaining = len(lines_set)
    with open(args.corpus, 'r') as file_in, open(args.output, 'w') as file_out:
        for i, line in enumerate(file_in, start=1):
            if remaining == 0:
                # Everything requested has been written; no need to read on.
                break
            if i in lines_set:
                file_out.write(line)
                remaining -= 1
# Run the extraction only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()