-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathacocr_postprocessor.py
More file actions
74 lines (64 loc) · 2.6 KB
/
acocr_postprocessor.py
File metadata and controls
74 lines (64 loc) · 2.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# -*- coding: utf-8 -*-
# @Time : 2021/8/9 14:08
# @Author : beyoung
# @Email : linbeyoung@stu.pku.edu.cn
# @File : acocr_postprocessor.py
import os
def tag_convert2xml(input_path, output_path='', mod='normal'):
    """Rewrite a tab-separated ``content<TAB>tag`` file as XML-like lines.

    Every input line becomes ``<tag>content</tag>``, where all ``[`` and
    ``]`` characters are removed from the tag. A line that does not split
    into exactly two tab-separated fields is reported by printing
    ``input_path``; its first field is then used as the tag and the content
    is left empty. Output lines are separated by a single newline and no
    trailing newline is written.

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path of the UTF-8 file to write. When empty, the result
            is written next to the input as ``<input without extension>_xml.txt``.
        mod: Unused; kept so existing callers keep working.

    Returns:
        None. The converted text is written to ``output_path``.
    """
    with open(input_path, 'r', encoding='utf-8') as src:
        lines = src.readlines()

    converted = []
    for line in lines:
        # Split once instead of re-splitting for every field access.
        fields = line.replace('\n', '').split('\t')
        if len(fields) == 2:
            content, tag = fields
        else:
            # Malformed line: report which file it came from, keep only a tag.
            print(input_path)
            content, tag = '', fields[0]
        tag_s = tag.replace('[', '').replace(']', '')
        converted.append('<{0}>{1}</{0}>'.format(tag_s, content))

    if output_path == '':
        # splitext handles any extension length (or none), unlike a fixed
        # four-character slice.
        output_path = os.path.splitext(input_path)[0] + '_xml.txt'
    with open(output_path, 'w', encoding='utf-8') as out:
        # join() puts a newline between lines but not after the last one.
        out.write('\n'.join(converted))
    return
def getfiles(root, mod='singel'):
    """Collect ``*_recog_adv.txt`` files from the direct subdirectories of root.

    Only one directory level is searched: files lying directly in ``root``
    and files in deeper nested directories are not returned.

    Args:
        root: Directory whose immediate subdirectories are scanned.
        mod: Unused; kept with its original default so existing callers
            keep working.

    Returns:
        A list of paths built by joining ``root``, the subdirectory name and
        the file name. Entries are ordered by subdirectory name and then by
        file name, so the result does not depend on the arbitrary order in
        which ``os.listdir`` reports entries.
    """
    matches = []
    for entry in sorted(os.listdir(root)):
        subdir = os.path.join(root, entry)
        if not os.path.isdir(subdir):
            continue
        for name in sorted(os.listdir(subdir)):
            if name.endswith('_recog_adv.txt'):
                matches.append(os.path.join(subdir, name))
    return matches
if __name__ == '__main__':
    # Batch driver: convert every recognition result found under src_root
    # and write it to dst_root, mirroring the per-subdirectory layout.
    src_root = r'/Users/Beyoung/Desktop/Projects/AC_OCR/ER007_jpg_res/'
    dst_root = '/Users/Beyoung/Desktop/Projects/AC_OCR/ER007_txt'
    for txt_file in getfiles(src_root):
        # Use os.path helpers rather than splitting on a literal '/', so the
        # path handling follows the platform's separator rules.
        dirname = os.path.basename(os.path.dirname(txt_file))
        filename = os.path.basename(txt_file)
        save_file = filename.replace('_res_recog_adv.txt', '.txt')
        target_dir = os.path.join(dst_root, dirname)
        # makedirs also creates a missing dst_root and tolerates a directory
        # that already exists, avoiding the check-then-create race.
        os.makedirs(target_dir, exist_ok=True)
        tag_convert2xml(txt_file, os.path.join(target_dir, save_file))