-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.py
More file actions
116 lines (103 loc) · 5.02 KB
/
main.py
File metadata and controls
116 lines (103 loc) · 5.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import re
import googletrans
import functions
# Folder that holds the input PDFs. Change this to the location of your own
# PDF folder (usually somewhere inside /Documents).
location_documents = r"/Users/philipp/Projects/PycharmProjects/PDF-scraping/pdf_input"

# Runs before the directory is listed (helper defined in functions.py).
functions.change_name()

# Find all PDFs inside the current working directory. The extension test is
# case-sensitive, so only names ending in exactly ".pdf" are kept.
ls2 = [
    entry
    for entry in os.listdir()
    if os.path.splitext(entry)[1] == ".pdf"
]
# Process every discovered PDF: read the Arabic fields from temp.txt and the
# numeric fields from temp_en.txt, then append one bilingual row to the CSV.
#
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been lost -- confirm it, in particular the position of
# the final os.chdir() call.
# NOTE(review): the field variables (zero1, one, two, ...) are module-level and
# are never reset between PDFs, so a PDF that lacks a marker silently reuses
# the value parsed for the previous PDF (or raises NameError on the first
# one, which is reported as a problem below) -- confirm this is acceptable.
# NOTE(review): both text files are opened with the platform default encoding;
# presumably they are UTF-8 -- confirm against whatever writes them.

# Patterns are loop-invariant, so they are compiled once up front.
patt1 = re.compile(r"\d{2}-\d{2}-\d{4}")
patt2 = re.compile(r"\b\d{12} ")
patt3 = re.compile(r"(\d{2}-\d{2}-\d{4}) (\d{6})")
# NOTE(review): the "." in the E: group is unescaped (unlike the N: group), so
# it matches any character; left unchanged to keep accepting the same input.
patt4 = re.compile(r"N:(\d{2}\.\d*) [_ ]*E:(\d{2}.\d*)")
patt5 = re.compile(r"900\d{4}")

for items in ls2:
    try:
        # check_dir() lives in functions.py; it is used here to decide whether
        # this PDF was already translated and appended to the CSV.
        if not functions.check_dir(items):
            # --- Arabic fields: look for the known marker strings per line ---
            with open("temp.txt", "r") as f:
                for line in f:
                    if "نوع الخدمة" in line:  # roughly "service type"
                        tokens = line.split()
                        seven = tokens[3]
                        three = " ".join(tokens[6:])
                        # The original rebound `line` to this token list, so
                        # none of the multi-word markers below could match this
                        # line any more (and a token equal to the date marker
                        # would have raised AttributeError). Skipping the rest
                        # keeps that outcome without the hidden type change.
                        continue
                    if "اسم الشارع" in line:  # roughly "street name"
                        eight = " ".join(line.split()[2:])
                    if "درجة الأهمية" in line:  # roughly "priority level"
                        nine = line.split()[2]
                    if "اسم المقاول" in line:  # roughly "contractor name"
                        ten = line  # whole line is kept as-is
                    if "تشوه بصري" in line:  # roughly "visual distortion"
                        twelve = " ".join(line.split()[1:])
                    if "الزيارة الميدانية" in line:  # roughly "field visit"
                        eleven = line  # whole line is kept as-is
                    if "ملاحظات المهندس المشرف /" in line:  # roughly "supervising engineer's notes"
                        # Fall back to a fixed default text when nothing
                        # follows the marker.
                        thirteen = (
                            " ".join(line.split()[5:])
                            or "الموقع تابع لكم حسب الاختصاص"
                        )
                    if "المراقب الميداتي" in line:  # marker spelled as in the source text
                        fourteen = " ".join(line.split()[3:])
                    if "التاري" in line:  # prefix of the date marker
                        # Keep the space-separated tokens that precede the
                        # first "التاريخ" token in the text before the first ":".
                        zero = []
                        for token in line.split(":")[0].split(" "):
                            if token == "التاريخ":
                                break
                            zero.append(token)
                        zero1 = " ".join(zero)
            # Fixed text, identical for every row.
            four = "هبوط ترنش خدمات"

            # --- Numeric fields: regular expressions over temp_en.txt ---
            with open("temp_en.txt", "r") as t:
                cont = t.read()

            # `one` takes the FIRST dd-dd-dddd match ...
            first_date = patt1.search(cont)
            if first_date:
                one = first_date.group(0)
            # ... whereas every other field takes the LAST match in the text.
            for item in patt2.finditer(cont):
                two = item.group(0)  # 12 digits plus the trailing space
            for item in patt3.finditer(cont):
                six = item.group(2)  # the 6 digits that follow a date
            for item in patt4.finditer(cont):
                five_a = item.group(2)  # value after "E:"
                five_b = item.group(1)  # value after "N:"
            for item in patt5.finditer(cont):
                fifteen = item.group(0)

            # One CSV row: each Arabic text field is followed by the result of
            # functions.get_trans() for it; numeric fields are written once.
            lst_to_append = [[
                'Content',
                zero1, functions.get_trans(zero1),
                one,
                two,
                three, functions.get_trans(three),
                four, functions.get_trans(four),
                'وصف الموقع', 'site location',
                five_a,
                five_b,
                six,
                seven, functions.get_trans(seven),
                eight, functions.get_trans(eight),
                nine, functions.get_trans(nine),
                ten, functions.get_trans(ten),
                eleven, functions.get_trans(eleven),
                twelve, functions.get_trans(twelve),
                thirteen, functions.get_trans(thirteen),
                fourteen, functions.get_trans(fourteen),
                fifteen,
            ]]
            print(lst_to_append)
            functions.append_csv(lst_to_append)
            functions.append_txt(items)
            # Go back to the PDF folder before the next file is handled.
            os.chdir(location_documents)
    except Exception as exc:
        # Best effort: report the failing PDF and carry on with the next one.
        # Catching Exception (not a bare except) lets Ctrl-C / SystemExit
        # still stop the run, and the cause is shown instead of being hidden.
        print(f"Problem in PDF {items}.")
        print(f"    {type(exc).__name__}: {exc}")