-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
81 lines (69 loc) · 2.17 KB
/
main.py
File metadata and controls
81 lines (69 loc) · 2.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from pathlib import Path
import json
from scripts import extractor
from scripts.parse import parse_char_prop, parse_lee_kesler_tables, parse_antoine_table, parse_cp_tables
def _write_json(data, out_path: str) -> None:
    """Write *data* to *out_path* as indented JSON, creating the parent directory first."""
    # exist_ok=True makes this a no-op when the directory is already there,
    # so every output gets the same guarantee without special-casing.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)


def main() -> None:
    """Split the source PDF into per-table PDFs, parse each one, and dump the results as JSON.

    Steps, in order:
      1. Extract the appendix page range from the source PDF into its own file.
      2. Extract four page ranges from that appendix file (properties, Antoine,
         Lee-Kesler, heat capacity) into separate PDFs.
      3. Parse each of those PDFs with the matching ``parse_*`` function and
         write the result to a JSON file under ``data/``, printing a one-line
         summary per output.

    All paths are relative to the current working directory.
    """
    pdf_source = "data/text.pdf"
    appendix_pdf = "data/appendix_BCD.pdf"
    b1_pdf = "data/b1_char_prop.pdf"
    b2_pdf = "data/b2_antoine.pdf"
    lee_kesler_pdf = "data/lee_kesler.pdf"
    cp_pdf = "data/cp.pdf"
    json_out_lee_kesler = "data/lee_kesler.json"
    json_out_b1 = "data/b1_char_prop.json"
    json_out_b2 = "data/b2_antoine.json"
    json_out_cp = "data/cp.json"

    # NOTE(review): page numbering base and end-inclusiveness are defined by
    # extractor.extract_pages, which is not visible here -- confirm before
    # changing any of the ranges below.
    extractor.extract_pages(
        input_pdf=pdf_source,
        start=682,
        end=711,
        out=appendix_pdf,
    )

    # (start, end, out) page ranges taken from the appendix PDF produced above.
    appendix_sections = [
        (1, 3, b1_pdf),            # properties
        (4, 5, b2_pdf),            # antoine
        (14, 30, lee_kesler_pdf),  # lee-kesler
        (7, 9, cp_pdf),            # heat capacity
    ]
    for start, end, out in appendix_sections:
        extractor.extract_pages(
            input_pdf=appendix_pdf,
            start=start,
            end=end,
            out=out,
        )

    data_properties = parse_char_prop(b1_pdf)
    _write_json(data_properties, json_out_b1)
    print(f"Extracted {len(data_properties)} substances -> {json_out_b1}")

    data_antoine = parse_antoine_table(b2_pdf)
    _write_json(data_antoine, json_out_b2)
    print(f"Extracted {len(data_antoine)} antoine entries -> {json_out_b2}")

    data_lee = parse_lee_kesler_tables(lee_kesler_pdf)
    _write_json(data_lee, json_out_lee_kesler)
    print(f"Extracted {len(data_lee)} tables -> {json_out_lee_kesler}")

    data_cp = parse_cp_tables(cp_pdf)
    _write_json(data_cp, json_out_cp)
    # Unlike the other summaries, this one reports no count.
    print(f"Extracted CP tables -> {json_out_cp}")
# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()