# pdf_table_extractor_tabula.py
# Forked from x4nth055/pythoncode-tutorials.
import tabula
import os
# Extract the tables of a PDF with tabula-py and export them in several
# formats. All paths below are relative to the current working directory.

# Input PDF, defined once so the read and convert steps cannot drift apart.
# To take the path from the command line instead, add `import sys` and
# replace the assignment below with `pdf_path = sys.argv[1]`.
pdf_path = "1710.05006.pdf"

# Read the tables from every page of the PDF.
tables = tabula.read_pdf(pdf_path, pages="all")

# Make sure the output folder exists. `exist_ok=True` turns this into a no-op
# when the folder is already there, avoiding a separate check-then-create
# step that could race with another process creating the same folder.
folder_name = "tables"
os.makedirs(folder_name, exist_ok=True)

# Export each extracted table to its own Excel file: table_1.xlsx,
# table_2.xlsx, ... (numbering starts at 1; the index column is not written).
for i, table in enumerate(tables, start=1):
    table.to_excel(os.path.join(folder_name, f"table_{i}.xlsx"), index=False)

# Convert all tables of the PDF into a single CSV file.
# Supported output formats are "csv", "json" and "tsv".
tabula.convert_into(pdf_path, "output.csv", output_format="csv", pages="all")

# Convert every PDF in a folder into CSV format.
# The `pdfs` folder must already exist in the current directory.
tabula.convert_into_by_batch("pdfs", output_format="csv", pages="all")