-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword-limit.py
More file actions
129 lines (109 loc) · 5.32 KB
/
word-limit.py
File metadata and controls
129 lines (109 loc) · 5.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# word-limit.py: counts how many words a .docx or .pdf file has.
# This can be used on a file for ACM CHI conference or ACM UIST, you can skip automatically over captions or include them, or even see the count with or without references.
# How to use (in your command line):
# >python3 word-limit.py -f -v file.docx
# ...this example counts words in a file, including captions of figures. Excludes references.
# >python3 word-limit.py --help
# ... this will allow you to get started and see the options
# File type? This only supports .pdf or .docx
# No flags are needed, the filetype is detected from extension
from docx import Document
import sys
import re
import argparse
import datetime
import subprocess
# figure is detected from the following template
# "Figure <number>. <text until end of line>"
# So make sure to use "F" and not "f"
# And use the period (.) as well after the <number>.
fig = re.compile(r"^Figure\s\d\.\s.*$")
abstract_seen = False
# limits (as of UIST 2025)
LIMIT_SHORT = 5000
LIMIT_AVERAGE = 8000
LIMIT_LONG = 12000
# words used to delimite sections
REFERENCES = "REFERENCES"
ABSTRACT = "ABSTRACT"
INTRODUCTION = "INTRODUCTION"
parser = argparse.ArgumentParser(description='Word-counter for papers in .docx format.')
parser.add_argument('-f', '--count_figure', action='store_true', help='Counts words inside of Figure captions')
parser.add_argument('-na', '--no_abstract', action='store_true', help='Does not count Authors, Fig 1 (if it comes before abstract), and abstract, i.e., starts at introduction')
parser.add_argument('-refs', '--count_references', action='store_true', help='Counts words in the reference section')
parser.add_argument('-v', '--verbose', action='store_true', help='verbose output indicates where words are used (e.g., Figures, Abstract, References, etc.')
parser.add_argument('-d', '--debug', action='store_true', help='Enables debug output that indicates when elements are skipped and not counted.')
parser.add_argument('-n', '--no_determination', action='store_true', help='Does not check the lenght of your paper against the limits set in the file (e.g., SHORT paper vs. LONG paper')
parser.add_argument('filename')
args = parser.parse_args()
paper = None
class Text: #honestly, kind of a ugly solution
def __init__(self, pdf=None, doc=None):
self.pdf = pdf
self.doc = doc
def paragraphs(self):
if self.pdf is not None:
return self.pdf
else:
return self.doc.paragraphs
def text(self, par):
print(par)
if self.pdf is not None:
return par
else:
return par.text
if args.filename[-4:].lower() == ".pdf":
#print("PDF")
now = datetime.datetime.now()
formatted_date = now.strftime("%Y_%m_%d_%H_%M_%S")
result = subprocess.run(['pdftotext', args.filename, 'out_' + formatted_date + '.txt'], capture_output=True, text=True)
# note that pdftotext was run without layout flag
with open('out_' + formatted_date + '.txt', 'r') as file:
paper = Text(pdf=file.readlines())
elif args.filename[-5:].lower() == ".docx":
#print("DOCX")
paper = Text(doc=Document(args.filename))
# store counts
count_all = []
count_figures = []
count_references = []
REFERENCES_FOUND = False
for par in paper.paragraphs(): #no ()
#print(paper.text(par))
if fig.match(paper.text(par).strip()) is not None: # target text is a figure
if args.count_figure: # and we have figure flag enabled
count_all.append(paper.text(par)) # thus, counted figure
count_figures.append(paper.text(par)) # counted figures for stats
else:
count_figures.append(paper.text(par)) # counted figures for stats
if args.debug:
print("Debug\tFIGURE_SKIP\t" + paper.text(par))
elif paper.text(par).strip() == REFERENCES: #passing over the references
REFERENCES_FOUND = True
continue
elif args.count_references and REFERENCES_FOUND:
count_all.append(par.text) # enabled so counted reference as main
count_references.append(par.text) # count reference for stats
elif not args.count_references and REFERENCES_FOUND:
count_references.append(paper.text(par)) # count reference for stats
if args.debug:
print("Debug\tREFERENCE_SKIP\t" + paper.text(par))
else: # catch all
count_all.append(paper.text(par))
if args.debug:
print(len('\n'.join(count_all).split()))
if args.verbose:
print("Breakdown")
print("\ttotal of words in this run\t" + str(len('\n'.join(count_all).split())))
print("\twords in figure caption text\t" + str(len('\n'.join(count_figures).split())))
print("\twords in reference section\t" + str(len('\n'.join(count_references).split())))
if not args.no_determination:
word_count = len('\n'.join(count_all).split())
if word_count < LIMIT_SHORT:
print("SHORT OK: " + str(word_count) + " is below " + str(LIMIT_SHORT))
elif (word_count > LIMIT_SHORT and word_count < LIMIT_AVERAGE):
print("NOT SHORT, AVERAGE OK: " + str(word_count) + " is below " + str(LIMIT_AVERAGE))
elif (word_count > LIMIT_AVERAGE and word_count < LIMIT_LONG):
print("NOT AVERAGE!, LONG OK: " + str(word_count) + " is below " + str(LIMIT_LONG))
else:
print("VERY LONG!: " + str(word_count) + " is OVER " + str(LIMIT_LONG))