@@ -114,6 +114,7 @@ def process(
114114 segment_sentences = False ,
115115 force = True ,
116116 verbose = False ,
117+ flavor = None
117118 ):
118119 batch_size_pdf = self .config ["batch_size" ]
119120 input_files = []
@@ -147,6 +148,7 @@ def process(
147148 segment_sentences ,
148149 force ,
149150 verbose ,
151+ flavor
150152 )
151153 input_files = []
152154
@@ -185,6 +187,7 @@ def process_batch(
185187 segment_sentences ,
186188 force ,
187189 verbose = False ,
190+ flavor = None
188191 ):
189192 if verbose :
190193 print (len (input_files ), "files to process in current batch" )
@@ -203,6 +206,9 @@ def process_batch(
203206 selected_process = self .process_pdf
204207 if service == 'processCitationList' :
205208 selected_process = self .process_txt
209+
210+ if verbose :
211+ print (f"Adding { input_file } to the queue." )
206212
207213 r = executor .submit (
208214 selected_process ,
@@ -214,7 +220,8 @@ def process_batch(
214220 include_raw_citations ,
215221 include_raw_affiliations ,
216222 tei_coordinates ,
217- segment_sentences )
223+ segment_sentences ,
224+ flavor )
218225
219226 results .append (r )
220227
@@ -255,7 +262,8 @@ def process_pdf(
255262 tei_coordinates ,
256263 segment_sentences ,
257264 start = - 1 ,
258- end = - 1
265+ end = - 1 ,
266+ flavor = None
259267 ):
260268 pdf_handle = open (pdf_file , "rb" )
261269 files = {
@@ -285,6 +293,8 @@ def process_pdf(
285293 the_data ["teiCoordinates" ] = self .config ["coordinates" ]
286294 if segment_sentences :
287295 the_data ["segmentSentences" ] = "1"
296+ if flavor :
297+ the_data ["flavor" ] = flavor
288298 if start > 0 :
289299 the_data ["start" ] = str (start )
290300 if end > 0 :
@@ -368,6 +378,7 @@ def process_txt(
368378
369379def main ():
370380 valid_services = [
381+ "processFulltextDocumentBlank" ,
371382 "processFulltextDocument" ,
372383 "processHeaderDocument" ,
373384 "processReferences" ,
@@ -441,11 +452,18 @@ def main():
441452 help = "print information about processed files in the console" ,
442453 )
443454
455+ parser .add_argument (
456+ "--flavor" ,
457+ default = None ,
458+ help = "Define the flavor to be used for the fulltext extraction" ,
459+ )
460+
444461 args = parser .parse_args ()
445462
446463 input_path = args .input
447464 config_path = args .config
448465 output_path = args .output
466+ flavor = args .flavor
449467
450468 if args .n is not None :
451469 try :
@@ -500,6 +518,7 @@ def main():
500518 segment_sentences = segment_sentences ,
501519 force = force ,
502520 verbose = verbose ,
521+ flavor = flavor
503522 )
504523
505524 runtime = round (time .time () - start_time , 3 )
0 commit comments