From 2298612d1137de4b42452601d758351be5bba6be Mon Sep 17 00:00:00 2001 From: Norman Gilmore Date: Mon, 18 Sep 2017 12:01:56 -0700 Subject: [PATCH 1/4] Fix parse_document to parse file from command line filename arg. --- data/parse_document.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/parse_document.py b/data/parse_document.py index 0d78bb1..c568297 100644 --- a/data/parse_document.py +++ b/data/parse_document.py @@ -412,13 +412,13 @@ def parse_documents(directory_path, error_directory_paths): } outfile = os.path.join(OUTPUT_FOLDER, "articles.json") + data = [] if len(sys.argv) > 1: - data = [] file_path = sys.argv[1] file_name, ext = os.path.splitext(os.path.basename(file_path)) outfile = os.path.join(OUTPUT_FOLDER, os.path.basename(file_name) + ".json") try: - data.append(parse_document(file_path)) + data.append(parse_document(os.path.abspath(file_path), os.path.basename(file_path))) except ArticleParseError as e: print e else: From ff074def40d87e82389d7819b00318f8477d217f Mon Sep 17 00:00:00 2001 From: Steven Elleman Date: Wed, 11 Oct 2017 18:24:24 -0700 Subject: [PATCH 2/4] Deciding Force article parser modified to handle highlights and to throw errors for improperly formatted highlights --- data/parse_document.py | 60 ++++++++++++++++++++++++++++++++++++++---- data/pybossa_api.py | 3 +++ 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/data/parse_document.py b/data/parse_document.py index c568297..a5df747 100644 --- a/data/parse_document.py +++ b/data/parse_document.py @@ -97,11 +97,60 @@ def parse_article(raw_text, filename): raise ArticleParseError("Only found Useless tuas!", ArticleParseError.DUPLICATE_ERROR) + #print(clean_text) # Warning: brackets left over are usually bad news. - if '[' in clean_text or ']' in clean_text: - print "Unparsed brackets left in article:", article_number -# raise ArticleParseError("Brackets remain in clean text!", -# ArticleParseError.BRACKET_WARNING) + + #Trace highlights + index = 0 + highlight_open = False + highlights = [] + + while index < len(clean_text): + + #Assume that highlights cannot have internal square brackets... throw error. + if (clean_text[index] == "[" and highlight_open): + raise Exception("Extra [ in " + filename + " article number " + article_number + "\n") + + #Assume that right square bracket with its corresponding left square bracket is an error. + elif (clean_text[index] == "]" and not highlight_open): + raise Exception("Extra ] in " + filename + " article number " + article_number + "\n") + + #Start of highlight. + elif clean_text[index] == "[": + highlight_open = True + start = index + + #Close of highlight... Remove highlight from text. + elif clean_text[index] == "]" and highlight_open: + highlight = {'start': start, 'end': index, 'text': clean_text[start:index+1]} + clean_text = clean_text[0:start] + clean_text[index+1:] + index -= index+1 - start + highlights.append(highlight) + highlight_open = False + index += 1 + + #Find highlight offsets with new clean doc + offsets = [] + for highlight in highlights: + text = highlight['text'][1:-1].strip() + text_length = len(text) + + start_index = clean_text.find(text) + if start_index == -1: + raise Exception("Highlight '" + text + "' not recognized in " + filename + "\n") + end_index = start_index + text_length + offsets.append([start_index, end_index]) + + + #1. make error type in pybossa_api.py + #2. try/catch in load_data.py (where parse_document is used) + + #Can I delete below the comments below? + #if '[' in clean_text or ']' in clean_text: + # print "Unparsed brackets left in article:", article_number + # raise ArticleParseError("Brackets remain in clean text!", + # ArticleParseError.BRACKET_WARNING) + # # print out our data. # TODO: store this somewhere. @@ -115,6 +164,7 @@ def parse_article(raw_text, filename): 'periodical': periodical, 'periodical_code': periodical_code, 'filename': filename, + 'highlight_offsets': offsets, } return { 'metadata': metadata, @@ -396,7 +446,7 @@ def parse_documents(directory_path, error_directory_paths): if __name__ == '__main__': DATA_FOLDER = os.path.dirname(os.path.abspath(__file__)) ARTICLE_FOLDER = os.path.join(DATA_FOLDER, "sample/articles") - OUTPUT_FOLDER = os.path.join(DATA_FOLDER, "DocumentsParsed") + OUTPUT_FOLDER = os.path.join(DATA_FOLDER, "DocumentsParsed/all_articles") FILENAME_ERROR_FOLDER = os.path.join(DATA_FOLDER, "DocumentErrors/filename") HEADER_ERROR_FOLDER = os.path.join(DATA_FOLDER, "DocumentErrors/header") TEXT_ERROR_FOLDER = os.path.join(DATA_FOLDER, "DocumentErrors/text") diff --git a/data/pybossa_api.py b/data/pybossa_api.py index 66b8649..ab07e3b 100644 --- a/data/pybossa_api.py +++ b/data/pybossa_api.py @@ -36,6 +36,9 @@ class ImproperConfigForRemote(Exception): class InvalidTaskRun(Exception): pass +class DecidingForceParserError(Exception): + pass + @django_rq.job('task_exporter', timeout=60, result_ttl=24*3600) def create_or_update_remote_project_worker(project_id, debug_presenter=False, From 75b35f0ace3c5bd3949f1adceca7df56d94906c6 Mon Sep 17 00:00:00 2001 From: Steven Elleman Date: Wed, 11 Oct 2017 18:26:44 -0700 Subject: [PATCH 3/4] Clean up --- data/parse_document.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data/parse_document.py b/data/parse_document.py index a5df747..beec267 100644 --- a/data/parse_document.py +++ b/data/parse_document.py @@ -446,7 +446,7 @@ def parse_documents(directory_path, error_directory_paths): if __name__ == '__main__': DATA_FOLDER = os.path.dirname(os.path.abspath(__file__)) ARTICLE_FOLDER = os.path.join(DATA_FOLDER, "sample/articles") - OUTPUT_FOLDER = os.path.join(DATA_FOLDER, "DocumentsParsed/all_articles") + OUTPUT_FOLDER = os.path.join(DATA_FOLDER, "DocumentsParsed") FILENAME_ERROR_FOLDER = os.path.join(DATA_FOLDER, "DocumentErrors/filename") HEADER_ERROR_FOLDER = os.path.join(DATA_FOLDER, "DocumentErrors/header") TEXT_ERROR_FOLDER = os.path.join(DATA_FOLDER, "DocumentErrors/text") @@ -462,13 +462,13 @@ def parse_documents(directory_path, error_directory_paths): } outfile = os.path.join(OUTPUT_FOLDER, "articles.json") - data = [] if len(sys.argv) > 1: + data = [] file_path = sys.argv[1] file_name, ext = os.path.splitext(os.path.basename(file_path)) outfile = os.path.join(OUTPUT_FOLDER, os.path.basename(file_name) + ".json") try: - data.append(parse_document(os.path.abspath(file_path), os.path.basename(file_path))) + data.append(parse_document(file_path)) except ArticleParseError as e: print e else: From ae919461a19c37e14b5818861f09376058dfa5b0 Mon Sep 17 00:00:00 2001 From: Steven Elleman Date: Wed, 11 Oct 2017 18:40:26 -0700 Subject: [PATCH 4/4] Clean up --- data/parse_document.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/data/parse_document.py b/data/parse_document.py index beec267..91c6273 100644 --- a/data/parse_document.py +++ b/data/parse_document.py @@ -97,9 +97,6 @@ def parse_article(raw_text, filename): raise ArticleParseError("Only found Useless tuas!", ArticleParseError.DUPLICATE_ERROR) - #print(clean_text) - # Warning: brackets left over are usually bad news. - #Trace highlights index = 0 highlight_open = False @@ -111,7 +108,7 @@ def parse_article(raw_text, filename): if (clean_text[index] == "[" and highlight_open): raise Exception("Extra [ in " + filename + " article number " + article_number + "\n") - #Assume that right square bracket with its corresponding left square bracket is an error. + #Assume that right square bracket without its corresponding left square bracket is an error. elif (clean_text[index] == "]" and not highlight_open): raise Exception("Extra ] in " + filename + " article number " + article_number + "\n") @@ -141,10 +138,6 @@ def parse_article(raw_text, filename): end_index = start_index + text_length offsets.append([start_index, end_index]) - - #1. make error type in pybossa_api.py - #2. try/catch in load_data.py (where parse_document is used) - #Can I delete below the comments below? #if '[' in clean_text or ']' in clean_text: # print "Unparsed brackets left in article:", article_number