From 2298612d1137de4b42452601d758351be5bba6be Mon Sep 17 00:00:00 2001
From: Norman Gilmore <git@teamforecast.com>
Date: Mon, 18 Sep 2017 12:01:56 -0700
Subject: [PATCH 1/4] Fix parse_document to parse file from command line
 filename arg.

---
 data/parse_document.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data/parse_document.py b/data/parse_document.py
index 0d78bb1..c568297 100644
--- a/data/parse_document.py
+++ b/data/parse_document.py
@@ -412,13 +412,13 @@ def parse_documents(directory_path, error_directory_paths):
     }
 
     outfile = os.path.join(OUTPUT_FOLDER, "articles.json")
+    data = []
     if len(sys.argv) > 1:
-        data = []
         file_path = sys.argv[1]
         file_name, ext = os.path.splitext(os.path.basename(file_path))
         outfile = os.path.join(OUTPUT_FOLDER, os.path.basename(file_name) + ".json")
         try:
-            data.append(parse_document(file_path))
+            data.append(parse_document(os.path.abspath(file_path), os.path.basename(file_path)))
         except ArticleParseError as e:
             print e
     else:

From ff074def40d87e82389d7819b00318f8477d217f Mon Sep 17 00:00:00 2001
From: Steven Elleman <selleman@berkeley.edu>
Date: Wed, 11 Oct 2017 18:24:24 -0700
Subject: [PATCH 2/4] Deciding Force article parser modified to handle
 highlights and to throw errors for improperly formatted highlights

---
 data/parse_document.py | 60 ++++++++++++++++++++++++++++++++++++++----
 data/pybossa_api.py    |  3 +++
 2 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/data/parse_document.py b/data/parse_document.py
index c568297..a5df747 100644
--- a/data/parse_document.py
+++ b/data/parse_document.py
@@ -97,11 +97,60 @@ def parse_article(raw_text, filename):
         raise ArticleParseError("Only found Useless tuas!",
                                 ArticleParseError.DUPLICATE_ERROR)
 
+    #print(clean_text)
     # Warning: brackets left over are usually bad news.
-    if '[' in clean_text or ']' in clean_text:
-        print "Unparsed brackets left in article:", article_number
-#        raise ArticleParseError("Brackets remain in clean text!",
-#                                ArticleParseError.BRACKET_WARNING)
+
+    #Trace highlights
+    index = 0
+    highlight_open = False
+    highlights = []
+
+    while index < len(clean_text):
+
+        #Assume that highlights cannot have internal square brackets... throw error.
+        if (clean_text[index] == "[" and highlight_open):
+            raise Exception("Extra [ in " + filename + " article number " + article_number + "\n")
+
+        #Assume that right square bracket with its corresponding left square bracket is an error.
+        elif (clean_text[index] == "]" and not highlight_open):
+            raise Exception("Extra ] in " + filename + " article number " + article_number + "\n")
+
+        #Start of highlight.
+        elif clean_text[index] == "[":
+            highlight_open = True
+            start = index
+
+        #Close of highlight... Remove highlight from text.
+        elif clean_text[index] == "]" and highlight_open:
+            highlight = {'start': start, 'end': index, 'text': clean_text[start:index+1]}
+            clean_text = clean_text[0:start] + clean_text[index+1:]
+            index -= index+1 - start
+            highlights.append(highlight)
+            highlight_open = False
+        index += 1
+
+    #Find highlight offsets with new clean doc
+    offsets = []
+    for highlight in highlights:
+        text = highlight['text'][1:-1].strip()
+        text_length = len(text)
+
+        start_index = clean_text.find(text)
+        if start_index == -1:
+            raise Exception("Highlight '" + text + "' not recognized in " + filename + "\n")
+        end_index = start_index + text_length
+        offsets.append([start_index, end_index])
+
+
+    #1. make error type in pybossa_api.py
+    #2. try/catch in load_data.py (where parse_document is used)
+
+    #Can I delete below the comments below?
+    #if '[' in clean_text or ']' in clean_text:
+    #    print "Unparsed brackets left in article:", article_number
+    #        raise ArticleParseError("Brackets remain in clean text!",
+    #                                ArticleParseError.BRACKET_WARNING)
+    #
 
     # print out our data.
     # TODO: store this somewhere.
@@ -115,6 +164,7 @@ def parse_article(raw_text, filename):
         'periodical': periodical,
         'periodical_code': periodical_code,
         'filename': filename,
+        'highlight_offsets': offsets,
     }
     return {
         'metadata': metadata,
@@ -396,7 +446,7 @@ def parse_documents(directory_path, error_directory_paths):
 if __name__ == '__main__':
     DATA_FOLDER = os.path.dirname(os.path.abspath(__file__))
     ARTICLE_FOLDER = os.path.join(DATA_FOLDER, "sample/articles")
-    OUTPUT_FOLDER = os.path.join(DATA_FOLDER, "DocumentsParsed")
+    OUTPUT_FOLDER = os.path.join(DATA_FOLDER, "DocumentsParsed/all_articles")
     FILENAME_ERROR_FOLDER = os.path.join(DATA_FOLDER, "DocumentErrors/filename")
     HEADER_ERROR_FOLDER = os.path.join(DATA_FOLDER, "DocumentErrors/header")
     TEXT_ERROR_FOLDER = os.path.join(DATA_FOLDER, "DocumentErrors/text")
diff --git a/data/pybossa_api.py b/data/pybossa_api.py
index 66b8649..ab07e3b 100644
--- a/data/pybossa_api.py
+++ b/data/pybossa_api.py
@@ -36,6 +36,9 @@ class ImproperConfigForRemote(Exception):
 class InvalidTaskRun(Exception):
     pass
 
+class DecidingForceParserError(Exception):
+    pass
+
 @django_rq.job('task_exporter', timeout=60, result_ttl=24*3600)
 def create_or_update_remote_project_worker(project_id,
                                            debug_presenter=False,

From 75b35f0ace3c5bd3949f1adceca7df56d94906c6 Mon Sep 17 00:00:00 2001
From: Steven Elleman <selleman@berkeley.edu>
Date: Wed, 11 Oct 2017 18:26:44 -0700
Subject: [PATCH 3/4] Clean up: restore OUTPUT_FOLDER, undo PATCH 1/4 changes

---
 data/parse_document.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/data/parse_document.py b/data/parse_document.py
index a5df747..beec267 100644
--- a/data/parse_document.py
+++ b/data/parse_document.py
@@ -446,7 +446,7 @@ def parse_documents(directory_path, error_directory_paths):
 if __name__ == '__main__':
     DATA_FOLDER = os.path.dirname(os.path.abspath(__file__))
     ARTICLE_FOLDER = os.path.join(DATA_FOLDER, "sample/articles")
-    OUTPUT_FOLDER = os.path.join(DATA_FOLDER, "DocumentsParsed/all_articles")
+    OUTPUT_FOLDER = os.path.join(DATA_FOLDER, "DocumentsParsed")
     FILENAME_ERROR_FOLDER = os.path.join(DATA_FOLDER, "DocumentErrors/filename")
     HEADER_ERROR_FOLDER = os.path.join(DATA_FOLDER, "DocumentErrors/header")
     TEXT_ERROR_FOLDER = os.path.join(DATA_FOLDER, "DocumentErrors/text")
@@ -462,13 +462,13 @@ def parse_documents(directory_path, error_directory_paths):
     }
 
     outfile = os.path.join(OUTPUT_FOLDER, "articles.json")
-    data = []
     if len(sys.argv) > 1:
+        data = []
         file_path = sys.argv[1]
         file_name, ext = os.path.splitext(os.path.basename(file_path))
         outfile = os.path.join(OUTPUT_FOLDER, os.path.basename(file_name) + ".json")
         try:
-            data.append(parse_document(os.path.abspath(file_path), os.path.basename(file_path)))
+            data.append(parse_document(file_path))
         except ArticleParseError as e:
             print e
     else:

From ae919461a19c37e14b5818861f09376058dfa5b0 Mon Sep 17 00:00:00 2001
From: Steven Elleman <selleman@berkeley.edu>
Date: Wed, 11 Oct 2017 18:40:26 -0700
Subject: [PATCH 4/4] Clean up: remove stale comments, fix bracket comment

---
 data/parse_document.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/data/parse_document.py b/data/parse_document.py
index beec267..91c6273 100644
--- a/data/parse_document.py
+++ b/data/parse_document.py
@@ -97,9 +97,6 @@ def parse_article(raw_text, filename):
         raise ArticleParseError("Only found Useless tuas!",
                                 ArticleParseError.DUPLICATE_ERROR)
 
-    #print(clean_text)
-    # Warning: brackets left over are usually bad news.
-
     #Trace highlights
     index = 0
     highlight_open = False
@@ -111,7 +108,7 @@ def parse_article(raw_text, filename):
         if (clean_text[index] == "[" and highlight_open):
             raise Exception("Extra [ in " + filename + " article number " + article_number + "\n")
 
-        #Assume that right square bracket with its corresponding left square bracket is an error.
+        #Assume that right square bracket without its corresponding left square bracket is an error.
         elif (clean_text[index] == "]" and not highlight_open):
             raise Exception("Extra ] in " + filename + " article number " + article_number + "\n")
 
@@ -141,10 +138,6 @@ def parse_article(raw_text, filename):
         end_index = start_index + text_length
         offsets.append([start_index, end_index])
 
-
-    #1. make error type in pybossa_api.py
-    #2. try/catch in load_data.py (where parse_document is used)
-
     #Can I delete below the comments below?
     #if '[' in clean_text or ']' in clean_text:
     #    print "Unparsed brackets left in article:", article_number