diff --git a/data/parse_document.py b/data/parse_document.py
index 0d78bb1..91c6273 100644
--- a/data/parse_document.py
+++ b/data/parse_document.py
@@ -97,11 +97,49 @@ def parse_article(raw_text, filename):
         raise ArticleParseError("Only found Useless tuas!",
                                 ArticleParseError.DUPLICATE_ERROR)
 
-    # Warning: brackets left over are usually bad news.
-    if '[' in clean_text or ']' in clean_text:
-        print "Unparsed brackets left in article:", article_number
-#        raise ArticleParseError("Brackets remain in clean text!",
-#                                ArticleParseError.BRACKET_WARNING)
+    # Trace highlights: record every [bracketed] span and strip it from clean_text.
+    index = 0
+    highlight_open = False
+    highlights = []
+
+    while index < len(clean_text):
+
+        # Highlights cannot nest: a "[" while one is already open is an error.
+        if clean_text[index] == "[" and highlight_open:
+            raise Exception("Extra [ in %s article number %s\n" % (filename, article_number))
+
+        # A "]" with no highlight open has no matching "[" and is an error.
+        elif clean_text[index] == "]" and not highlight_open:
+            raise Exception("Extra ] in %s article number %s\n" % (filename, article_number))
+
+        # Start of highlight.
+        elif clean_text[index] == "[":
+            highlight_open = True
+            start = index
+
+        # Close of highlight: record it, then cut it (brackets included) out of the text.
+        elif clean_text[index] == "]":
+            highlight = {'start': start, 'end': index, 'text': clean_text[start:index+1]}
+            clean_text = clean_text[:start] + clean_text[index+1:]
+            # Step back so the shared increment below resumes scanning at `start`.
+            index = start - 1
+            highlights.append(highlight)
+            highlight_open = False
+        index += 1
+
+    # A "[" that never closes would otherwise stay in clean_text unnoticed.
+    if highlight_open:
+        raise Exception("Unclosed [ in %s article number %s\n" % (filename, article_number))
+
+    # Locate each highlight's text in the now bracket-free clean_text.
+    offsets = []
+    for highlight in highlights:
+        text = highlight['text'][1:-1].strip()
+        # NOTE: find() reports the first occurrence only, so repeated text maps to the earliest match.
+        start_index = clean_text.find(text)
+        if start_index == -1:
+            raise Exception("Highlight '%s' not recognized in %s\n" % (text, filename))
+        offsets.append([start_index, start_index + len(text)])
 
     # print out our data.
     # TODO: store this somewhere.
@@ -115,6 +153,7 @@ def parse_article(raw_text, filename):
         'periodical': periodical,
         'periodical_code': periodical_code,
         'filename': filename,
+        'highlight_offsets': offsets,
     }
     return {
         'metadata': metadata,
diff --git a/data/pybossa_api.py b/data/pybossa_api.py
index 66b8649..ab07e3b 100644
--- a/data/pybossa_api.py
+++ b/data/pybossa_api.py
@@ -36,6 +36,9 @@ class ImproperConfigForRemote(Exception):
 
 class InvalidTaskRun(Exception):
     pass
+class DecidingForceParserError(Exception):
+    pass
+
 
 @django_rq.job('task_exporter', timeout=60, result_ttl=24*3600)
 def create_or_update_remote_project_worker(project_id, debug_presenter=False,