From 9117f061ea77f0dde5cbdc045776e7d09dc41a68 Mon Sep 17 00:00:00 2001 From: Sarah Weissman Date: Tue, 11 Jun 2013 07:33:07 -0400 Subject: [PATCH 1/2] Error checks for parsing wikipedia dumps --- .../cloud9/collection/wikipedia/WikipediaPage.java | 11 +++++++---- .../wikipedia/language/EnglishWikipediaPage.java | 8 ++++++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java b/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java index 497e767c8..ea55ea7d8 100644 --- a/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java +++ b/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java @@ -84,8 +84,9 @@ public abstract class WikipediaPage extends Indexable { /** * Start delimiter of the text, which is <text xml:space=\"preserve\">. + * Note: No close bracket because text element can have multiple attributes. */ - protected static final String XML_START_TAG_TEXT = ""; + protected static final String XML_START_TAG_TEXT = "/text>. @@ -172,9 +173,11 @@ public String getLanguage() { */ public String getContent() { String s = getWikiMarkup(); - + if(s == null) return null; // Bliki doesn't seem to properly handle inter-language links, so remove manually. - s = LANG_LINKS.matcher(s).replaceAll(" "); + if(LANG_LINKS.matcher(s).matches()){ + s = LANG_LINKS.matcher(s).replaceAll(" "); + } wikiModel.setUp(); s = getTitle() + "\n" + wikiModel.render(textConverter, s); @@ -222,7 +225,7 @@ public String getRawXML() { * Returns the text of this page. */ public String getWikiMarkup() { - if (textStart == -1) + if (textStart == -1 || textStat + 27 > textEnd) return null; return page.substring(textStart + 27, textEnd); diff --git a/src/dist/edu/umd/cloud9/collection/wikipedia/language/EnglishWikipediaPage.java b/src/dist/edu/umd/cloud9/collection/wikipedia/language/EnglishWikipediaPage.java index e8f9c3b8e..a8b1a9d72 100644 --- a/src/dist/edu/umd/cloud9/collection/wikipedia/language/EnglishWikipediaPage.java +++ b/src/dist/edu/umd/cloud9/collection/wikipedia/language/EnglishWikipediaPage.java @@ -53,18 +53,22 @@ protected void processPage(String s) { // parse out title int start = s.indexOf(XML_START_TAG_TITLE); int end = s.indexOf(XML_END_TAG_TITLE, start); + if(start < 0 || end < 0){ + textStart = -1; + return; + } this.title = StringEscapeUtils.unescapeHtml(s.substring(start + 7, end)); // determine if article belongs to the article namespace start = s.indexOf(XML_START_TAG_NAMESPACE); end = s.indexOf(XML_END_TAG_NAMESPACE); - this.isArticle = start == -1 ? true : s.substring(start + 4, end).trim().equals("0"); + this.isArticle = (start == -1 || end == -1 || start > end) ? false : s.substring(start + 4, end).trim().equals("0"); // add check because namespace tag not present in older dumps // parse out the document id start = s.indexOf(XML_START_TAG_ID); end = s.indexOf(XML_END_TAG_ID); - this.mId = s.substring(start + 4, end); + this.mId = (start == -1 || end == -1 || start > end) ? "0" : s.substring(start + 4, end); // parse out actual text of article this.textStart = s.indexOf(XML_START_TAG_TEXT); From ebb59da6d6aa731580a2f9d09f6bd6880a910242 Mon Sep 17 00:00:00 2001 From: Sarah Weissman Date: Tue, 11 Jun 2013 07:43:52 -0400 Subject: [PATCH 2/2] Fixed Typo --- src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java b/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java index ea55ea7d8..3ba500db3 100644 --- a/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java +++ b/src/dist/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java @@ -225,7 +225,7 @@ public String getRawXML() { * Returns the text of this page. */ public String getWikiMarkup() { - if (textStart == -1 || textStat + 27 > textEnd) + if (textStart == -1 || textStart + 27 > textEnd) return null; return page.substring(textStart + 27, textEnd);