From b02682632c7986ddc509ca6408c31b8c4f8f7767 Mon Sep 17 00:00:00 2001 From: "Earl J. Wagner" Date: Sun, 25 Sep 2011 19:33:42 -0400 Subject: [PATCH 01/18] minor fix --- build.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.xml b/build.xml index 33c5efe22..20b532d14 100644 --- a/build.xml +++ b/build.xml @@ -103,10 +103,10 @@ - + - + From 2fecb8d44580b1b86cd95efb9b09716f9ff47086 Mon Sep 17 00:00:00 2001 From: "Earl J. Wagner" Date: Sun, 25 Sep 2011 19:34:54 -0400 Subject: [PATCH 02/18] added optional printing of elements --- src/dist/edu/umd/cloud9/util/map/HMapIF.java | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/dist/edu/umd/cloud9/util/map/HMapIF.java b/src/dist/edu/umd/cloud9/util/map/HMapIF.java index 14ec8cbd4..b7c449fc2 100644 --- a/src/dist/edu/umd/cloud9/util/map/HMapIF.java +++ b/src/dist/edu/umd/cloud9/util/map/HMapIF.java @@ -810,22 +810,31 @@ float loadFactor() { return loadFactor; } - public String toString() { + public String toString () { + return toString (-1); + } + + public String toString (int n) { Iterator i = entrySet().iterator(); - if (!i.hasNext()) + if (!i.hasNext() || n == 0) return "{}"; StringBuilder sb = new StringBuilder(); sb.append('{'); - for (;;) { + for (int m = 2; ; m++) { MapIF.Entry e = i.next(); int key = e.getKey(); float value = e.getValue(); + //sb.append("(m: " + m + ", n: " + n + ")"); sb.append(key); sb.append('='); sb.append(value); - if (!i.hasNext()) + if (! i.hasNext() || (m > n && n > 0)) { + if (i.hasNext()) { + sb.append (", ..."); + } return sb.append('}').toString(); + } sb.append(", "); } } From ea493b94f8981eab7b2b9ad69d0d6e20c67a4f80 Mon Sep 17 00:00:00 2001 From: "Earl J. 
Wagner" Date: Fri, 7 Oct 2011 14:10:23 -0400 Subject: [PATCH 03/18] deterministically convert between Aquaint2 docnos and docids --- .../aquaint2/Aquaint2DocnoMapping.java | 184 ++++++++++++++++-- .../aquaint2/BuildAquaint2ForwardIndex.java | 2 +- 2 files changed, 166 insertions(+), 20 deletions(-) diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java index 26a116072..65fb43ed6 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java @@ -27,6 +27,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.util.LineReader; +import org.apache.log4j.Level; import org.apache.log4j.Logger; import com.google.common.base.Preconditions; @@ -36,24 +37,128 @@ public class Aquaint2DocnoMapping implements DocnoMapping { private static final Logger LOG = Logger.getLogger(Aquaint2DocnoMapping.class); + // { LOG.setLevel (Level.TRACE); } - private String[] docids; + private String[] docidEntries; @Override public int getDocno(String docid) { + LOG.trace("getDocno(docid: " + docid + ")"); Preconditions.checkNotNull(docid); - return Arrays.binarySearch(docids, docid); + String source = docid.substring(0, 7); + int year = Integer.parseInt(docid.substring (8, 12)); + int month = Integer.parseInt(docid.substring (12, 14)); + int day = Integer.parseInt(docid.substring (14, 16)); + int articleNo = Integer.parseInt(docid.substring (17, 21)); + + // first traverse the entries to find the month entry and get its days + int entryId = findEntryId(source, year, month); + LOG.debug("entryId: " + entryId); + + String entryElt = docidEntries[entryId].split("\t")[day]; + LOG.debug("entryElt: " + entryElt); + + // then traverse the days to find the day and skip over missing articles to get the article number + String[] entryEltParts = entryElt.split(" "); + int 
result = articleNo + Integer.parseInt(entryEltParts[0]); + String[] entryDayParts = entryEltParts[1].split(","); + for (int i = 1; i < entryDayParts.length; i++) { + int missingNo = Integer.parseInt(entryDayParts[i]); + if (articleNo < missingNo) break; + LOG.debug("skipping missingNo: " + missingNo); + result--; + } + + LOG.trace("getDocno returning: " + result); + return result; + } + + private int findEntryId(String source, int year, int month) { + for (int i = 0; i < docidEntries.length; i++) { + LOG.debug("docidEntries [" + i + "]: " + docidEntries[i]); + String[] entryElts = docidEntries[i].split("\t"); + String[] entryMetaInfo = entryElts[0].split(" "); + String entrySource = entryMetaInfo[1]; + if (entrySource.equals (source)) { + int entryYear = Integer.parseInt(entryMetaInfo[2]); + if (entryYear == year) { + int entryMonth = Integer.parseInt(entryMetaInfo[3]); + if (entryMonth == month) { + return i; + } + } + } + } + return -1; } + @Override public String getDocid(int docno) { Preconditions.checkArgument(docno > 0); - return docids[docno]; + LOG.trace("getDocid(docno: " + docno + ")"); + + // first traverse the entries to find the month entry and get its source, year, month + int entryId = findEntryId(docno); + LOG.debug("entryId: " + entryId); + String[] entryElts = docidEntries[entryId].split("\t"); + String[] entryMetaInfo = entryElts[0].split(" "); + String source = entryMetaInfo[1]; + int year = Integer.parseInt(entryMetaInfo[2]); + int month = Integer.parseInt(entryMetaInfo[3]); + LOG.debug("looking at: " + String.format("%s_%04d%02d__.____", source, year, month)); + + // then traverse the days to find the day and skip over missing articles to get the article number + String[] entryEltParts = findEntryEltParts (docno, entryElts); + int offset = Integer.parseInt(entryEltParts[0]); + String[] entryDayParts = entryEltParts[1].split(","); + int day = Integer.parseInt(entryDayParts[0]); + LOG.debug("found day: " + day + ", looking at: " + 
String.format("%s_%04d%02d%02d.____", source, year, month, day)); + int articleNo = docno - offset; + for (int i = 1; i < entryDayParts.length; i++) { + int missingNo = Integer.parseInt(entryDayParts[i]); + if (articleNo < missingNo) break; + LOG.debug("skipping missingNo: " + missingNo); + articleNo++; + } + LOG.debug("found articleNo: " + articleNo + ", looking at: " + String.format("%s_%04d%02d%02d.%04d", source, year, month, day, articleNo)); + return String.format ("%s_%04d%02d%02d.%04d", source, year, month, day, articleNo); + } + + + private int findEntryId(int docno) { + for (int i = 0; i < docidEntries.length; i++) { + LOG.debug("docidEntries [" + i + "]: " + docidEntries[i]); + int entryOffset = Integer.parseInt(docidEntries[i].split(" ") [0]); + if (entryOffset >= docno) { + return i - 1; + } + } + return docidEntries.length - 1; } + + private String[] findEntryEltParts(int docno, String[] entryElts) { + String[] thisEltParts = new String[0]; + int prevOffset = -1; + String[] prevEltParts = new String[0]; + + for (int i = 1; i < entryElts.length; i++) { + thisEltParts = entryElts[i].split(" "); + int thisOffset = Integer.parseInt(thisEltParts[0]); + if (thisOffset >= docno) { + return prevEltParts; + } + prevOffset = thisOffset; + prevEltParts = thisEltParts; + } + return thisEltParts; + } + + @Override public void loadMapping(Path p, FileSystem fs) throws IOException { - docids = Aquaint2DocnoMapping.readDocnoData(p, fs); + docidEntries = Aquaint2DocnoMapping.readDocnoData(p, fs); } static public void writeDocnoData(Path input, Path output, FileSystem fs) throws IOException { @@ -64,14 +169,59 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws LOG.info("Reading " + input); int cnt = 0; Text line = new Text(); + + String prevSource = null; + int prevYear = -1; + int prevMonth = -1; + int prevDay = -1; + int prevArticleNo = -1; + StringBuilder currentEntry = null; + while (reader.readLine(line) > 0) { - String[] arr = 
line.toString().split("\\t"); - list.add(arr[0]); + String docid = line.toString(); + + String source = docid.substring(0, 7); + int year = Integer.parseInt(docid.substring (8, 12)); + int month = Integer.parseInt(docid.substring (12, 14)); + int day = Integer.parseInt(docid.substring (14, 16)); + int articleNo = Integer.parseInt(docid.substring (17, 21)); + LOG.debug("prevSource: " + prevSource + ", prevYear: " + prevYear + ", prevMonth: " + prevMonth + ", prevDay: " + prevDay + ", prevArticleNo: " + prevArticleNo); + LOG.debug("source: " + source + ", year: " + year + ", month: " + month + ", day: " + day + ", articleNo: " + articleNo); + + if (! source.equals(prevSource) || + year != prevYear || + month != prevMonth) { + LOG.debug("currentEntry: " + currentEntry); + if (currentEntry != null) list.add(currentEntry.toString()); + currentEntry = new StringBuilder(cnt + " " + source + " " + year + " " + month); + prevDay = 0; + prevArticleNo = 0; + } + if (day != prevDay) { + for (int i = prevDay + 1; i <= day; i++) { + currentEntry.append("\t" + cnt + " " + i); + } + prevArticleNo = 0; + } + if (articleNo != prevArticleNo + 1) { + // we have missing article numbers - gather them + for (int i = prevArticleNo + 1; i < articleNo; i++) { + currentEntry.append("," + i); + } + } + prevSource = source; + prevYear = year; + prevMonth = month; + prevDay = day; + prevArticleNo = articleNo; + cnt++; if (cnt % 100000 == 0) { LOG.info(cnt + " docs"); } } + list.add(currentEntry.toString()); + list.add("" + cnt); reader.close(); LOG.info(cnt + " docs total. Done!"); @@ -83,31 +233,27 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws out.writeUTF(list.get(i)); cnt++; if (cnt % 100000 == 0) { - LOG.info(cnt + " docs"); + LOG.info(cnt + " months of docs"); } } out.close(); - LOG.info(cnt + " docs total. Done!"); + LOG.info(cnt + " months of docs total. 
Done!"); } static public String[] readDocnoData(Path p, FileSystem fs) throws IOException { - LOG.warn("p: " + p); + LOG.trace("readDocnoData (p: " + p + ", fs)"); FSDataInputStream in = fs.open(p); - // Docnos start at one, so we need an array that's one larger than number of docs. - int sz = in.readInt() + 1; - LOG.warn("creating array of length: " + sz); + int sz = in.readInt(); + LOG.debug("creating a month array of length: " + sz); String[] arr = new String[sz]; - for (int i = 1; i < sz; i++) { + for (int i = 0; i < sz; i++) { arr[i] = in.readUTF(); + LOG.debug("arr[" + i + "]: " + arr[i]); } in.close(); - // Can't leave the zero'th entry null, or else we might get a null pointer exception during a - // binary search on the array. - arr[0] = ""; - return arr; } @@ -125,8 +271,8 @@ public static void main(String[] args) throws IOException { mapping.loadMapping(new Path(args[1]), fs); if (args[0].equals("list")) { - for (int i = 1; i < mapping.docids.length; i++) { - System.out.println(i + "\t" + mapping.docids[i]); + for (int i = 1; i < mapping.docidEntries.length; i++) { + System.out.println(i + "\t" + mapping.docidEntries[i]); } } else if (args[0].equals("getDocno")) { System.out.println("looking up docno for \"" + args[2] + "\""); diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/BuildAquaint2ForwardIndex.java b/src/dist/edu/umd/cloud9/collection/aquaint2/BuildAquaint2ForwardIndex.java index 9a6d3f2fc..3163f0cf0 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/BuildAquaint2ForwardIndex.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/BuildAquaint2ForwardIndex.java @@ -164,7 +164,7 @@ public int runTool (Configuration config, String collectionPath, String outputPa String inputFile = outputPath + "/" + "part-00000"; - sLogger.info("Writing " + numDocs + " doc offseta to " + indexFile); + sLogger.info("Writing " + numDocs + " doc offsets to " + indexFile); FSLineReader reader = new FSLineReader(inputFile, fs); FSDataOutputStream writer = 
fs.create(new Path(indexFile), true); From c4e081c4ddeb2232cdb1d601174de68bd961d7ab Mon Sep 17 00:00:00 2001 From: "Earl J. Wagner" Date: Tue, 11 Oct 2011 21:38:33 -0400 Subject: [PATCH 04/18] outputting docno mapping data in smaller chunks for writeUTF 64k limit --- .../aquaint2/Aquaint2DocnoMapping.java | 53 ++++++++++++++----- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java index 65fb43ed6..d5b9d6627 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java @@ -37,7 +37,8 @@ public class Aquaint2DocnoMapping implements DocnoMapping { private static final Logger LOG = Logger.getLogger(Aquaint2DocnoMapping.class); - // { LOG.setLevel (Level.TRACE); } + // { LOG.setLevel (Level.TRACE); } + { LOG.setLevel (Level.INFO); } private String[] docidEntries; @@ -176,6 +177,7 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws int prevDay = -1; int prevArticleNo = -1; StringBuilder currentEntry = null; + int numEntries = 0; while (reader.readLine(line) > 0) { String docid = line.toString(); @@ -189,10 +191,14 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws LOG.debug("source: " + source + ", year: " + year + ", month: " + month + ", day: " + day + ", articleNo: " + articleNo); if (! 
source.equals(prevSource) || - year != prevYear || - month != prevMonth) { + year != prevYear || + month != prevMonth) { LOG.debug("currentEntry: " + currentEntry); - if (currentEntry != null) list.add(currentEntry.toString()); + if (currentEntry != null) { + list.add(currentEntry.toString()); + list.add(""); + numEntries++; + } currentEntry = new StringBuilder(cnt + " " + source + " " + year + " " + month); prevDay = 0; prevArticleNo = 0; @@ -202,6 +208,10 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws currentEntry.append("\t" + cnt + " " + i); } prevArticleNo = 0; + // writeUTF can't write a string longer than 64k, so we output a chunk at a time + // here then concatenate strings between s + list.add(currentEntry.toString()); + currentEntry = new StringBuilder (); } if (articleNo != prevArticleNo + 1) { // we have missing article numbers - gather them @@ -214,30 +224,32 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws prevMonth = month; prevDay = day; prevArticleNo = articleNo; - + cnt++; if (cnt % 100000 == 0) { LOG.info(cnt + " docs"); } } list.add(currentEntry.toString()); + list.add(""); + numEntries++; list.add("" + cnt); reader.close(); LOG.info(cnt + " docs total. Done!"); - cnt = 0; LOG.info("Writing " + output); FSDataOutputStream out = fs.create(output, true); - out.writeInt(list.size()); + out.writeInt(numEntries); + numEntries = 0; for (int i = 0; i < list.size(); i++) { out.writeUTF(list.get(i)); - cnt++; - if (cnt % 100000 == 0) { - LOG.info(cnt + " months of docs"); + numEntries++; + if (numEntries % 10000 == 0) { + LOG.info(numEntries + " months of docs"); } } out.close(); - LOG.info(cnt + " months of docs total. Done!"); + LOG.info(numEntries + " months of docs total. 
Done!"); } static public String[] readDocnoData(Path p, FileSystem fs) throws IOException { @@ -247,9 +259,24 @@ static public String[] readDocnoData(Path p, FileSystem fs) throws IOException { int sz = in.readInt(); LOG.debug("creating a month array of length: " + sz); String[] arr = new String[sz]; + String currentEntryPart = in.readUTF(); + StringBuilder currentEntry = new StringBuilder(); + int i = 0; + while (!currentEntryPart.equals("")) { + LOG.debug("currentEntryPart: " + currentEntryPart); + if (currentEntryPart.equals("")) { + arr[i] = currentEntry.toString(); + LOG.debug("arr[" + i + "]: " + arr[i]); + i++; + currentEntry = new StringBuilder(); + } else { + currentEntry.append(currentEntryPart); + } + currentEntryPart = in.readUTF(); + } - for (int i = 0; i < sz; i++) { - arr[i] = in.readUTF(); + if (currentEntry.length() > 0){ + arr[i] = currentEntry.toString(); LOG.debug("arr[" + i + "]: " + arr[i]); } in.close(); From 79da6def23fdc922e142336a34ee3c10a7a08c4d Mon Sep 17 00:00:00 2001 From: "Earl J. 
Wagner" Date: Sun, 16 Oct 2011 23:13:22 -0400 Subject: [PATCH 05/18] catch errors with reading headline --- .../collection/aquaint2/Aquaint2Document.java | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java index 08cde55a8..ea7c3089f 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java @@ -22,10 +22,16 @@ import java.util.regex.Pattern; import org.apache.hadoop.io.WritableUtils; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; import edu.umd.cloud9.collection.Indexable; + public class Aquaint2Document extends Indexable { + private static final Logger LOG = Logger.getLogger(Aquaint2Document.class); + { LOG.setLevel (Level.INFO); } + private static Pattern TAGS_PATTERN = Pattern.compile("<[^>]+>"); private static Pattern WHITESPACE_PATTERN = Pattern.compile("\t|\n"); @@ -73,8 +79,15 @@ public String getHeadline() { headline = ""; } else { int end = raw.indexOf(""); - headline = raw.substring(start + 10, end).trim(); - + try { + headline = raw.substring(start + 10, end).trim(); + } catch (Exception e) { + LOG.error("exception: " + e); + LOG.error("docid: " + getDocid () + ", start: " + start + ", end: " + end); + LOG.error(raw); + headline = raw.substring(start + 10).trim(); + LOG.error("headline should be: " + headline); + } headline = TAGS_PATTERN.matcher(headline).replaceAll(""); headline = WHITESPACE_PATTERN.matcher(headline).replaceAll(" "); } From a4d39fd7526c06a42940b61bbcb6ec7901aa8063 Mon Sep 17 00:00:00 2001 From: "Earl J. 
Wagner" Date: Mon, 24 Oct 2011 23:02:15 -0400 Subject: [PATCH 06/18] reformatted --- .../edu/umd/cloud9/io/SequenceFileUtils.java | 490 +++++++++--------- 1 file changed, 245 insertions(+), 245 deletions(-) diff --git a/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java b/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java index 106cb7ec8..5552a04dc 100644 --- a/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java +++ b/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java @@ -36,249 +36,249 @@ */ public class SequenceFileUtils { - private SequenceFileUtils() {} - - public static List> readFile(Path path) { - FileSystem fs; - try { - fs = FileSystem.get(new Configuration()); - } catch (IOException e) { - throw new RuntimeException("Unable to access the file system!"); - } - - return readFile(path, fs, Integer.MAX_VALUE); - } - - public static List> readFile(Path path, int max) { - FileSystem fs; - try { - fs = FileSystem.get(new Configuration()); - } catch (IOException e) { - throw new RuntimeException("Unable to access the file system!"); - } - - return readFile(path, fs, max); - } - - public static List> readFile(Path path, FileSystem fs) { - return readFile(path, fs, Integer.MAX_VALUE); - } - - /** - * Reads key-value pairs from a SequenceFile, up to a maximum number. 
- * - * @param path path to file - * @param max maximum of key-value pairs to read - * @return list of key-value pairs - */ - @SuppressWarnings("unchecked") - public static List> readFile(Path path, FileSystem fs, int max) { - List> list = new ArrayList>(); - - try { - int k = 0; - SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf()); - - K key = (K) reader.getKeyClass().newInstance(); - V value = (V) reader.getValueClass().newInstance(); - - while (reader.next(key, value)) { - k++; - list.add(new PairOfWritables(key, value)); - if (k >= max) { - break; - } - - key = (K) reader.getKeyClass().newInstance(); - value = (V) reader.getValueClass().newInstance(); - } - reader.close(); - } catch (Exception e) { - throw new RuntimeException("Error reading SequenceFile " + path); - } - - return list; - } - - public static SortedMap readFileIntoMap(Path path) { - FileSystem fs; - try { - fs = FileSystem.get(new Configuration()); - } catch (IOException e) { - throw new RuntimeException("Unable to access the file system!"); - } - - return readFileIntoMap(path, fs, Integer.MAX_VALUE); - } - - public static SortedMap readFileIntoMap(Path path, int max) { - FileSystem fs; - try { - fs = FileSystem.get(new Configuration()); - } catch (IOException e) { - throw new RuntimeException("Unable to access the file system!"); - } - - return readFileIntoMap(path, fs, max); - } - - public static SortedMap readFileIntoMap(Path path, FileSystem fs) { - return readFileIntoMap(path, fs, Integer.MAX_VALUE); - } - - public static SortedMap readFileIntoMap(Path path, FileSystem fs, int max) { - SortedMap map = new TreeMap(); - - for ( PairOfWritables pair : SequenceFileUtils.readFile(path, fs, max)) { - map.put(pair.getLeftElement(), pair.getRightElement()); - } - return map; - } - - public static List> readDirectory(Path path) { - FileSystem fs; - try { - fs = FileSystem.get(new Configuration()); - } catch (IOException e) { - throw new RuntimeException("Unable to access the 
file system!"); - } - - return readDirectory(path, fs, Integer.MAX_VALUE); - } - - /** - * Reads key-value pairs from a directory containing SequenceFiles. A - * maximum number of key-value pairs is read from each SequenceFile. - * - * @param path path to directory - * @param max maximum of key-value pairs to read per file - * @return list of key-value pairs - */ - public static List> readDirectory(Path path, FileSystem fs, int max) { - List> list = new ArrayList>(); - - try { - FileStatus[] stat = fs.listStatus(path); - for (int i = 0; i < stat.length; ++i) { - - // skip '_log' directory - if (stat[i].getPath().getName().startsWith("_")) - continue; - - List> pairs = readFile(stat[i].getPath(), fs, max); - list.addAll(pairs); - } - } catch (IOException e) { - throw new RuntimeException("Error reading the file system!"); - } - - return list; - } - - public static List readKeys(Path path) { - FileSystem fs; - try { - fs = FileSystem.get(new Configuration()); - } catch (IOException e) { - throw new RuntimeException("Unable to access the file system!"); - } - - return readKeys(path, fs, Integer.MAX_VALUE); - } - - public static List readKeys(Path path, int max) { - FileSystem fs; - try { - fs = FileSystem.get(new Configuration()); - } catch (IOException e) { - throw new RuntimeException("Unable to access the file system!"); - } - - return readKeys(path, fs, max); - } - - public static List readKeys(Path path, FileSystem fs) { - return readKeys(path, fs, Integer.MAX_VALUE); - } - - @SuppressWarnings("unchecked") - public static List readKeys(Path path, FileSystem fs, int max) { - List list = new ArrayList(); - - try { - int k = 0; - SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf()); - - K key = (K) reader.getKeyClass().newInstance(); - Writable value = (Writable) reader.getValueClass().newInstance(); - while (reader.next(key, value)) { - k++; - list.add(key); - if (k >= max) { - break; - } - - key = (K) reader.getKeyClass().newInstance(); - } 
- reader.close(); - } catch (Exception e) { - throw new RuntimeException("Error reading SequenceFile " + path); - } - - return list; - } - - public static List readValues(Path path) { - FileSystem fs; - try { - fs = FileSystem.get(new Configuration()); - } catch (IOException e) { - throw new RuntimeException("Unable to access the file system!"); - } - - return readValues(path, fs, Integer.MAX_VALUE); - } - - public static List readValues(Path path, int max) { - FileSystem fs; - try { - fs = FileSystem.get(new Configuration()); - } catch (IOException e) { - throw new RuntimeException("Unable to access the file system!"); - } - - return readValues(path, fs, max); - } - - public static List readValues(Path path, FileSystem fs) { - return readValues(path, fs, Integer.MAX_VALUE); - } - - @SuppressWarnings("unchecked") - public static List readValues(Path path, FileSystem fs, int max) { - List list = new ArrayList(); - - try { - int k = 0; - SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf()); - - Writable key = (Writable) reader.getKeyClass().newInstance(); - V value = (V) reader.getValueClass().newInstance(); - - while (reader.next(key, value)) { - k++; - list.add(value); - if (k >= max) { - break; - } - - value = (V) reader.getValueClass().newInstance(); - } - reader.close(); - } catch (Exception e) { - throw new RuntimeException("Error reading SequenceFile " + path); - } - - return list; - } + private SequenceFileUtils() {} + + public static List> readFile(Path path) { + FileSystem fs; + try { + fs = FileSystem.get(new Configuration()); + } catch (IOException e) { + throw new RuntimeException("Unable to access the file system!"); + } + + return readFile(path, fs, Integer.MAX_VALUE); + } + + public static List> readFile(Path path, int max) { + FileSystem fs; + try { + fs = FileSystem.get(new Configuration()); + } catch (IOException e) { + throw new RuntimeException("Unable to access the file system!"); + } + + return readFile(path, fs, max); + 
} + + public static List> readFile(Path path, FileSystem fs) { + return readFile(path, fs, Integer.MAX_VALUE); + } + + /** + * Reads key-value pairs from a SequenceFile, up to a maximum number. + * + * @param path path to file + * @param max maximum of key-value pairs to read + * @return list of key-value pairs + */ + @SuppressWarnings("unchecked") + public static List> readFile(Path path, FileSystem fs, int max) { + List> list = new ArrayList>(); + + try { + int k = 0; + SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf()); + + K key = (K) reader.getKeyClass().newInstance(); + V value = (V) reader.getValueClass().newInstance(); + + while (reader.next(key, value)) { + k++; + list.add(new PairOfWritables(key, value)); + if (k >= max) { + break; + } + + key = (K) reader.getKeyClass().newInstance(); + value = (V) reader.getValueClass().newInstance(); + } + reader.close(); + } catch (Exception e) { + throw new RuntimeException("Error reading SequenceFile " + path); + } + + return list; + } + + public static SortedMap readFileIntoMap(Path path) { + FileSystem fs; + try { + fs = FileSystem.get(new Configuration()); + } catch (IOException e) { + throw new RuntimeException("Unable to access the file system!"); + } + + return readFileIntoMap(path, fs, Integer.MAX_VALUE); + } + + public static SortedMap readFileIntoMap(Path path, int max) { + FileSystem fs; + try { + fs = FileSystem.get(new Configuration()); + } catch (IOException e) { + throw new RuntimeException("Unable to access the file system!"); + } + + return readFileIntoMap(path, fs, max); + } + + public static SortedMap readFileIntoMap(Path path, FileSystem fs) { + return readFileIntoMap(path, fs, Integer.MAX_VALUE); + } + + public static SortedMap readFileIntoMap(Path path, FileSystem fs, int max) { + SortedMap map = new TreeMap(); + + for ( PairOfWritables pair : SequenceFileUtils.readFile(path, fs, max)) { + map.put(pair.getLeftElement(), pair.getRightElement()); + } + return map; + } + + 
public static List> readDirectory(Path path) { + FileSystem fs; + try { + fs = FileSystem.get(new Configuration()); + } catch (IOException e) { + throw new RuntimeException("Unable to access the file system!"); + } + + return readDirectory(path, fs, Integer.MAX_VALUE); + } + + /** + * Reads key-value pairs from a directory containing SequenceFiles. A + * maximum number of key-value pairs is read from each SequenceFile. + * + * @param path path to directory + * @param max maximum of key-value pairs to read per file + * @return list of key-value pairs + */ + public static List> readDirectory(Path path, FileSystem fs, int max) { + List> list = new ArrayList>(); + + try { + FileStatus[] stat = fs.listStatus(path); + for (int i = 0; i < stat.length; ++i) { + + // skip '_log' directory + if (stat[i].getPath().getName().startsWith("_")) + continue; + + List> pairs = readFile(stat[i].getPath(), fs, max); + list.addAll(pairs); + } + } catch (IOException e) { + throw new RuntimeException("Error reading the file system!"); + } + + return list; + } + + public static List readKeys(Path path) { + FileSystem fs; + try { + fs = FileSystem.get(new Configuration()); + } catch (IOException e) { + throw new RuntimeException("Unable to access the file system!"); + } + + return readKeys(path, fs, Integer.MAX_VALUE); + } + + public static List readKeys(Path path, int max) { + FileSystem fs; + try { + fs = FileSystem.get(new Configuration()); + } catch (IOException e) { + throw new RuntimeException("Unable to access the file system!"); + } + + return readKeys(path, fs, max); + } + + public static List readKeys(Path path, FileSystem fs) { + return readKeys(path, fs, Integer.MAX_VALUE); + } + + @SuppressWarnings("unchecked") + public static List readKeys(Path path, FileSystem fs, int max) { + List list = new ArrayList(); + + try { + int k = 0; + SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf()); + + K key = (K) reader.getKeyClass().newInstance(); + Writable value 
= (Writable) reader.getValueClass().newInstance(); + while (reader.next(key, value)) { + k++; + list.add(key); + if (k >= max) { + break; + } + + key = (K) reader.getKeyClass().newInstance(); + } + reader.close(); + } catch (Exception e) { + throw new RuntimeException("Error reading SequenceFile " + path); + } + + return list; + } + + public static List readValues(Path path) { + FileSystem fs; + try { + fs = FileSystem.get(new Configuration()); + } catch (IOException e) { + throw new RuntimeException("Unable to access the file system!"); + } + + return readValues(path, fs, Integer.MAX_VALUE); + } + + public static List readValues(Path path, int max) { + FileSystem fs; + try { + fs = FileSystem.get(new Configuration()); + } catch (IOException e) { + throw new RuntimeException("Unable to access the file system!"); + } + + return readValues(path, fs, max); + } + + public static List readValues(Path path, FileSystem fs) { + return readValues(path, fs, Integer.MAX_VALUE); + } + + @SuppressWarnings("unchecked") + public static List readValues(Path path, FileSystem fs, int max) { + List list = new ArrayList(); + + try { + int k = 0; + SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf()); + + Writable key = (Writable) reader.getKeyClass().newInstance(); + V value = (V) reader.getValueClass().newInstance(); + + while (reader.next(key, value)) { + k++; + list.add(value); + if (k >= max) { + break; + } + + value = (V) reader.getValueClass().newInstance(); + } + reader.close(); + } catch (Exception e) { + throw new RuntimeException("Error reading SequenceFile " + path); + } + + return list; + } } From 5394796772f97ab4dadc8779f8998ec0060f055e Mon Sep 17 00:00:00 2001 From: "Earl J. 
Wagner" Date: Thu, 27 Oct 2011 14:53:37 -0400 Subject: [PATCH 07/18] added support to read directory into map --- .../edu/umd/cloud9/io/SequenceFileUtils.java | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java b/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java index 5552a04dc..a9464d720 100644 --- a/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java +++ b/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java @@ -100,6 +100,7 @@ public static List SortedMap readFileIntoMap(Path path) { FileSystem fs; try { @@ -175,6 +176,43 @@ public static List SortedMap readDirectoryIntoMap(Path path) { + FileSystem fs; + try { + fs = FileSystem.get(new Configuration()); + } catch (IOException e) { + throw new RuntimeException("Unable to access the file system!"); + } + + return readDirectoryIntoMap(path, fs, Integer.MAX_VALUE); + } + + public static SortedMap readDirectoryIntoMap(Path path, int max) { + FileSystem fs; + try { + fs = FileSystem.get(new Configuration()); + } catch (IOException e) { + throw new RuntimeException("Unable to access the file system!"); + } + + return readDirectoryIntoMap(path, fs, max); + } + + public static SortedMap readDirectoryIntoMap(Path path, FileSystem fs) { + return readDirectoryIntoMap(path, fs, Integer.MAX_VALUE); + } + + public static SortedMap readDirectoryIntoMap(Path path, FileSystem fs, int max) { + SortedMap map = new TreeMap(); + + for ( PairOfWritables pair : SequenceFileUtils.readDirectory(path, fs, max)) { + map.put(pair.getLeftElement(), pair.getRightElement()); + } + return map; + } + + public static List readKeys(Path path) { FileSystem fs; try { From 052585285fc7d0581a52334a3a6e4d9522137cae Mon Sep 17 00:00:00 2001 From: "Earl J. 
Wagner" Date: Fri, 28 Oct 2011 15:51:40 -0400 Subject: [PATCH 08/18] changes to print statements --- .../cloud9/collection/aquaint2/Aquaint2DocnoMapping.java | 6 ++++-- .../umd/cloud9/collection/aquaint2/Aquaint2Document.java | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java index d5b9d6627..005136fe5 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java @@ -37,8 +37,8 @@ public class Aquaint2DocnoMapping implements DocnoMapping { private static final Logger LOG = Logger.getLogger(Aquaint2DocnoMapping.class); - // { LOG.setLevel (Level.TRACE); } { LOG.setLevel (Level.INFO); } + //{ LOG.setLevel (Level.TRACE); } private String[] docidEntries; @@ -123,7 +123,9 @@ public String getDocid(int docno) { articleNo++; } LOG.debug("found articleNo: " + articleNo + ", looking at: " + String.format("%s_%04d%02d%02d.%04d", source, year, month, day, articleNo)); - return String.format ("%s_%04d%02d%02d.%04d", source, year, month, day, articleNo); + String result = String.format ("%s_%04d%02d%02d.%04d", source, year, month, day, articleNo); + LOG.trace("getDocid returning: " + result); + return result; } diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java index ea7c3089f..0b85cc523 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java @@ -84,9 +84,9 @@ public String getHeadline() { } catch (Exception e) { LOG.error("exception: " + e); LOG.error("docid: " + getDocid () + ", start: " + start + ", end: " + end); - LOG.error(raw); + LOG.error("raw:\n" + raw); headline = raw.substring(start + 10).trim(); - LOG.error("headline should 
be: " + headline); + LOG.error("updated headline: " + headline); } headline = TAGS_PATTERN.matcher(headline).replaceAll(""); headline = WHITESPACE_PATTERN.matcher(headline).replaceAll(" "); @@ -122,5 +122,6 @@ public static void readDocument(Aquaint2Document doc, String s) { doc.docid = null; doc.headline = null; doc.text = null; + LOG.debug("docid: " + doc.getDocid() + " length: " + doc.raw.length()); } } From a325b1ae9a81fbe41006d1c16025acdba3ff3fb1 Mon Sep 17 00:00:00 2001 From: "Earl J. Wagner" Date: Wed, 9 Nov 2011 23:02:20 -0500 Subject: [PATCH 09/18] reformatted, added check to ignore empty files in dir (e.g. _SUCCESS) --- .../edu/umd/cloud9/io/ReadSequenceFile.java | 164 +++++++++--------- 1 file changed, 83 insertions(+), 81 deletions(-) diff --git a/src/dist/edu/umd/cloud9/io/ReadSequenceFile.java b/src/dist/edu/umd/cloud9/io/ReadSequenceFile.java index 6c61c6074..0a3b4337b 100644 --- a/src/dist/edu/umd/cloud9/io/ReadSequenceFile.java +++ b/src/dist/edu/umd/cloud9/io/ReadSequenceFile.java @@ -32,7 +32,7 @@ * in the of a directory, the value specifies the number of key-value pairs to * read per file. *

- * + * *
  * args: [path] [max-num-of-records] (local)
  * 
@@ -44,84 +44,86 @@ */ public class ReadSequenceFile { - private ReadSequenceFile() {} - - public static void main(String[] args) throws IOException { - if (args.length < 1) { - System.out.println("args: [path] [max-num-of-records-per-file]"); - System.exit(-1); - } - - String f = args[0]; - - int max = Integer.MAX_VALUE; - if (args.length >= 2) { - max = Integer.parseInt(args[1]); - } - - boolean useLocal = args.length >= 3 && args[2].equals("local") ? true : false; - - if (useLocal) { - System.out.println("Reading from local filesystem"); - } - - FileSystem fs = useLocal? FileSystem.getLocal(new Configuration()) : FileSystem.get(new Configuration()); - Path p = new Path(f); - - if (fs.getFileStatus(p).isDir()) { - readSequenceFilesInDir(p, fs, max); - } else { - readSequenceFile(p, fs, max); - } - } - - private static int readSequenceFile(Path path, FileSystem fs, int max) throws IOException { - SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf()); - - System.out.println("Reading " + path + "...\n"); - try { - System.out.println("Key type: " + reader.getKeyClass().toString()); - System.out.println("Value type: " + reader.getValueClass().toString() + "\n"); - } catch (Exception e) { - throw new RuntimeException("Error: loading key/value class"); - } - - Writable key, value; - int n = 0; - try { - key = (Writable) reader.getKeyClass().newInstance(); - value = (Writable) reader.getValueClass().newInstance(); - - while (reader.next(key, value)) { - System.out.println("Record " + n); - System.out.println("Key: " + key + "\nValue: " + value); - System.out.println("----------------------------------------"); - n++; - - if (n >= max) - break; - } - reader.close(); - System.out.println(n + " records read.\n"); - } catch (Exception e) { - e.printStackTrace(); - } - - return n; - } - - private static int readSequenceFilesInDir(Path path, FileSystem fs, int max) { - int n = 0; - try { - FileStatus[] stat = fs.listStatus(path); - for (int i = 0; i < 
stat.length; ++i) { - n += readSequenceFile(stat[i].getPath(), fs ,max); - } - } catch (IOException e) { - e.printStackTrace(); - } - - System.out.println(n + " records read in total."); - return n; - } + private ReadSequenceFile() {} + + public static void main(String[] args) throws IOException { + if (args.length < 1) { + System.out.println("args: [path] [max-num-of-records-per-file]"); + System.exit(-1); + } + + String f = args[0]; + + int max = Integer.MAX_VALUE; + if (args.length >= 2) { + max = Integer.parseInt(args[1]); + } + + boolean useLocal = args.length >= 3 && args[2].equals("local") ? true : false; + + if (useLocal) { + System.out.println("Reading from local filesystem"); + } + + FileSystem fs = useLocal? FileSystem.getLocal(new Configuration()) : FileSystem.get(new Configuration()); + Path p = new Path(f); + + if (fs.getFileStatus(p).isDir()) { + readSequenceFilesInDir(p, fs, max); + } else { + readSequenceFile(p, fs, max); + } + } + + private static int readSequenceFile(Path path, FileSystem fs, int max) throws IOException { + SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf()); + + System.out.println("Reading " + path + "...\n"); + try { + System.out.println("Key type: " + reader.getKeyClass().toString()); + System.out.println("Value type: " + reader.getValueClass().toString() + "\n"); + } catch (Exception e) { + throw new RuntimeException("Error: loading key/value class"); + } + + Writable key, value; + int n = 0; + try { + key = (Writable) reader.getKeyClass().newInstance(); + value = (Writable) reader.getValueClass().newInstance(); + + while (reader.next(key, value)) { + System.out.println("Record " + n); + System.out.println("Key: " + key + "\nValue: " + value); + System.out.println("----------------------------------------"); + n++; + + if (n >= max) + break; + } + reader.close(); + System.out.println(n + " records read.\n"); + } catch (Exception e) { + e.printStackTrace(); + } + + return n; + } + + private static int 
readSequenceFilesInDir(Path path, FileSystem fs, int max) { + int n = 0; + try { + FileStatus[] stat = fs.listStatus(path); + for (int i = 0; i < stat.length; ++i) { + if (stat[i].getLen() > 0) { + n += readSequenceFile(stat[i].getPath(), fs ,max); + } + } + } catch (IOException e) { + e.printStackTrace(); + } + + System.out.println(n + " records read in total."); + return n; + } } From a4c713666500e9e813e4291da45442fdd8e3c3e2 Mon Sep 17 00:00:00 2001 From: "Earl J. Wagner" Date: Wed, 9 Nov 2011 23:03:18 -0500 Subject: [PATCH 10/18] minor changes to logging --- .../collection/aquaint2/Aquaint2DocnoMapping.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java index 005136fe5..098b893e4 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java @@ -37,8 +37,10 @@ public class Aquaint2DocnoMapping implements DocnoMapping { private static final Logger LOG = Logger.getLogger(Aquaint2DocnoMapping.class); - { LOG.setLevel (Level.INFO); } - //{ LOG.setLevel (Level.TRACE); } + { + LOG.setLevel(Level.INFO); + //LOG.setLevel(Level.TRACE); + } private String[] docidEntries; @@ -228,7 +230,7 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws prevArticleNo = articleNo; cnt++; - if (cnt % 100000 == 0) { + if (cnt % (200 * 1000) == 0) { LOG.info(cnt + " docs"); } } @@ -246,7 +248,7 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws for (int i = 0; i < list.size(); i++) { out.writeUTF(list.get(i)); numEntries++; - if (numEntries % 10000 == 0) { + if (numEntries % (10 * 1000) == 0) { LOG.info(numEntries + " months of docs"); } } From ad21d35cc4b8361ed8999e1c9868ea255660dd33 Mon Sep 17 00:00:00 2001 From: "Earl J. 
Wagner" Date: Sun, 13 Nov 2011 23:02:04 -0500 Subject: [PATCH 11/18] formatting changes --- .../aquaint2/NumberAquaint2Documents2.java | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java b/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java index d07643ccc..219e7f559 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java @@ -35,6 +35,7 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.log4j.Logger; + public class NumberAquaint2Documents2 extends Configured implements Tool { private static final Logger LOG = Logger.getLogger(NumberAquaint2Documents2.class); private static enum Count { DOCS }; @@ -83,22 +84,23 @@ public int run(String[] args) throws Exception { return -1; } - String inputPath = args[0]; - String outputPath = args[1]; - String outputFile = args[2]; + Path inputDirPath = new Path(args[0]); + String outputDirPathname = args[1]; + Path outputDirPath = new Path(outputDirPathname); + Path outputFilePath = new Path(args[2]); LOG.info("Tool: " + NumberAquaint2Documents2.class.getCanonicalName()); - LOG.info(" - Input path: " + inputPath); - LOG.info(" - Output path: " + outputPath); - LOG.info(" - Output file: " + outputFile); + LOG.info(" - Input dir path: " + inputDirPath); + LOG.info(" - Output dir path: " + outputDirPath); + LOG.info(" - Output file path: " + outputFilePath); Job job = new Job(getConf(), NumberAquaint2Documents2.class.getSimpleName()); job.setJarByClass(NumberAquaint2Documents2.class); job.setNumReduceTasks(1); - FileInputFormat.setInputPaths(job, new Path(inputPath)); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); + FileInputFormat.setInputPaths(job, inputDirPath); + FileOutputFormat.setOutputPath(job, outputDirPath); FileOutputFormat.setCompressOutput(job, false); 
job.setInputFormatClass(Aquaint2DocumentInputFormat2.class); @@ -110,13 +112,15 @@ public int run(String[] args) throws Exception { job.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. - FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true); + FileSystem.get(job.getConfiguration()).delete(outputDirPath, true); job.waitForCompletion(true); - String input = outputPath + (outputPath.endsWith("/") ? "" : "/") + "/part-r-00000"; - Aquaint2DocnoMapping.writeDocnoData(new Path(input), new Path(outputFile), - FileSystem.get(getConf())); + Path inputFilePath = new Path(outputDirPathname + + (outputDirPathname.endsWith("/") ? "" : "/") + + "/part-r-00000"); + Aquaint2DocnoMapping.writeDocnoData(inputFilePath, outputFilePath, + FileSystem.get(getConf())); return 0; } From b86e608530c845a8479aecbaa1a0f757357ab777 Mon Sep 17 00:00:00 2001 From: "Earl J. Wagner" Date: Tue, 15 Nov 2011 18:41:11 -0500 Subject: [PATCH 12/18] added support for original Aquaint document format --- .../aquaint2/Aquaint2DocnoMapping.java | 37 +++++++------- .../collection/aquaint2/Aquaint2Document.java | 48 +++++++++++++++---- 2 files changed, 61 insertions(+), 24 deletions(-) diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java index 098b893e4..a866efb15 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java @@ -48,11 +48,13 @@ public class Aquaint2DocnoMapping implements DocnoMapping { public int getDocno(String docid) { LOG.trace("getDocno(docid: " + docid + ")"); Preconditions.checkNotNull(docid); - String source = docid.substring(0, 7); - int year = Integer.parseInt(docid.substring (8, 12)); - int month = Integer.parseInt(docid.substring (12, 14)); - int day = Integer.parseInt(docid.substring (14, 16)); - int articleNo = 
Integer.parseInt(docid.substring (17, 21)); + int sourceLength = docid.length() - 13; + String source = docid.substring(0, sourceLength); + int year = Integer.parseInt(docid.substring (sourceLength, sourceLength + 4)); + int month = Integer.parseInt(docid.substring (sourceLength + 4, sourceLength + 6)); + int day = Integer.parseInt(docid.substring (sourceLength + 6, sourceLength + 8)); + int articleNo = Integer.parseInt(docid.substring (sourceLength + 9, sourceLength + 13)); + // first traverse the entries to find the month entry and get its days int entryId = findEntryId(source, year, month); @@ -109,14 +111,14 @@ public String getDocid(int docno) { String source = entryMetaInfo[1]; int year = Integer.parseInt(entryMetaInfo[2]); int month = Integer.parseInt(entryMetaInfo[3]); - LOG.debug("looking at: " + String.format("%s_%04d%02d__.____", source, year, month)); + LOG.debug("looking at: " + String.format("%s%04d%02d__.____", source, year, month)); // then traverse the days to find the day and skip over missing articles to get the article number String[] entryEltParts = findEntryEltParts (docno, entryElts); int offset = Integer.parseInt(entryEltParts[0]); String[] entryDayParts = entryEltParts[1].split(","); int day = Integer.parseInt(entryDayParts[0]); - LOG.debug("found day: " + day + ", looking at: " + String.format("%s_%04d%02d%02d.____", source, year, month, day)); + LOG.debug("found day: " + day + ", looking at: " + String.format("%s%04d%02d%02d.____", source, year, month, day)); int articleNo = docno - offset; for (int i = 1; i < entryDayParts.length; i++) { int missingNo = Integer.parseInt(entryDayParts[i]); @@ -124,8 +126,8 @@ public String getDocid(int docno) { LOG.debug("skipping missingNo: " + missingNo); articleNo++; } - LOG.debug("found articleNo: " + articleNo + ", looking at: " + String.format("%s_%04d%02d%02d.%04d", source, year, month, day, articleNo)); - String result = String.format ("%s_%04d%02d%02d.%04d", source, year, month, day, articleNo); 
+ LOG.debug("found articleNo: " + articleNo + ", looking at: " + String.format("%s%04d%02d%02d.%04d", source, year, month, day, articleNo)); + String result = String.format ("%s%04d%02d%02d.%04d", source, year, month, day, articleNo); LOG.trace("getDocid returning: " + result); return result; } @@ -166,6 +168,7 @@ public void loadMapping(Path p, FileSystem fs) throws IOException { docidEntries = Aquaint2DocnoMapping.readDocnoData(p, fs); } + static public void writeDocnoData(Path input, Path output, FileSystem fs) throws IOException { LOG.info("Writing docno data to " + output); LineReader reader = new LineReader(fs.open(input)); @@ -186,11 +189,12 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws while (reader.readLine(line) > 0) { String docid = line.toString(); - String source = docid.substring(0, 7); - int year = Integer.parseInt(docid.substring (8, 12)); - int month = Integer.parseInt(docid.substring (12, 14)); - int day = Integer.parseInt(docid.substring (14, 16)); - int articleNo = Integer.parseInt(docid.substring (17, 21)); + int sourceLength = docid.indexOf("\t") - 13; + String source = docid.substring(0, sourceLength); + int year = Integer.parseInt(docid.substring (sourceLength, sourceLength + 4)); + int month = Integer.parseInt(docid.substring (sourceLength + 4, sourceLength + 6)); + int day = Integer.parseInt(docid.substring (sourceLength + 6, sourceLength + 8)); + int articleNo = Integer.parseInt(docid.substring (sourceLength + 9, sourceLength + 13)); LOG.debug("prevSource: " + prevSource + ", prevYear: " + prevYear + ", prevMonth: " + prevMonth + ", prevDay: " + prevDay + ", prevArticleNo: " + prevArticleNo); LOG.debug("source: " + source + ", year: " + year + ", month: " + month + ", day: " + day + ", articleNo: " + articleNo); @@ -297,9 +301,10 @@ public static void main(String[] args) throws IOException { Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); - System.out.println("loading 
mapping file " + args[1]); + Path mappingPath = new Path(args[1]); + System.out.println("loading mapping file " + mappingPath); Aquaint2DocnoMapping mapping = new Aquaint2DocnoMapping(); - mapping.loadMapping(new Path(args[1]), fs); + mapping.loadMapping(mappingPath, fs); if (args[0].equals("list")) { for (int i = 1; i < mapping.docidEntries.length; i++) { diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java index 0b85cc523..cdeec69eb 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java @@ -30,14 +30,18 @@ public class Aquaint2Document extends Indexable { private static final Logger LOG = Logger.getLogger(Aquaint2Document.class); - { LOG.setLevel (Level.INFO); } + { + LOG.setLevel(Level.INFO); + //LOG.setLevel(Level.TRACE); + } private static Pattern TAGS_PATTERN = Pattern.compile("<[^>]+>"); private static Pattern WHITESPACE_PATTERN = Pattern.compile("\t|\n"); - public static final String XML_START_TAG = ""); + if (start == -1) { + docid = ""; + } else { + int end = raw.indexOf(""); + docid = raw.substring(start + 7, end).trim(); + } + LOG.trace("in setAquaintDocid, docid: " + docid); + } + + + private void setAquaint2Docid() { + int start = 9; + int end = raw.indexOf("\"", start); + docid = raw.substring(start, end).trim(); + LOG.trace("in setAquaint2Docid, docid: " + docid); + } + + public String getHeadline() { if (headline == null) { int start = raw.indexOf(""); - if (start == -1) { headline = ""; } else { @@ -95,6 +123,7 @@ public String getHeadline() { return headline; } + @Override public String getContent() { if (text == null) { @@ -109,10 +138,10 @@ public String getContent() { text = TAGS_PATTERN.matcher(text).replaceAll(""); } } - return text; } + public static void readDocument(Aquaint2Document doc, String s) { if (s == null) { throw new RuntimeException("Error, 
can't read null string!"); @@ -122,6 +151,9 @@ public static void readDocument(Aquaint2Document doc, String s) { doc.docid = null; doc.headline = null; doc.text = null; + //doc.isAquaint2 = (doc.raw.indexOf("\n") == -1); + doc.isAquaint2 = (doc.raw.indexOf("") == -1); + LOG.debug("docid: " + doc.getDocid() + " length: " + doc.raw.length()); } } From 6484c858779c20e20579a352d78213576d30909f Mon Sep 17 00:00:00 2001 From: "Earl J. Wagner" Date: Thu, 17 Nov 2011 15:08:27 -0500 Subject: [PATCH 13/18] extended Aquaint2 code to support original Aquaint corpus --- .../aquaint2/Aquaint2DocnoMapping.java | 40 ++++---- .../collection/aquaint2/Aquaint2Document.java | 95 +++++++++++++------ .../aquaint2/Aquaint2DocumentInputFormat.java | 3 - .../Aquaint2DocumentInputFormat2.java | 7 +- .../aquaint2/NumberAquaint2Documents2.java | 17 +++- 5 files changed, 108 insertions(+), 54 deletions(-) diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java index a866efb15..f146b5210 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java @@ -55,27 +55,28 @@ public int getDocno(String docid) { int day = Integer.parseInt(docid.substring (sourceLength + 6, sourceLength + 8)); int articleNo = Integer.parseInt(docid.substring (sourceLength + 9, sourceLength + 13)); + LOG.debug("source: " + source + ", year: " + year + ", month: " + month + ", day: " + day + ", articleNo: " + articleNo); // first traverse the entries to find the month entry and get its days int entryId = findEntryId(source, year, month); LOG.debug("entryId: " + entryId); - String entryElt = docidEntries[entryId].split("\t")[day]; - LOG.debug("entryElt: " + entryElt); - - // then traverse the days to find the day and skip over missing articles to get the article number - String[] entryEltParts = entryElt.split(" "); - int result = 
articleNo + Integer.parseInt(entryEltParts[0]); - String[] entryDayParts = entryEltParts[1].split(","); - for (int i = 1; i < entryDayParts.length; i++) { - int missingNo = Integer.parseInt(entryDayParts[i]); - if (articleNo < missingNo) break; - LOG.debug("skipping missingNo: " + missingNo); - result--; - } + String entryElt = docidEntries[entryId].split("\t")[day]; + LOG.debug("entryElt: " + entryElt); + + // then traverse the days to find the day and skip over missing articles to get the article number + String[] entryEltParts = entryElt.split(" "); + int result = articleNo + Integer.parseInt(entryEltParts[0]); + String[] entryDayParts = entryEltParts[1].split(","); + for (int i = 1; i < entryDayParts.length; i++) { + int missingNo = Integer.parseInt(entryDayParts[i]); + if (articleNo < missingNo) break; + LOG.debug("skipping missingNo: " + missingNo); + result--; + } - LOG.trace("getDocno returning: " + result); - return result; + LOG.trace("getDocno returning: " + result); + return result; } private int findEntryId(String source, int year, int month) { @@ -170,6 +171,7 @@ public void loadMapping(Path p, FileSystem fs) throws IOException { static public void writeDocnoData(Path input, Path output, FileSystem fs) throws IOException { + //LOG.setLevel(Level.TRACE); LOG.info("Writing docno data to " + output); LineReader reader = new LineReader(fs.open(input)); List list = Lists.newArrayList(); @@ -189,6 +191,7 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws while (reader.readLine(line) > 0) { String docid = line.toString(); + LOG.debug("reading line docid: " + docid); int sourceLength = docid.indexOf("\t") - 13; String source = docid.substring(0, sourceLength); int year = Integer.parseInt(docid.substring (sourceLength, sourceLength + 4)); @@ -201,7 +204,7 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws if (! 
source.equals(prevSource) || year != prevYear || month != prevMonth) { - LOG.debug("currentEntry: " + currentEntry); + LOG.debug("diff source, year or month, currentEntry: " + currentEntry); if (currentEntry != null) { list.add(currentEntry.toString()); list.add(""); @@ -210,16 +213,19 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws currentEntry = new StringBuilder(cnt + " " + source + " " + year + " " + month); prevDay = 0; prevArticleNo = 0; + LOG.debug("diff source, year or month, reset currentEntry: " + currentEntry); } if (day != prevDay) { for (int i = prevDay + 1; i <= day; i++) { currentEntry.append("\t" + cnt + " " + i); } + LOG.debug("diff day, currentEntry: " + currentEntry); prevArticleNo = 0; // writeUTF can't write a string longer than 64k, so we output a chunk at a time // here then concatenate strings between s list.add(currentEntry.toString()); currentEntry = new StringBuilder (); + LOG.debug("diff day, reset currentEntry"); } if (articleNo != prevArticleNo + 1) { // we have missing article numbers - gather them @@ -238,6 +244,7 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws LOG.info(cnt + " docs"); } } + LOG.debug("adding final currentEntry: " + currentEntry); list.add(currentEntry.toString()); list.add(""); numEntries++; @@ -261,6 +268,7 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws } static public String[] readDocnoData(Path p, FileSystem fs) throws IOException { + //LOG.setLevel(Level.TRACE); LOG.trace("readDocnoData (p: " + p + ", fs)"); FSDataInputStream in = fs.open(p); diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java index cdeec69eb..79f7ff8ee 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java @@ -21,7 +21,11 @@ import 
java.io.IOException; import java.util.regex.Pattern; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableUtils; +import org.apache.hadoop.util.LineReader; import org.apache.log4j.Level; import org.apache.log4j.Logger; @@ -38,16 +42,43 @@ public class Aquaint2Document extends Indexable { private static Pattern TAGS_PATTERN = Pattern.compile("<[^>]+>"); private static Pattern WHITESPACE_PATTERN = Pattern.compile("\t|\n"); - public static final String XML_START_TAG = ""); + } catch (IOException e) { + e.printStackTrace(); + } + //LOG.info("in getXmlStartTag, isAquaint2: " + isAquaint2); + if (isAquaint2) { + return AQUAINT2_XML_START_TAG; + } else { + return AQUAINT_XML_START_TAG; + } + } + + + public static String getXmlEndTag() { + return XML_END_TAG; + } @Override @@ -67,10 +98,32 @@ public void readFields(DataInput in) throws IOException { } + public String getElementText(String elementTagName) { + String result = ""; + int index = raw.indexOf("<" + elementTagName + ">"); + if (index != -1) { + int start = index + elementTagName.length() + 2; + int end = raw.indexOf(""); + try { + result = raw.substring(start, end).trim(); + } catch (Exception e) { + LOG.error("exception: " + e); + LOG.error("docid: " + getDocid () + ", index: " + index + ", start: " + start + ", end: " + end); + LOG.error("raw:\n" + raw); + result = raw.substring(start).trim(); + LOG.error("found element text: " + result); + } + result = TAGS_PATTERN.matcher(result).replaceAll(""); + result = WHITESPACE_PATTERN.matcher(result).replaceAll(" "); + } + return result; + } + + @Override public String getDocid() { if (docid == null) { - if (isAquaint2) { + if (isAquaint2Document) { setAquaint2Docid(); } else { setAquaintDocid(); @@ -81,18 +134,14 @@ public String getDocid() { private void setAquaintDocid() { - int start = raw.indexOf(""); - if (start == -1) { - docid = ""; - } else { - int end = 
raw.indexOf(""); - docid = raw.substring(start + 7, end).trim(); - } + LOG.trace("setAquaintDocid()"); + docid = getElementText("DOCNO"); LOG.trace("in setAquaintDocid, docid: " + docid); } private void setAquaint2Docid() { + LOG.trace("setAquaint2Docid()"); int start = 9; int end = raw.indexOf("\"", start); docid = raw.substring(start, end).trim(); @@ -102,22 +151,9 @@ private void setAquaint2Docid() { public String getHeadline() { if (headline == null) { - int start = raw.indexOf(""); - if (start == -1) { - headline = ""; - } else { - int end = raw.indexOf(""); - try { - headline = raw.substring(start + 10, end).trim(); - } catch (Exception e) { - LOG.error("exception: " + e); - LOG.error("docid: " + getDocid () + ", start: " + start + ", end: " + end); - LOG.error("raw:\n" + raw); - headline = raw.substring(start + 10).trim(); - LOG.error("updated headline: " + headline); - } - headline = TAGS_PATTERN.matcher(headline).replaceAll(""); - headline = WHITESPACE_PATTERN.matcher(headline).replaceAll(" "); + headline = getElementText("HEADLINE"); + if (! 
isAquaint2Document) { + headline = getElementText("SLUG").trim().toLowerCase() + ": " + headline; } } return headline; @@ -143,17 +179,18 @@ public String getContent() { public static void readDocument(Aquaint2Document doc, String s) { + LOG.trace("readDocument(doc, s), s: \n" + s); if (s == null) { throw new RuntimeException("Error, can't read null string!"); } doc.raw = s; + doc.isAquaint2Document = doc.raw.startsWith("\n") == -1); - doc.isAquaint2 = (doc.raw.indexOf("") == -1); LOG.debug("docid: " + doc.getDocid() + " length: " + doc.raw.length()); + LOG.trace("readDocument returning"); } } diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat.java index c2f9f4dd5..13be1b9b6 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat.java @@ -48,9 +48,6 @@ public static class Aquaint2DocumentRecordReader implements private final LongWritable offset = new LongWritable(); public Aquaint2DocumentRecordReader(FileSplit split, JobConf conf) throws IOException { - conf.set(XMLInputFormat.START_TAG_KEY, Aquaint2Document.XML_START_TAG); - conf.set(XMLInputFormat.END_TAG_KEY, Aquaint2Document.XML_END_TAG); - reader = new XMLRecordReader(split, conf); } diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat2.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat2.java index aacc4e4c1..c7ba65b65 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat2.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat2.java @@ -28,6 +28,9 @@ import edu.umd.cloud9.collection.XMLInputFormat; import edu.umd.cloud9.collection.XMLInputFormat2.XMLRecordReader; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; + public class Aquaint2DocumentInputFormat2 
extends IndexableFileInputFormat2 { @@ -45,10 +48,6 @@ public static class Aquaint2DocumentRecordReader extends @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { - Configuration conf = context.getConfiguration(); - conf.set(XMLInputFormat.START_TAG_KEY, Aquaint2Document.XML_START_TAG); - conf.set(XMLInputFormat.END_TAG_KEY, Aquaint2Document.XML_END_TAG); - reader.initialize(split, context); } diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java b/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java index d07643ccc..812433754 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java @@ -33,10 +33,16 @@ import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; +import org.apache.log4j.Level; import org.apache.log4j.Logger; + public class NumberAquaint2Documents2 extends Configured implements Tool { private static final Logger LOG = Logger.getLogger(NumberAquaint2Documents2.class); + { + LOG.setLevel(Level.INFO); + } + private static enum Count { DOCS }; private static class MyMapper extends Mapper { @@ -49,6 +55,8 @@ public void map(LongWritable key, Aquaint2Document doc, Context context) context.getCounter(Count.DOCS).increment(1); docid.set(doc.getDocid()); context.write(docid, one); + LOG.setLevel(Level.INFO); + LOG.trace("map output (" + docid + ", " + one + ")"); } } @@ -60,6 +68,8 @@ public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { context.write(key, cnt); cnt.set(cnt.get() + 1); + LOG.setLevel(Level.INFO); + LOG.trace("reduce output (" + key + ", " + cnt + ")"); } } @@ -92,7 +102,10 @@ public int run(String[] args) throws Exception { LOG.info(" - Output path: " + outputPath); LOG.info(" - 
Output file: " + outputFile); - Job job = new Job(getConf(), NumberAquaint2Documents2.class.getSimpleName()); + Configuration conf = getConf(); + FileSystem fs = FileSystem.get(conf); + + Job job = new Job(conf, NumberAquaint2Documents2.class.getSimpleName()); job.setJarByClass(NumberAquaint2Documents2.class); job.setNumReduceTasks(1); @@ -110,7 +123,7 @@ public int run(String[] args) throws Exception { job.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. - FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true); + fs.delete(new Path(outputPath), true); job.waitForCompletion(true); From 0963700ca49af11dcf71dbf7ad4ef8bfdcc9417a Mon Sep 17 00:00:00 2001 From: "Earl J. Wagner" Date: Thu, 17 Nov 2011 15:42:08 -0500 Subject: [PATCH 14/18] minor fix --- .../cloud9/collection/aquaint2/NumberAquaint2Documents2.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java b/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java index 5f9b753d5..737a59bd1 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java @@ -124,7 +124,7 @@ public int run(String[] args) throws Exception { job.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. - fs.delete(new Path(outputDirPath), true); + fs.delete(outputDirPath, true); job.waitForCompletion(true); From 9da236ed4fa9087f035558b5a0c02146a2c310b2 Mon Sep 17 00:00:00 2001 From: "Earl J. 
Wagner" Date: Tue, 22 Nov 2011 01:49:10 -0500 Subject: [PATCH 15/18] fix to allow multiple Aquaint2 DTDs but just one for Aquaint --- .../edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java index 79f7ff8ee..567d45d2d 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java @@ -63,7 +63,10 @@ public static String getXmlStartTag(FileSystem fs, String inputFile) { Text line = new Text(); reader.readLine(line); reader.readLine(line); - isAquaint2 = line.toString().endsWith("'a2_newswire_xml.dtd'>"); + // Aquaint: 'aquaint.dtd' + // Aquaint2: 'a2_newswire_xml.dtd' + // Gigaword: 'gigaword.dtd' + isAquaint2 = ! line.toString().endsWith("'aquaint.dtd'>"); } catch (IOException e) { e.printStackTrace(); } From ab4a525af86fe368fd41f38eef77bd9c9db4456d Mon Sep 17 00:00:00 2001 From: "Earl J. Wagner" Date: Fri, 2 Mar 2012 14:16:13 -0500 Subject: [PATCH 16/18] added sorting entries by ascending value --- src/dist/edu/umd/cloud9/util/map/HMapIF.java | 1949 +++++++++--------- 1 file changed, 1011 insertions(+), 938 deletions(-) diff --git a/src/dist/edu/umd/cloud9/util/map/HMapIF.java b/src/dist/edu/umd/cloud9/util/map/HMapIF.java index b7c449fc2..abdaba732 100644 --- a/src/dist/edu/umd/cloud9/util/map/HMapIF.java +++ b/src/dist/edu/umd/cloud9/util/map/HMapIF.java @@ -1,5 +1,5 @@ /* - * @(#)HashMap.java 1.73 07/03/13 + * @(#)HashMap.java 1.73 07/03/13 * * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. @@ -26,338 +26,338 @@ */ public class HMapIF implements MapIF, Cloneable, Serializable { - /** - * The default initial capacity - MUST be a power of two. 
- */ - static final int DEFAULT_INITIAL_CAPACITY = 1024; - - /** - * The maximum capacity, used if a higher value is implicitly specified by - * either of the constructors with arguments. MUST be a power of two <= 1<<30. - */ - static final int MAXIMUM_CAPACITY = 1 << 30; - - /** - * The load factor used when none specified in constructor. - */ - static final float DEFAULT_LOAD_FACTOR = 0.75f; - - /** - * The table, resized as necessary. Length MUST Always be a power of two. - */ - transient Entry[] table; - - /** - * The number of key-value mappings contained in this map. - */ - transient int size; - - /** - * The next size value at which to resize (capacity * load factor). - * - * @serial - */ - int threshold; - - /** - * The load factor for the hash table. - * - * @serial - */ - final float loadFactor; - - /** - * The number of times this HMapIF has been structurally modified Structural - * modifications are those that change the number of mappings in the HMapIF - * or otherwise modify its internal structure (e.g., rehash). This field is - * used to make iterators on Collection-views of the HMapIF fail-fast. (See - * ConcurrentModificationException). - */ - transient volatile int modCount; - - /** - * Constructs an empty HMapIF with the specified initial capacity - * and load factor. 
- * - * @param initialCapacity - * the initial capacity - * @param loadFactor - * the load factor - * @throws IllegalArgumentException - * if the initial capacity is negative or the load factor is - * nonpositive - */ - public HMapIF(int initialCapacity, float loadFactor) { - if (initialCapacity < 0) - throw new IllegalArgumentException("Illegal initial capacity: " + initialCapacity); - if (initialCapacity > MAXIMUM_CAPACITY) - initialCapacity = MAXIMUM_CAPACITY; - if (loadFactor <= 0 || Float.isNaN(loadFactor)) - throw new IllegalArgumentException("Illegal load factor: " + loadFactor); - - // Find a power of 2 >= initialCapacity - int capacity = 1; - while (capacity < initialCapacity) - capacity <<= 1; - - this.loadFactor = loadFactor; - threshold = (int) (capacity * loadFactor); - table = new Entry[capacity]; - init(); - } - - /** - * Constructs an empty HMapIF with the specified initial capacity - * and the default load factor (0.75). - * - * @param initialCapacity - * the initial capacity. - * @throws IllegalArgumentException - * if the initial capacity is negative. - */ - public HMapIF(int initialCapacity) { - this(initialCapacity, DEFAULT_LOAD_FACTOR); - } - - /** - * Constructs an empty HMapIF with the default initial capacity - * (1024) and the default load factor (0.75). - */ - public HMapIF() { - this.loadFactor = DEFAULT_LOAD_FACTOR; - threshold = (int) (DEFAULT_INITIAL_CAPACITY * DEFAULT_LOAD_FACTOR); - table = new Entry[DEFAULT_INITIAL_CAPACITY]; - init(); - } - - /** - * Constructs a new HMapIF with the same mappings as the - * specified MapIF. The HMapIF is created with default - * load factor (0.75) and an initial capacity sufficient to hold the - * mappings in the specified MapIF. 
- * - * @param m - * the map whose mappings are to be placed in this map - * @throws NullPointerException - * if the specified map is null - */ - public HMapIF(MapIF m) { - this(Math.max((int) (m.size() / DEFAULT_LOAD_FACTOR) + 1, DEFAULT_INITIAL_CAPACITY), - DEFAULT_LOAD_FACTOR); - putAllForCreate(m); - } - - // internal utilities - - /** - * Initialization hook for subclasses. This method is called in all - * constructors and pseudo-constructors (clone, readObject) after HMapIF has - * been initialized but before any entries have been inserted. (In the - * absence of this method, readObject would require explicit knowledge of - * subclasses.) - */ - void init() { - } - - /** - * Applies a supplemental hash function to a given hashCode, which defends - * against poor quality hash functions. This is critical because HMapIF uses - * power-of-two length hash tables, that otherwise encounter collisions for - * hashCodes that do not differ in lower bits. Note: Null keys always map to - * hash 0, thus index 0. - */ - static int hash(int h) { - // This function ensures that hashCodes that differ only by - // constant multiples at each bit position have a bounded - // number of collisions (approximately 8 at default load factor). - h ^= (h >>> 20) ^ (h >>> 12); - return h ^ (h >>> 7) ^ (h >>> 4); - } - - /** - * Returns index for hash code h. 
- */ - static int indexFor(int h, int length) { - return h & (length - 1); - } - - // doc copied from interface - public int size() { - return size; - } - - // doc copied from interface - public boolean isEmpty() { - return size == 0; - } - - // doc copied from interface - public float get(int key) { - int hash = hash(key); - for (Entry e = table[indexFor(hash, table.length)]; e != null; e = e.next) { - int k; - if (e.hash == hash && ((k = e.key) == key || key == k)) - return e.value; - } - - return DEFAULT_VALUE; - } - - // doc copied from interface - public boolean containsKey(int key) { - return getEntry(key) != null; - } - - /** - * Returns the entry associated with the specified key in the HMapIF. - * Returns null if the HMapIF contains no mapping for the key. - */ - final Entry getEntry(int key) { - int hash = hash(key); - for (Entry e = table[indexFor(hash, table.length)]; e != null; e = e.next) { - int k; - if (e.hash == hash && ((k = e.key) == key || key == k)) - return e; - } - return null; - } - - // doc copied from interface - public float put(int key, float value) { - int hash = hash(key); - int i = indexFor(hash, table.length); - for (Entry e = table[i]; e != null; e = e.next) { - int k; - if (e.hash == hash && ((k = e.key) == key || key == k)) { - float oldValue = e.value; - e.value = value; - e.recordAccess(this); - return oldValue; - } - } - - modCount++; - addEntry(hash, key, value, i); - return DEFAULT_VALUE; - } - - /** - * This method is used instead of put by constructors and pseudoconstructors - * (clone, readObject). It does not resize the table, check for - * comodification, etc. It calls createEntry rather than addEntry. - */ - private void putForCreate(int key, float value) { - int hash = hash(key); - int i = indexFor(hash, table.length); - - /** - * Look for preexisting entry for key. This will never happen for clone - * or deserialize. 
It will only happen for construction if the input Map - * is a sorted map whose ordering is inconsistent w/ equals. - */ - for (Entry e = table[i]; e != null; e = e.next) { - int k; - if (e.hash == hash && ((k = e.key) == key || key == k)) { - e.value = value; - return; - } - } - - createEntry(hash, key, value, i); - } - - private void putAllForCreate(MapIF m) { - for (Iterator i = m.entrySet().iterator(); i.hasNext();) { - MapIF.Entry e = i.next(); - putForCreate(e.getKey(), e.getValue()); - } - } - - /** - * Rehashes the contents of this map into a new array with a larger - * capacity. This method is called automatically when the number of keys in - * this map reaches its threshold. - * - * If current capacity is MAXIMUM_CAPACITY, this method does not resize the - * map, but sets threshold to Integer.MAX_VALUE. This has the effect of - * preventing future calls. - * - * @param newCapacity - * the new capacity, MUST be a power of two; must be greater than - * current capacity unless current capacity is MAXIMUM_CAPACITY - * (in which case value is irrelevant). - */ - void resize(int newCapacity) { - Entry[] oldTable = table; - int oldCapacity = oldTable.length; - if (oldCapacity == MAXIMUM_CAPACITY) { - threshold = Integer.MAX_VALUE; - return; - } - - Entry[] newTable = new Entry[newCapacity]; - transfer(newTable); - table = newTable; - threshold = (int) (newCapacity * loadFactor); - } - - /** - * Transfers all entries from current table to newTable. 
- */ - void transfer(Entry[] newTable) { - Entry[] src = table; - int newCapacity = newTable.length; - for (int j = 0; j < src.length; j++) { - Entry e = src[j]; - if (e != null) { - src[j] = null; - do { - Entry next = e.next; - int i = indexFor(e.hash, newCapacity); - e.next = newTable[i]; - newTable[i] = e; - e = next; - } while (e != null); - } - } - } - - // doc copied from interface - public void putAll(MapIF m) { - int numKeysToBeAdded = m.size(); - if (numKeysToBeAdded == 0) - return; - - /* - * Expand the map if the map if the number of mappings to be added is - * greater than or equal to threshold. This is conservative; the obvious - * condition is (m.size() + size) >= threshold, but this condition could - * result in a map with twice the appropriate capacity, if the keys to - * be added overlap with the keys already in this map. By using the - * conservative calculation, we subject ourself to at most one extra - * resize. - */ - if (numKeysToBeAdded > threshold) { - int targetCapacity = (int) (numKeysToBeAdded / loadFactor + 1); - if (targetCapacity > MAXIMUM_CAPACITY) - targetCapacity = MAXIMUM_CAPACITY; - int newCapacity = table.length; - while (newCapacity < targetCapacity) - newCapacity <<= 1; - if (newCapacity > table.length) - resize(newCapacity); - } - - for (Iterator i = m.entrySet().iterator(); i.hasNext();) { - MapIF.Entry e = i.next(); - put(e.getKey(), e.getValue()); - } - } + /** + * The default initial capacity - MUST be a power of two. + */ + static final int DEFAULT_INITIAL_CAPACITY = 1024; + + /** + * The maximum capacity, used if a higher value is implicitly specified by + * either of the constructors with arguments. MUST be a power of two <= 1<<30. + */ + static final int MAXIMUM_CAPACITY = 1 << 30; + + /** + * The load factor used when none specified in constructor. + */ + static final float DEFAULT_LOAD_FACTOR = 0.75f; + + /** + * The table, resized as necessary. Length MUST Always be a power of two. 
+ */ + transient Entry[] table; + + /** + * The number of key-value mappings contained in this map. + */ + transient int size; + + /** + * The next size value at which to resize (capacity * load factor). + * + * @serial + */ + int threshold; + + /** + * The load factor for the hash table. + * + * @serial + */ + final float loadFactor; + + /** + * The number of times this HMapIF has been structurally modified Structural + * modifications are those that change the number of mappings in the HMapIF + * or otherwise modify its internal structure (e.g., rehash). This field is + * used to make iterators on Collection-views of the HMapIF fail-fast. (See + * ConcurrentModificationException). + */ + transient volatile int modCount; + + /** + * Constructs an empty HMapIF with the specified initial capacity + * and load factor. + * + * @param initialCapacity + * the initial capacity + * @param loadFactor + * the load factor + * @throws IllegalArgumentException + * if the initial capacity is negative or the load factor is + * nonpositive + */ + public HMapIF(int initialCapacity, float loadFactor) { + if (initialCapacity < 0) + throw new IllegalArgumentException("Illegal initial capacity: " + initialCapacity); + if (initialCapacity > MAXIMUM_CAPACITY) + initialCapacity = MAXIMUM_CAPACITY; + if (loadFactor <= 0 || Float.isNaN(loadFactor)) + throw new IllegalArgumentException("Illegal load factor: " + loadFactor); + + // Find a power of 2 >= initialCapacity + int capacity = 1; + while (capacity < initialCapacity) + capacity <<= 1; + + this.loadFactor = loadFactor; + threshold = (int) (capacity * loadFactor); + table = new Entry[capacity]; + init(); + } + + /** + * Constructs an empty HMapIF with the specified initial capacity + * and the default load factor (0.75). + * + * @param initialCapacity + * the initial capacity. + * @throws IllegalArgumentException + * if the initial capacity is negative. 
+ */ + public HMapIF(int initialCapacity) { + this(initialCapacity, DEFAULT_LOAD_FACTOR); + } + + /** + * Constructs an empty HMapIF with the default initial capacity + * (1024) and the default load factor (0.75). + */ + public HMapIF() { + this.loadFactor = DEFAULT_LOAD_FACTOR; + threshold = (int) (DEFAULT_INITIAL_CAPACITY * DEFAULT_LOAD_FACTOR); + table = new Entry[DEFAULT_INITIAL_CAPACITY]; + init(); + } + + /** + * Constructs a new HMapIF with the same mappings as the + * specified MapIF. The HMapIF is created with default + * load factor (0.75) and an initial capacity sufficient to hold the + * mappings in the specified MapIF. + * + * @param m + * the map whose mappings are to be placed in this map + * @throws NullPointerException + * if the specified map is null + */ + public HMapIF(MapIF m) { + this(Math.max((int) (m.size() / DEFAULT_LOAD_FACTOR) + 1, DEFAULT_INITIAL_CAPACITY), + DEFAULT_LOAD_FACTOR); + putAllForCreate(m); + } + + // internal utilities + + /** + * Initialization hook for subclasses. This method is called in all + * constructors and pseudo-constructors (clone, readObject) after HMapIF has + * been initialized but before any entries have been inserted. (In the + * absence of this method, readObject would require explicit knowledge of + * subclasses.) + */ + void init() { + } + + /** + * Applies a supplemental hash function to a given hashCode, which defends + * against poor quality hash functions. This is critical because HMapIF uses + * power-of-two length hash tables, that otherwise encounter collisions for + * hashCodes that do not differ in lower bits. Note: Null keys always map to + * hash 0, thus index 0. + */ + static int hash(int h) { + // This function ensures that hashCodes that differ only by + // constant multiples at each bit position have a bounded + // number of collisions (approximately 8 at default load factor). + h ^= (h >>> 20) ^ (h >>> 12); + return h ^ (h >>> 7) ^ (h >>> 4); + } + + /** + * Returns index for hash code h. 
+ */ + static int indexFor(int h, int length) { + return h & (length - 1); + } + + // doc copied from interface + public int size() { + return size; + } + + // doc copied from interface + public boolean isEmpty() { + return size == 0; + } + + // doc copied from interface + public float get(int key) { + int hash = hash(key); + for (Entry e = table[indexFor(hash, table.length)]; e != null; e = e.next) { + int k; + if (e.hash == hash && ((k = e.key) == key || key == k)) + return e.value; + } + + return DEFAULT_VALUE; + } + + // doc copied from interface + public boolean containsKey(int key) { + return getEntry(key) != null; + } + + /** + * Returns the entry associated with the specified key in the HMapIF. + * Returns null if the HMapIF contains no mapping for the key. + */ + final Entry getEntry(int key) { + int hash = hash(key); + for (Entry e = table[indexFor(hash, table.length)]; e != null; e = e.next) { + int k; + if (e.hash == hash && ((k = e.key) == key || key == k)) + return e; + } + return null; + } + + // doc copied from interface + public float put(int key, float value) { + int hash = hash(key); + int i = indexFor(hash, table.length); + for (Entry e = table[i]; e != null; e = e.next) { + int k; + if (e.hash == hash && ((k = e.key) == key || key == k)) { + float oldValue = e.value; + e.value = value; + e.recordAccess(this); + return oldValue; + } + } + + modCount++; + addEntry(hash, key, value, i); + return DEFAULT_VALUE; + } + + /** + * This method is used instead of put by constructors and pseudoconstructors + * (clone, readObject). It does not resize the table, check for + * comodification, etc. It calls createEntry rather than addEntry. + */ + private void putForCreate(int key, float value) { + int hash = hash(key); + int i = indexFor(hash, table.length); + + /** + * Look for preexisting entry for key. This will never happen for clone + * or deserialize. 
It will only happen for construction if the input Map + * is a sorted map whose ordering is inconsistent w/ equals. + */ + for (Entry e = table[i]; e != null; e = e.next) { + int k; + if (e.hash == hash && ((k = e.key) == key || key == k)) { + e.value = value; + return; + } + } + + createEntry(hash, key, value, i); + } + + private void putAllForCreate(MapIF m) { + for (Iterator i = m.entrySet().iterator(); i.hasNext();) { + MapIF.Entry e = i.next(); + putForCreate(e.getKey(), e.getValue()); + } + } + + /** + * Rehashes the contents of this map into a new array with a larger + * capacity. This method is called automatically when the number of keys in + * this map reaches its threshold. + * + * If current capacity is MAXIMUM_CAPACITY, this method does not resize the + * map, but sets threshold to Integer.MAX_VALUE. This has the effect of + * preventing future calls. + * + * @param newCapacity + * the new capacity, MUST be a power of two; must be greater than + * current capacity unless current capacity is MAXIMUM_CAPACITY + * (in which case value is irrelevant). + */ + void resize(int newCapacity) { + Entry[] oldTable = table; + int oldCapacity = oldTable.length; + if (oldCapacity == MAXIMUM_CAPACITY) { + threshold = Integer.MAX_VALUE; + return; + } + + Entry[] newTable = new Entry[newCapacity]; + transfer(newTable); + table = newTable; + threshold = (int) (newCapacity * loadFactor); + } + + /** + * Transfers all entries from current table to newTable. 
+ */ + void transfer(Entry[] newTable) { + Entry[] src = table; + int newCapacity = newTable.length; + for (int j = 0; j < src.length; j++) { + Entry e = src[j]; + if (e != null) { + src[j] = null; + do { + Entry next = e.next; + int i = indexFor(e.hash, newCapacity); + e.next = newTable[i]; + newTable[i] = e; + e = next; + } while (e != null); + } + } + } + + // doc copied from interface + public void putAll(MapIF m) { + int numKeysToBeAdded = m.size(); + if (numKeysToBeAdded == 0) + return; + + /* + * Expand the map if the map if the number of mappings to be added is + * greater than or equal to threshold. This is conservative; the obvious + * condition is (m.size() + size) >= threshold, but this condition could + * result in a map with twice the appropriate capacity, if the keys to + * be added overlap with the keys already in this map. By using the + * conservative calculation, we subject ourself to at most one extra + * resize. + */ + if (numKeysToBeAdded > threshold) { + int targetCapacity = (int) (numKeysToBeAdded / loadFactor + 1); + if (targetCapacity > MAXIMUM_CAPACITY) + targetCapacity = MAXIMUM_CAPACITY; + int newCapacity = table.length; + while (newCapacity < targetCapacity) + newCapacity <<= 1; + if (newCapacity > table.length) + resize(newCapacity); + } + + for (Iterator i = m.entrySet().iterator(); i.hasNext();) { + MapIF.Entry e = i.next(); + put(e.getKey(), e.getValue()); + } + } /** * Increments the key by some value. If the key does not exist in the map, its value is * set to the parameter value. - * + * * @param key * key to increment * @param value @@ -371,613 +371,686 @@ public void increment(int key, float value) { } } - // doc copied from interface - public float remove(int key) { - Entry e = removeEntryForKey(key); - if (e != null) - return e.value; - - throw new NoSuchElementException(); - } - - /** - * Removes and returns the entry associated with the specified key in the - * HMapIF. 
Returns null if the HMapIF contains no mapping for this key. - */ - final Entry removeEntryForKey(int key) { - int hash = hash(key); - int i = indexFor(hash, table.length); - Entry prev = table[i]; - Entry e = prev; - - while (e != null) { - Entry next = e.next; - int k; - if (e.hash == hash && ((k = e.key) == key || key == k)) { - modCount++; - size--; - if (prev == e) - table[i] = next; - else - prev.next = next; - e.recordRemoval(this); - return e; - } - prev = e; - e = next; - } - - return e; - } - - /** - * Special version of remove for EntrySet. - */ - final Entry removeMapping(Object o) { - MapII.Entry entry = (MapII.Entry) o; - Object key = entry.getKey(); - int hash = (key == null) ? 0 : hash(key.hashCode()); - int i = indexFor(hash, table.length); - Entry prev = table[i]; - Entry e = prev; - - while (e != null) { - Entry next = e.next; - if (e.hash == hash && e.equals(entry)) { - modCount++; - size--; - if (prev == e) - table[i] = next; - else - prev.next = next; - e.recordRemoval(this); - return e; - } - prev = e; - e = next; - } - - return e; - } - - // doc copied from interface - public void clear() { - modCount++; - Entry[] tab = table; - for (int i = 0; i < tab.length; i++) - tab[i] = null; - size = 0; - } - - // doc copied from interface - public boolean containsValue(float value) { - Entry[] tab = table; - for (int i = 0; i < tab.length; i++) - for (Entry e = tab[i]; e != null; e = e.next) - if (value == e.value) - return true; - return false; - } - - /** - * Returns a shallow copy of this HMapIF instance: the keys and - * values themselves are not cloned. 
- * - * @return a shallow copy of this map - */ - public Object clone() { - HMapIF result = null; - try { - result = (HMapIF) super.clone(); - } catch (CloneNotSupportedException e) { - // assert false; - } - result.table = new Entry[table.length]; - result.entrySet = null; - result.modCount = 0; - result.size = 0; - result.init(); - result.putAllForCreate(this); - - return result; - } - - static class Entry implements MapIF.Entry { - final int key; - float value; - Entry next; - final int hash; - - /** - * Creates new entry. - */ - Entry(int h, int k, float v, Entry n) { - value = v; - next = n; - key = k; - hash = h; - } - - public final int getKey() { - return key; - } - - public final float getValue() { - return value; - } - - public final float setValue(float newValue) { - float oldValue = value; - value = newValue; - return oldValue; - } - - public final boolean equals(Object o) { - MapIF.Entry e = (MapIF.Entry) o; - int k1 = getKey(); - int k2 = e.getKey(); - if (k1 == k2) { - float v1 = getValue(); - float v2 = e.getValue(); - if (v1 == v2) - return true; - } - return false; - } - - public final int hashCode() { - return (key) ^ ((int) value); - } - - public final String toString() { - return getKey() + "=" + getValue(); - } - - /** - * This method is invoked whenever the value in an entry is overwritten - * by an invocation of put(k,v) for a key k that's already in the - * HMapIF. - */ - void recordAccess(HMapIF m) { - } - - /** - * This method is invoked whenever the entry is removed from the table. - */ - void recordRemoval(HMapIF m) { - } - } - - /** - * Adds a new entry with the specified key, value and hash code to the - * specified bucket. It is the responsibility of this method to resize the - * table if appropriate. - * - * Subclass overrides this to alter the behavior of put method. 
- */ - void addEntry(int hash, int key, float value, int bucketIndex) { - Entry e = table[bucketIndex]; - table[bucketIndex] = new Entry(hash, key, value, e); - if (size++ >= threshold) - resize(2 * table.length); - } - - /** - * Like addEntry except that this version is used when creating entries as - * part of Map construction or "pseudo-construction" (cloning, - * deserialization). This version needn't worry about resizing the table. - * - * Subclass overrides this to alter the behavior of HMapIF(Map), clone, and - * readObject. - */ - void createEntry(int hash, int key, float value, int bucketIndex) { - Entry e = table[bucketIndex]; - table[bucketIndex] = new Entry(hash, key, value, e); - size++; - } - - private abstract class HashIterator implements Iterator { - Entry next; // next entry to return - int expectedModCount; // For fast-fail - int index; // current slot - Entry current; // current entry - - HashIterator() { - expectedModCount = modCount; - if (size > 0) { // advance to first entry - Entry[] t = table; - while (index < t.length && (next = t[index++]) == null) - ; - } - } - - public final boolean hasNext() { - return next != null; - } - - final Entry nextEntry() { - if (modCount != expectedModCount) - throw new ConcurrentModificationException(); - Entry e = next; - if (e == null) - throw new NoSuchElementException(); - - if ((next = e.next) == null) { - Entry[] t = table; - while (index < t.length && (next = t[index++]) == null) - ; - } - current = e; - return e; - } - - public void remove() { - if (current == null) - throw new IllegalStateException(); - if (modCount != expectedModCount) - throw new ConcurrentModificationException(); - int k = current.key; - current = null; - HMapIF.this.removeEntryForKey(k); - expectedModCount = modCount; - } - - } - - private final class ValueIterator extends HashIterator { - public Float next() { - return nextEntry().value; - } - } - - private final class KeyIterator extends HashIterator { - public Integer next() 
{ - return nextEntry().getKey(); - } - } - - private final class EntryIterator extends HashIterator { - public MapIF.Entry next() { - return nextEntry(); - } - } - - // Subclass overrides these to alter behavior of views' iterator() method - Iterator newKeyIterator() { - return new KeyIterator(); - } - - Iterator newValueIterator() { - return new ValueIterator(); - } - - Iterator newEntryIterator() { - return new EntryIterator(); - } - - // Views - - private transient Set entrySet = null; - - /** - * Each of these fields are initialized to contain an instance of the - * appropriate view the first time this view is requested. The views are - * stateless, so there's no reason to create more than one of each. - */ - transient volatile Set keySet = null; - transient volatile Collection values = null; - - // doc copied from interface - public Set keySet() { - Set ks = keySet; - return (ks != null ? ks : (keySet = new KeySet())); - } - - private final class KeySet extends AbstractSet { - @Override - public Iterator iterator() { - return newKeyIterator(); - } - - @Override - public int size() { - return size; - } - - @Override - public boolean contains(Object o) { - return containsKey((Integer) o); - } - } - - // doc copied from interface - public Collection values() { - Collection vs = values; - return (vs != null ? vs : (values = new Values())); - } - - private final class Values extends AbstractCollection { - @Override - public Iterator iterator() { - return newValueIterator(); - } - - @Override - public int size() { - return size; - } - - @Override - public boolean contains(Object o) { - return containsValue((Float) o); - } - } - - // doc copied from interface - public Set entrySet() { - return entrySet0(); - } - - private Set entrySet0() { - Set es = entrySet; - return es != null ? 
es : (entrySet = new EntrySet()); - } - - private final class EntrySet extends AbstractSet { - @Override - public Iterator iterator() { - return newEntryIterator(); - } - - @Override - public int size() { - return size; - } - - @Override - public boolean contains(Object o) { - MapIF.Entry e = (MapIF.Entry) o; - Entry candidate = getEntry(e.getKey()); - return candidate != null && candidate.equals(e); - } - } - - /** - * Save the state of the HMapIF instance to a stream (i.e., - * serialize it). - * - * @serialData The capacity of the HMapIF (the length of the bucket - * array) is emitted (int), followed by the size (an - * int, the number of key-value mappings), followed by the key - * (Object) and value (Object) for each key-value mapping. The - * key-value mappings are emitted in no particular order. - */ - private void writeObject(ObjectOutputStream s) throws IOException { - Iterator i = (size > 0) ? entrySet0().iterator() : null; - - // Write out the threshold, loadfactor, and any hidden stuff - s.defaultWriteObject(); - - // Write out number of buckets - s.writeInt(table.length); - - // Write out size (number of Mappings) - s.writeInt(size); - - // Write out keys and values (alternating) - if (i != null) { - while (i.hasNext()) { - MapIF.Entry e = i.next(); - s.writeInt(e.getKey()); - s.writeFloat(e.getValue()); - } - } - } - - private static final long serialVersionUID = 362498820763181265L; - - /** - * Reconstitute the HMapIF instance from a stream (i.e., - * deserialize it). - */ - private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException { - // Read in the threshold, loadfactor, and any hidden stuff - s.defaultReadObject(); - - // Read in number of buckets and allocate the bucket array; - int numBuckets = s.readInt(); - table = new Entry[numBuckets]; - - init(); // Give subclass a chance to do its thing. 
- - // Read in size (number of Mappings) - int size = s.readInt(); - - // Read the keys and values, and put the mappings in the HMapIF - for (int i = 0; i < size; i++) { - int key = s.readInt(); - float value = s.readFloat(); - putForCreate(key, value); - } - } - - // These methods are used when serializing HashSets - int capacity() { - return table.length; - } - - float loadFactor() { - return loadFactor; - } - - public String toString () { - return toString (-1); - } - - public String toString (int n) { - Iterator i = entrySet().iterator(); - if (!i.hasNext() || n == 0) - return "{}"; - - StringBuilder sb = new StringBuilder(); - sb.append('{'); - for (int m = 2; ; m++) { - MapIF.Entry e = i.next(); - int key = e.getKey(); - float value = e.getValue(); - //sb.append("(m: " + m + ", n: " + n + ")"); - sb.append(key); - sb.append('='); - sb.append(value); - if (! i.hasNext() || (m > n && n > 0)) { - if (i.hasNext()) { - sb.append (", ..."); - } - return sb.append('}').toString(); - } - sb.append(", "); - } - } - - // methods not part of a standard HashMap - - /** - * Adds values of keys from another map to this map. - * - * @param m - * the other map - */ - public void plus(MapIF m) { - for (MapIF.Entry e : m.entrySet()) { - int key = e.getKey(); - - if (this.containsKey(key)) { - this.put(key, this.get(key) + e.getValue()); - } else { - this.put(key, e.getValue()); - } - } - } - - /** - * Computes the dot product of this map with another map. - * - * @param m - * the other map - */ - public float dot(MapIF m) { - float s = 0.0f; - - for (MapIF.Entry e : m.entrySet()) { - int key = e.getKey(); - - if (this.containsKey(key)) { - s += this.get(key) * e.getValue(); - } - } - - return s; - } - - /** - * Returns the length of the vector represented by this map. 
- * - * @return length of the vector represented by this map - */ - public float length() { - float s = 0.0f; - - for (MapIF.Entry e : this.entrySet()) { - s += e.getValue() * e.getValue(); - } - - return (float) Math.sqrt(s); - } - - /** - * Normalizes values such that the vector represented by this map has unit - * length. - */ - public void normalize() { - float l = this.length(); - - for (int f : this.keySet()) { - this.put(f, this.get(f) / l); - } - - } - - /** - * Returns entries sorted by descending value. Ties broken by the key. - * - * @return entries sorted by descending value - */ - public MapIF.Entry[] getEntriesSortedByValue() { - if (this.size() == 0) - return null; - - // for storing the entries - MapIF.Entry[] entries = new Entry[this.size()]; - int i = 0; - Entry next = null; - - int index = 0; - // advance to first entry - while (index < table.length && (next = table[index++]) == null) - ; - - while (next != null) { - // current entry - Entry e = next; - - // advance to next entry - next = e.next; - if ((next = e.next) == null) { - while (index < table.length && (next = table[index++]) == null) - ; - } - - // add entry to array - entries[i++] = e; - } - - // sort the entries - Arrays.sort(entries, new Comparator() { - public int compare(MapIF.Entry e1, MapIF.Entry e2) { - if (e1.getValue() > e2.getValue()) { - return -1; - } else if (e1.getValue() < e2.getValue()) { - return 1; - } - - if (e1.getKey() == e2.getKey()) - return 0; - - return e1.getKey() > e2.getKey() ? 1 : -1; - } - }); - - return entries; - } - - /** - * Returns top n entries sorted by descending value. Ties broken by - * the key. 
- * - * @param n - * number of entries to return - * @return top n entries sorted by descending value - */ - public MapIF.Entry[] getEntriesSortedByValue(int n) { - MapIF.Entry[] entries = getEntriesSortedByValue(); - - if (entries == null) - return null; - - if (entries.length < n) - return entries; - - return Arrays.copyOfRange(entries, 0, n); - } - + // doc copied from interface + public float remove(int key) { + Entry e = removeEntryForKey(key); + if (e != null) + return e.value; + + throw new NoSuchElementException(); + } + + /** + * Removes and returns the entry associated with the specified key in the + * HMapIF. Returns null if the HMapIF contains no mapping for this key. + */ + final Entry removeEntryForKey(int key) { + int hash = hash(key); + int i = indexFor(hash, table.length); + Entry prev = table[i]; + Entry e = prev; + + while (e != null) { + Entry next = e.next; + int k; + if (e.hash == hash && ((k = e.key) == key || key == k)) { + modCount++; + size--; + if (prev == e) + table[i] = next; + else + prev.next = next; + e.recordRemoval(this); + return e; + } + prev = e; + e = next; + } + + return e; + } + + /** + * Special version of remove for EntrySet. + */ + final Entry removeMapping(Object o) { + MapII.Entry entry = (MapII.Entry) o; + Object key = entry.getKey(); + int hash = (key == null) ? 
0 : hash(key.hashCode()); + int i = indexFor(hash, table.length); + Entry prev = table[i]; + Entry e = prev; + + while (e != null) { + Entry next = e.next; + if (e.hash == hash && e.equals(entry)) { + modCount++; + size--; + if (prev == e) + table[i] = next; + else + prev.next = next; + e.recordRemoval(this); + return e; + } + prev = e; + e = next; + } + + return e; + } + + // doc copied from interface + public void clear() { + modCount++; + Entry[] tab = table; + for (int i = 0; i < tab.length; i++) + tab[i] = null; + size = 0; + } + + // doc copied from interface + public boolean containsValue(float value) { + Entry[] tab = table; + for (int i = 0; i < tab.length; i++) + for (Entry e = tab[i]; e != null; e = e.next) + if (value == e.value) + return true; + return false; + } + + /** + * Returns a shallow copy of this HMapIF instance: the keys and + * values themselves are not cloned. + * + * @return a shallow copy of this map + */ + public Object clone() { + HMapIF result = null; + try { + result = (HMapIF) super.clone(); + } catch (CloneNotSupportedException e) { + // assert false; + } + result.table = new Entry[table.length]; + result.entrySet = null; + result.modCount = 0; + result.size = 0; + result.init(); + result.putAllForCreate(this); + + return result; + } + + static class Entry implements MapIF.Entry { + final int key; + float value; + Entry next; + final int hash; + + /** + * Creates new entry. 
+ */ + Entry(int h, int k, float v, Entry n) { + value = v; + next = n; + key = k; + hash = h; + } + + public final int getKey() { + return key; + } + + public final float getValue() { + return value; + } + + public final float setValue(float newValue) { + float oldValue = value; + value = newValue; + return oldValue; + } + + public final boolean equals(Object o) { + MapIF.Entry e = (MapIF.Entry) o; + int k1 = getKey(); + int k2 = e.getKey(); + if (k1 == k2) { + float v1 = getValue(); + float v2 = e.getValue(); + if (v1 == v2) + return true; + } + return false; + } + + public final int hashCode() { + return (key) ^ ((int) value); + } + + public final String toString() { + return getKey() + "=" + getValue(); + } + + /** + * This method is invoked whenever the value in an entry is overwritten + * by an invocation of put(k,v) for a key k that's already in the + * HMapIF. + */ + void recordAccess(HMapIF m) { + } + + /** + * This method is invoked whenever the entry is removed from the table. + */ + void recordRemoval(HMapIF m) { + } + } + + /** + * Adds a new entry with the specified key, value and hash code to the + * specified bucket. It is the responsibility of this method to resize the + * table if appropriate. + * + * Subclass overrides this to alter the behavior of put method. + */ + void addEntry(int hash, int key, float value, int bucketIndex) { + Entry e = table[bucketIndex]; + table[bucketIndex] = new Entry(hash, key, value, e); + if (size++ >= threshold) + resize(2 * table.length); + } + + /** + * Like addEntry except that this version is used when creating entries as + * part of Map construction or "pseudo-construction" (cloning, + * deserialization). This version needn't worry about resizing the table. + * + * Subclass overrides this to alter the behavior of HMapIF(Map), clone, and + * readObject. 
+ */ + void createEntry(int hash, int key, float value, int bucketIndex) { + Entry e = table[bucketIndex]; + table[bucketIndex] = new Entry(hash, key, value, e); + size++; + } + + private abstract class HashIterator implements Iterator { + Entry next; // next entry to return + int expectedModCount; // For fast-fail + int index; // current slot + Entry current; // current entry + + HashIterator() { + expectedModCount = modCount; + if (size > 0) { // advance to first entry + Entry[] t = table; + while (index < t.length && (next = t[index++]) == null) + ; + } + } + + public final boolean hasNext() { + return next != null; + } + + final Entry nextEntry() { + if (modCount != expectedModCount) + throw new ConcurrentModificationException(); + Entry e = next; + if (e == null) + throw new NoSuchElementException(); + + if ((next = e.next) == null) { + Entry[] t = table; + while (index < t.length && (next = t[index++]) == null) + ; + } + current = e; + return e; + } + + public void remove() { + if (current == null) + throw new IllegalStateException(); + if (modCount != expectedModCount) + throw new ConcurrentModificationException(); + int k = current.key; + current = null; + HMapIF.this.removeEntryForKey(k); + expectedModCount = modCount; + } + + } + + private final class ValueIterator extends HashIterator { + public Float next() { + return nextEntry().value; + } + } + + private final class KeyIterator extends HashIterator { + public Integer next() { + return nextEntry().getKey(); + } + } + + private final class EntryIterator extends HashIterator { + public MapIF.Entry next() { + return nextEntry(); + } + } + + // Subclass overrides these to alter behavior of views' iterator() method + Iterator newKeyIterator() { + return new KeyIterator(); + } + + Iterator newValueIterator() { + return new ValueIterator(); + } + + Iterator newEntryIterator() { + return new EntryIterator(); + } + + // Views + + private transient Set entrySet = null; + + /** + * Each of these fields are 
initialized to contain an instance of the + * appropriate view the first time this view is requested. The views are + * stateless, so there's no reason to create more than one of each. + */ + transient volatile Set keySet = null; + transient volatile Collection values = null; + + // doc copied from interface + public Set keySet() { + Set ks = keySet; + return (ks != null ? ks : (keySet = new KeySet())); + } + + private final class KeySet extends AbstractSet { + @Override + public Iterator iterator() { + return newKeyIterator(); + } + + @Override + public int size() { + return size; + } + + @Override + public boolean contains(Object o) { + return containsKey((Integer) o); + } + } + + // doc copied from interface + public Collection values() { + Collection vs = values; + return (vs != null ? vs : (values = new Values())); + } + + private final class Values extends AbstractCollection { + @Override + public Iterator iterator() { + return newValueIterator(); + } + + @Override + public int size() { + return size; + } + + @Override + public boolean contains(Object o) { + return containsValue((Float) o); + } + } + + // doc copied from interface + public Set entrySet() { + return entrySet0(); + } + + private Set entrySet0() { + Set es = entrySet; + return es != null ? es : (entrySet = new EntrySet()); + } + + private final class EntrySet extends AbstractSet { + @Override + public Iterator iterator() { + return newEntryIterator(); + } + + @Override + public int size() { + return size; + } + + @Override + public boolean contains(Object o) { + MapIF.Entry e = (MapIF.Entry) o; + Entry candidate = getEntry(e.getKey()); + return candidate != null && candidate.equals(e); + } + } + + /** + * Save the state of the HMapIF instance to a stream (i.e., + * serialize it). 
+ * + * @serialData The capacity of the HMapIF (the length of the bucket + * array) is emitted (int), followed by the size (an + * int, the number of key-value mappings), followed by the key + * (int) and value (float) for each key-value mapping. The + * key-value mappings are emitted in no particular order. + */ + private void writeObject(ObjectOutputStream s) throws IOException { + Iterator i = (size > 0) ? entrySet0().iterator() : null; + + // Write out the threshold, loadfactor, and any hidden stuff + s.defaultWriteObject(); + + // Write out number of buckets + s.writeInt(table.length); + + // Write out size (number of Mappings) + s.writeInt(size); + + // Write out keys and values (alternating) + if (i != null) { + while (i.hasNext()) { + MapIF.Entry e = i.next(); + s.writeInt(e.getKey()); + s.writeFloat(e.getValue()); + } + } + } + + private static final long serialVersionUID = 362498820763181265L; + + /** + * Reconstitute the HMapIF instance from a stream (i.e., + * deserialize it). + */ + private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException { + // Read in the threshold, loadfactor, and any hidden stuff + s.defaultReadObject(); + + // Read in number of buckets and allocate the bucket array; + int numBuckets = s.readInt(); + table = new Entry[numBuckets]; + + init(); // Give subclass a chance to do its thing.
+ + // Read in size (number of Mappings) + int size = s.readInt(); + + // Read the keys and values, and put the mappings in the HMapIF + for (int i = 0; i < size; i++) { + int key = s.readInt(); + float value = s.readFloat(); + putForCreate(key, value); + } + } + + // These methods are used when serializing HashSets + int capacity() { + return table.length; + } + + float loadFactor() { + return loadFactor; + } + + public String toString () { + return toString (-1); + } + + public String toString (int n) { + Iterator i = entrySet().iterator(); + if (!i.hasNext() || n == 0) + return "{}"; + + StringBuilder sb = new StringBuilder(); + sb.append('{'); + for (int m = 2; ; m++) { + MapIF.Entry e = i.next(); + int key = e.getKey(); + float value = e.getValue(); + //sb.append("(m: " + m + ", n: " + n + ")"); + sb.append(key); + sb.append('='); + sb.append(value); + if (! i.hasNext() || (m > n && n > 0)) { + if (i.hasNext()) { + sb.append (", ..."); + } + return sb.append('}').toString(); + } + sb.append(", "); + } + } + + // methods not part of a standard HashMap + + /** + * Adds values of keys from another map to this map. + * + * @param m + * the other map + */ + public void plus(MapIF m) { + for (MapIF.Entry e : m.entrySet()) { + int key = e.getKey(); + + if (this.containsKey(key)) { + this.put(key, this.get(key) + e.getValue()); + } else { + this.put(key, e.getValue()); + } + } + } + + /** + * Computes the dot product of this map with another map. + * + * @param m + * the other map + */ + public float dot(MapIF m) { + float s = 0.0f; + + for (MapIF.Entry e : m.entrySet()) { + int key = e.getKey(); + + if (this.containsKey(key)) { + s += this.get(key) * e.getValue(); + } + } + + return s; + } + + /** + * Returns the length of the vector represented by this map. 
+ * + * @return length of the vector represented by this map + */ + public float length() { + float s = 0.0f; + + for (MapIF.Entry e : this.entrySet()) { + s += e.getValue() * e.getValue(); + } + + return (float) Math.sqrt(s); + } + + /** + * Normalizes values such that the vector represented by this map has unit + * length. + */ + public void normalize() { + float l = this.length(); + + for (int f : this.keySet()) { + this.put(f, this.get(f) / l); + } + + } + + /** + * Returns entries sorted by descending value. Ties broken by the key. + * + * @return entries sorted by descending value + */ + public MapIF.Entry[] getEntriesSortedByValue() { + if (this.size() == 0) + return null; + + // for storing the entries + MapIF.Entry[] entries = new Entry[this.size()]; + int i = 0; + Entry next = null; + + int index = 0; + // advance to first entry + while (index < table.length && (next = table[index++]) == null) + ; + + while (next != null) { + // current entry + Entry e = next; + + // advance to next entry + next = e.next; + if ((next = e.next) == null) { + while (index < table.length && (next = table[index++]) == null) + ; + } + + // add entry to array + entries[i++] = e; + } + + // sort the entries + Arrays.sort(entries, new Comparator() { + public int compare(MapIF.Entry e1, MapIF.Entry e2) { + if (e1.getValue() > e2.getValue()) { + return -1; + } else if (e1.getValue() < e2.getValue()) { + return 1; + } + + if (e1.getKey() == e2.getKey()) + return 0; + + return e1.getKey() > e2.getKey() ? 1 : -1; + } + }); + + return entries; + } + + /** + * Returns top n entries sorted by descending value. Ties broken by + * the key. 
+ * + * @param n + * number of entries to return + * @return top n entries sorted by descending value + */ + public MapIF.Entry[] getEntriesSortedByValue(int n) { + MapIF.Entry[] entries = getEntriesSortedByValue(); + + if (entries == null) + return null; + + if (entries.length < n) + return entries; + + return Arrays.copyOfRange(entries, 0, n); + } + + + /** + * Returns entries sorted by ascending value. Ties broken by the key. + * + * @return entries sorted by ascending value + */ + public MapIF.Entry[] getEntriesSortedByAscendingValue() { + if (this.size() == 0) + return null; + + // for storing the entries + MapIF.Entry[] entries = new Entry[this.size()]; + int i = 0; + Entry next = null; + + int index = 0; + // advance to first entry + while (index < table.length && (next = table[index++]) == null) + ; + + while (next != null) { + // current entry + Entry e = next; + + // advance to next entry + next = e.next; + if ((next = e.next) == null) { + while (index < table.length && (next = table[index++]) == null) + ; + } + + // add entry to array + entries[i++] = e; + } + + // sort the entries + Arrays.sort(entries, new Comparator() { + public int compare(MapIF.Entry e1, MapIF.Entry e2) { + if (e1.getValue() > e2.getValue()) { + return 1; + } else if (e1.getValue() < e2.getValue()) { + return -1; + } + + if (e1.getKey() == e2.getKey()) + return 0; + + return e1.getKey() > e2.getKey() ? -1 : 1; + } + }); + + return entries; + } + + /** + * Returns the first n entries sorted by ascending value. Ties broken by + * the key. + * + * @param n + * number of entries to return + * @return first n entries sorted by ascending value + */ + public MapIF.Entry[] getEntriesSortedByAscendingValue(int n) { + MapIF.Entry[] entries = getEntriesSortedByAscendingValue(); + + if (entries == null) + return null; + + if (entries.length < n) + return entries; + + return Arrays.copyOfRange(entries, 0, n); + } } From 07b6a049e71ec705f09af28de15b1e1513bbfabd Mon Sep 17 00:00:00 2001 From: "Earl J.
Wagner" Date: Mon, 5 Mar 2012 14:06:53 -0500 Subject: [PATCH 17/18] refactored some variable names --- .../aquaint2/Aquaint2DocnoMapping.java | 45 ++++++++++--------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java index f146b5210..80bbdd36b 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java @@ -45,7 +45,7 @@ public class Aquaint2DocnoMapping implements DocnoMapping { private String[] docidEntries; @Override - public int getDocno(String docid) { + public int getDocno(String docid) { LOG.trace("getDocno(docid: " + docid + ")"); Preconditions.checkNotNull(docid); int sourceLength = docid.length() - 13; @@ -61,22 +61,22 @@ public int getDocno(String docid) { int entryId = findEntryId(source, year, month); LOG.debug("entryId: " + entryId); - String entryElt = docidEntries[entryId].split("\t")[day]; - LOG.debug("entryElt: " + entryElt); - - // then traverse the days to find the day and skip over missing articles to get the article number - String[] entryEltParts = entryElt.split(" "); - int result = articleNo + Integer.parseInt(entryEltParts[0]); - String[] entryDayParts = entryEltParts[1].split(","); - for (int i = 1; i < entryDayParts.length; i++) { - int missingNo = Integer.parseInt(entryDayParts[i]); - if (articleNo < missingNo) break; - LOG.debug("skipping missingNo: " + missingNo); - result--; - } + String entryElt = docidEntries[entryId].split("\t")[day]; + LOG.debug("entryElt: " + entryElt); + + // then traverse the days to find the day and skip over missing articles to get the article number + String[] entryEltParts = entryElt.split(" "); + int result = articleNo + Integer.parseInt(entryEltParts[0]); + String[] entryDayParts = entryEltParts[1].split(","); + for (int i = 1; i < entryDayParts.length; 
i++) { + int missingNo = Integer.parseInt(entryDayParts[i]); + if (articleNo < missingNo) break; + LOG.debug("skipping missingNo: " + missingNo); + result--; + } - LOG.trace("getDocno returning: " + result); - return result; + LOG.trace("getDocno returning: " + result); + return result; } private int findEntryId(String source, int year, int month) { @@ -100,7 +100,7 @@ private int findEntryId(String source, int year, int month) { @Override - public String getDocid(int docno) { + public String getDocid(int docno) { Preconditions.checkArgument(docno > 0); LOG.trace("getDocid(docno: " + docno + ")"); @@ -165,7 +165,7 @@ private String[] findEntryEltParts(int docno, String[] entryElts) { @Override - public void loadMapping(Path p, FileSystem fs) throws IOException { + public void loadMapping(Path p, FileSystem fs) throws IOException { docidEntries = Aquaint2DocnoMapping.readDocnoData(p, fs); } @@ -202,8 +202,8 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws LOG.debug("source: " + source + ", year: " + year + ", month: " + month + ", day: " + day + ", articleNo: " + articleNo); if (! 
source.equals(prevSource) || - year != prevYear || - month != prevMonth) { + year != prevYear || + month != prevMonth) { LOG.debug("diff source, year or month, currentEntry: " + currentEntry); if (currentEntry != null) { list.add(currentEntry.toString()); @@ -322,14 +322,15 @@ public static void main(String[] args) throws IOException { System.out.println("looking up docno for \"" + args[2] + "\""); int idx = mapping.getDocno(args[2]); if (idx > 0) { - System.out.println(mapping.getDocno(args[2])); + System.out.println(idx); } else { System.err.print("Invalid docid!"); } } else if (args[0].equals("getDocid")) { + int docno = Integer.parseInt(args[2]); try { System.out.println("looking up docid for " + args[2]); - System.out.println(mapping.getDocid(Integer.parseInt(args[2]))); + System.out.println(mapping.getDocid(docno)); } catch (Exception e) { System.err.print("Invalid docno!"); } From be21d865b4d7671b8722498df946a3dc8d298de8 Mon Sep 17 00:00:00 2001 From: "Earl J. Wagner" Date: Mon, 5 Mar 2012 14:07:16 -0500 Subject: [PATCH 18/18] catch parse error --- .../collection/aquaint2/Aquaint2Document.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java index 567d45d2d..5e788cfc3 100644 --- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java +++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java @@ -116,8 +116,9 @@ public String getElementText(String elementTagName) { result = raw.substring(start).trim(); LOG.error("found element text: " + result); } - result = TAGS_PATTERN.matcher(result).replaceAll(""); + result = TAGS_PATTERN.matcher(result).replaceAll("\n"); result = WHITESPACE_PATTERN.matcher(result).replaceAll(" "); + //System.out.println(result); } return result; } @@ -147,7 +148,15 @@ private void setAquaint2Docid() { LOG.trace("setAquaint2Docid()"); int start = 9; 
int end = raw.indexOf("\"", start); - docid = raw.substring(start, end).trim(); + try { + docid = raw.substring(start, end).trim(); + } catch (Exception e) { + LOG.error("exception: " + e); + LOG.error("start: " + start + ", end: " + end); + LOG.error("raw:\n" + raw); + String result = raw.substring(start).trim(); + LOG.error("found element text: " + result); + } LOG.trace("in setAquaint2Docid, docid: " + docid); }