From b02682632c7986ddc509ca6408c31b8c4f8f7767 Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Sun, 25 Sep 2011 19:33:42 -0400
Subject: [PATCH 01/18] minor fix
---
build.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/build.xml b/build.xml
index 33c5efe22..20b532d14 100644
--- a/build.xml
+++ b/build.xml
@@ -103,10 +103,10 @@
-
+
-
+
From 2fecb8d44580b1b86cd95efb9b09716f9ff47086 Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Sun, 25 Sep 2011 19:34:54 -0400
Subject: [PATCH 02/18] added optional printing of elements
---
src/dist/edu/umd/cloud9/util/map/HMapIF.java | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/src/dist/edu/umd/cloud9/util/map/HMapIF.java b/src/dist/edu/umd/cloud9/util/map/HMapIF.java
index 14ec8cbd4..b7c449fc2 100644
--- a/src/dist/edu/umd/cloud9/util/map/HMapIF.java
+++ b/src/dist/edu/umd/cloud9/util/map/HMapIF.java
@@ -810,22 +810,31 @@ float loadFactor() {
return loadFactor;
}
- public String toString() {
+ public String toString () {
+ return toString (-1);
+ }
+
+ public String toString (int n) {
Iterator i = entrySet().iterator();
- if (!i.hasNext())
+ if (!i.hasNext() || n == 0)
return "{}";
StringBuilder sb = new StringBuilder();
sb.append('{');
- for (;;) {
+ for (int m = 2; ; m++) {
MapIF.Entry e = i.next();
int key = e.getKey();
float value = e.getValue();
+ //sb.append("(m: " + m + ", n: " + n + ")");
sb.append(key);
sb.append('=');
sb.append(value);
- if (!i.hasNext())
+ if (! i.hasNext() || (m > n && n > 0)) {
+ if (i.hasNext()) {
+ sb.append (", ...");
+ }
return sb.append('}').toString();
+ }
sb.append(", ");
}
}
From ea493b94f8981eab7b2b9ad69d0d6e20c67a4f80 Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Fri, 7 Oct 2011 14:10:23 -0400
Subject: [PATCH 03/18] deterministically convert between Aquaint2 docnos and
docids
---
.../aquaint2/Aquaint2DocnoMapping.java | 184 ++++++++++++++++--
.../aquaint2/BuildAquaint2ForwardIndex.java | 2 +-
2 files changed, 166 insertions(+), 20 deletions(-)
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
index 26a116072..65fb43ed6 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
@@ -27,6 +27,7 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
+import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import com.google.common.base.Preconditions;
@@ -36,24 +37,128 @@
public class Aquaint2DocnoMapping implements DocnoMapping {
private static final Logger LOG = Logger.getLogger(Aquaint2DocnoMapping.class);
+ // { LOG.setLevel (Level.TRACE); }
- private String[] docids;
+ private String[] docidEntries;
@Override
public int getDocno(String docid) {
+ LOG.trace("getDocno(docid: " + docid + ")");
Preconditions.checkNotNull(docid);
- return Arrays.binarySearch(docids, docid);
+ String source = docid.substring(0, 7);
+ int year = Integer.parseInt(docid.substring (8, 12));
+ int month = Integer.parseInt(docid.substring (12, 14));
+ int day = Integer.parseInt(docid.substring (14, 16));
+ int articleNo = Integer.parseInt(docid.substring (17, 21));
+
+ // first traverse the entries to find the month entry and get its days
+ int entryId = findEntryId(source, year, month);
+ LOG.debug("entryId: " + entryId);
+
+ String entryElt = docidEntries[entryId].split("\t")[day];
+ LOG.debug("entryElt: " + entryElt);
+
+ // then traverse the days to find the day and skip over missing articles to get the article number
+ String[] entryEltParts = entryElt.split(" ");
+ int result = articleNo + Integer.parseInt(entryEltParts[0]);
+ String[] entryDayParts = entryEltParts[1].split(",");
+ for (int i = 1; i < entryDayParts.length; i++) {
+ int missingNo = Integer.parseInt(entryDayParts[i]);
+ if (articleNo < missingNo) break;
+ LOG.debug("skipping missingNo: " + missingNo);
+ result--;
+ }
+
+ LOG.trace("getDocno returning: " + result);
+ return result;
+ }
+
+ private int findEntryId(String source, int year, int month) {
+ for (int i = 0; i < docidEntries.length; i++) {
+ LOG.debug("docidEntries [" + i + "]: " + docidEntries[i]);
+ String[] entryElts = docidEntries[i].split("\t");
+ String[] entryMetaInfo = entryElts[0].split(" ");
+ String entrySource = entryMetaInfo[1];
+ if (entrySource.equals (source)) {
+ int entryYear = Integer.parseInt(entryMetaInfo[2]);
+ if (entryYear == year) {
+ int entryMonth = Integer.parseInt(entryMetaInfo[3]);
+ if (entryMonth == month) {
+ return i;
+ }
+ }
+ }
+ }
+ return -1;
}
+
@Override
public String getDocid(int docno) {
Preconditions.checkArgument(docno > 0);
- return docids[docno];
+ LOG.trace("getDocid(docno: " + docno + ")");
+
+ // first traverse the entries to find the month entry and get its source, year, month
+ int entryId = findEntryId(docno);
+ LOG.debug("entryId: " + entryId);
+ String[] entryElts = docidEntries[entryId].split("\t");
+ String[] entryMetaInfo = entryElts[0].split(" ");
+ String source = entryMetaInfo[1];
+ int year = Integer.parseInt(entryMetaInfo[2]);
+ int month = Integer.parseInt(entryMetaInfo[3]);
+ LOG.debug("looking at: " + String.format("%s_%04d%02d__.____", source, year, month));
+
+ // then traverse the days to find the day and skip over missing articles to get the article number
+ String[] entryEltParts = findEntryEltParts (docno, entryElts);
+ int offset = Integer.parseInt(entryEltParts[0]);
+ String[] entryDayParts = entryEltParts[1].split(",");
+ int day = Integer.parseInt(entryDayParts[0]);
+ LOG.debug("found day: " + day + ", looking at: " + String.format("%s_%04d%02d%02d.____", source, year, month, day));
+ int articleNo = docno - offset;
+ for (int i = 1; i < entryDayParts.length; i++) {
+ int missingNo = Integer.parseInt(entryDayParts[i]);
+ if (articleNo < missingNo) break;
+ LOG.debug("skipping missingNo: " + missingNo);
+ articleNo++;
+ }
+ LOG.debug("found articleNo: " + articleNo + ", looking at: " + String.format("%s_%04d%02d%02d.%04d", source, year, month, day, articleNo));
+ return String.format ("%s_%04d%02d%02d.%04d", source, year, month, day, articleNo);
+ }
+
+
+ private int findEntryId(int docno) {
+ for (int i = 0; i < docidEntries.length; i++) {
+ LOG.debug("docidEntries [" + i + "]: " + docidEntries[i]);
+ int entryOffset = Integer.parseInt(docidEntries[i].split(" ") [0]);
+ if (entryOffset >= docno) {
+ return i - 1;
+ }
+ }
+ return docidEntries.length - 1;
}
+
+ private String[] findEntryEltParts(int docno, String[] entryElts) {
+ String[] thisEltParts = new String[0];
+ int prevOffset = -1;
+ String[] prevEltParts = new String[0];
+
+ for (int i = 1; i < entryElts.length; i++) {
+ thisEltParts = entryElts[i].split(" ");
+ int thisOffset = Integer.parseInt(thisEltParts[0]);
+ if (thisOffset >= docno) {
+ return prevEltParts;
+ }
+ prevOffset = thisOffset;
+ prevEltParts = thisEltParts;
+ }
+ return thisEltParts;
+ }
+
+
@Override
public void loadMapping(Path p, FileSystem fs) throws IOException {
- docids = Aquaint2DocnoMapping.readDocnoData(p, fs);
+ docidEntries = Aquaint2DocnoMapping.readDocnoData(p, fs);
}
static public void writeDocnoData(Path input, Path output, FileSystem fs) throws IOException {
@@ -64,14 +169,59 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
LOG.info("Reading " + input);
int cnt = 0;
Text line = new Text();
+
+ String prevSource = null;
+ int prevYear = -1;
+ int prevMonth = -1;
+ int prevDay = -1;
+ int prevArticleNo = -1;
+ StringBuilder currentEntry = null;
+
while (reader.readLine(line) > 0) {
- String[] arr = line.toString().split("\\t");
- list.add(arr[0]);
+ String docid = line.toString();
+
+ String source = docid.substring(0, 7);
+ int year = Integer.parseInt(docid.substring (8, 12));
+ int month = Integer.parseInt(docid.substring (12, 14));
+ int day = Integer.parseInt(docid.substring (14, 16));
+ int articleNo = Integer.parseInt(docid.substring (17, 21));
+ LOG.debug("prevSource: " + prevSource + ", prevYear: " + prevYear + ", prevMonth: " + prevMonth + ", prevDay: " + prevDay + ", prevArticleNo: " + prevArticleNo);
+ LOG.debug("source: " + source + ", year: " + year + ", month: " + month + ", day: " + day + ", articleNo: " + articleNo);
+
+ if (! source.equals(prevSource) ||
+ year != prevYear ||
+ month != prevMonth) {
+ LOG.debug("currentEntry: " + currentEntry);
+ if (currentEntry != null) list.add(currentEntry.toString());
+ currentEntry = new StringBuilder(cnt + " " + source + " " + year + " " + month);
+ prevDay = 0;
+ prevArticleNo = 0;
+ }
+ if (day != prevDay) {
+ for (int i = prevDay + 1; i <= day; i++) {
+ currentEntry.append("\t" + cnt + " " + i);
+ }
+ prevArticleNo = 0;
+ }
+ if (articleNo != prevArticleNo + 1) {
+ // we have missing article numbers - gather them
+ for (int i = prevArticleNo + 1; i < articleNo; i++) {
+ currentEntry.append("," + i);
+ }
+ }
+ prevSource = source;
+ prevYear = year;
+ prevMonth = month;
+ prevDay = day;
+ prevArticleNo = articleNo;
+
cnt++;
if (cnt % 100000 == 0) {
LOG.info(cnt + " docs");
}
}
+ list.add(currentEntry.toString());
+ list.add("" + cnt);
reader.close();
LOG.info(cnt + " docs total. Done!");
@@ -83,31 +233,27 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
out.writeUTF(list.get(i));
cnt++;
if (cnt % 100000 == 0) {
- LOG.info(cnt + " docs");
+ LOG.info(cnt + " months of docs");
}
}
out.close();
- LOG.info(cnt + " docs total. Done!");
+ LOG.info(cnt + " months of docs total. Done!");
}
static public String[] readDocnoData(Path p, FileSystem fs) throws IOException {
- LOG.warn("p: " + p);
+ LOG.trace("readDocnoData (p: " + p + ", fs)");
FSDataInputStream in = fs.open(p);
- // Docnos start at one, so we need an array that's one larger than number of docs.
- int sz = in.readInt() + 1;
- LOG.warn("creating array of length: " + sz);
+ int sz = in.readInt();
+ LOG.debug("creating a month array of length: " + sz);
String[] arr = new String[sz];
- for (int i = 1; i < sz; i++) {
+ for (int i = 0; i < sz; i++) {
arr[i] = in.readUTF();
+ LOG.debug("arr[" + i + "]: " + arr[i]);
}
in.close();
- // Can't leave the zero'th entry null, or else we might get a null pointer exception during a
- // binary search on the array.
- arr[0] = "";
-
return arr;
}
@@ -125,8 +271,8 @@ public static void main(String[] args) throws IOException {
mapping.loadMapping(new Path(args[1]), fs);
if (args[0].equals("list")) {
- for (int i = 1; i < mapping.docids.length; i++) {
- System.out.println(i + "\t" + mapping.docids[i]);
+ for (int i = 1; i < mapping.docidEntries.length; i++) {
+ System.out.println(i + "\t" + mapping.docidEntries[i]);
}
} else if (args[0].equals("getDocno")) {
System.out.println("looking up docno for \"" + args[2] + "\"");
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/BuildAquaint2ForwardIndex.java b/src/dist/edu/umd/cloud9/collection/aquaint2/BuildAquaint2ForwardIndex.java
index 9a6d3f2fc..3163f0cf0 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/BuildAquaint2ForwardIndex.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/BuildAquaint2ForwardIndex.java
@@ -164,7 +164,7 @@ public int runTool (Configuration config, String collectionPath, String outputPa
String inputFile = outputPath + "/" + "part-00000";
- sLogger.info("Writing " + numDocs + " doc offseta to " + indexFile);
+ sLogger.info("Writing " + numDocs + " doc offsets to " + indexFile);
FSLineReader reader = new FSLineReader(inputFile, fs);
FSDataOutputStream writer = fs.create(new Path(indexFile), true);
From c4e081c4ddeb2232cdb1d601174de68bd961d7ab Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Tue, 11 Oct 2011 21:38:33 -0400
Subject: [PATCH 04/18] outputting docno mapping data in smaller chunks for
writeUTF 64k limit
---
.../aquaint2/Aquaint2DocnoMapping.java | 53 ++++++++++++++-----
1 file changed, 40 insertions(+), 13 deletions(-)
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
index 65fb43ed6..d5b9d6627 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
@@ -37,7 +37,8 @@
public class Aquaint2DocnoMapping implements DocnoMapping {
private static final Logger LOG = Logger.getLogger(Aquaint2DocnoMapping.class);
- // { LOG.setLevel (Level.TRACE); }
+ // { LOG.setLevel (Level.TRACE); }
+ { LOG.setLevel (Level.INFO); }
private String[] docidEntries;
@@ -176,6 +177,7 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
int prevDay = -1;
int prevArticleNo = -1;
StringBuilder currentEntry = null;
+ int numEntries = 0;
while (reader.readLine(line) > 0) {
String docid = line.toString();
@@ -189,10 +191,14 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
LOG.debug("source: " + source + ", year: " + year + ", month: " + month + ", day: " + day + ", articleNo: " + articleNo);
if (! source.equals(prevSource) ||
- year != prevYear ||
- month != prevMonth) {
+ year != prevYear ||
+ month != prevMonth) {
LOG.debug("currentEntry: " + currentEntry);
- if (currentEntry != null) list.add(currentEntry.toString());
+ if (currentEntry != null) {
+ list.add(currentEntry.toString());
+ list.add("");
+ numEntries++;
+ }
currentEntry = new StringBuilder(cnt + " " + source + " " + year + " " + month);
prevDay = 0;
prevArticleNo = 0;
@@ -202,6 +208,10 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
currentEntry.append("\t" + cnt + " " + i);
}
prevArticleNo = 0;
+ // writeUTF can't write a string longer than 64k, so we output a chunk at a time
+ // here, then concatenate the chunks between empty-string separators when reading
+ list.add(currentEntry.toString());
+ currentEntry = new StringBuilder ();
}
if (articleNo != prevArticleNo + 1) {
// we have missing article numbers - gather them
@@ -214,30 +224,32 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
prevMonth = month;
prevDay = day;
prevArticleNo = articleNo;
-
+
cnt++;
if (cnt % 100000 == 0) {
LOG.info(cnt + " docs");
}
}
list.add(currentEntry.toString());
+ list.add("");
+ numEntries++;
list.add("" + cnt);
reader.close();
LOG.info(cnt + " docs total. Done!");
- cnt = 0;
LOG.info("Writing " + output);
FSDataOutputStream out = fs.create(output, true);
- out.writeInt(list.size());
+ out.writeInt(numEntries);
+ numEntries = 0;
for (int i = 0; i < list.size(); i++) {
out.writeUTF(list.get(i));
- cnt++;
- if (cnt % 100000 == 0) {
- LOG.info(cnt + " months of docs");
+ numEntries++;
+ if (numEntries % 10000 == 0) {
+ LOG.info(numEntries + " months of docs");
}
}
out.close();
- LOG.info(cnt + " months of docs total. Done!");
+ LOG.info(numEntries + " months of docs total. Done!");
}
static public String[] readDocnoData(Path p, FileSystem fs) throws IOException {
@@ -247,9 +259,24 @@ static public String[] readDocnoData(Path p, FileSystem fs) throws IOException {
int sz = in.readInt();
LOG.debug("creating a month array of length: " + sz);
String[] arr = new String[sz];
+ String currentEntryPart = in.readUTF();
+ StringBuilder currentEntry = new StringBuilder();
+ int i = 0;
+ while (!currentEntryPart.equals("")) {
+ LOG.debug("currentEntryPart: " + currentEntryPart);
+ if (currentEntryPart.equals("")) {
+ arr[i] = currentEntry.toString();
+ LOG.debug("arr[" + i + "]: " + arr[i]);
+ i++;
+ currentEntry = new StringBuilder();
+ } else {
+ currentEntry.append(currentEntryPart);
+ }
+ currentEntryPart = in.readUTF();
+ }
- for (int i = 0; i < sz; i++) {
- arr[i] = in.readUTF();
+ if (currentEntry.length() > 0){
+ arr[i] = currentEntry.toString();
LOG.debug("arr[" + i + "]: " + arr[i]);
}
in.close();
From 79da6def23fdc922e142336a34ee3c10a7a08c4d Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Sun, 16 Oct 2011 23:13:22 -0400
Subject: [PATCH 05/18] catch errors with reading headline
---
.../collection/aquaint2/Aquaint2Document.java | 17 +++++++++++++++--
1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
index 08cde55a8..ea7c3089f 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
@@ -22,10 +22,16 @@
import java.util.regex.Pattern;
import org.apache.hadoop.io.WritableUtils;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
import edu.umd.cloud9.collection.Indexable;
+
public class Aquaint2Document extends Indexable {
+ private static final Logger LOG = Logger.getLogger(Aquaint2Document.class);
+ { LOG.setLevel (Level.INFO); }
+
private static Pattern TAGS_PATTERN = Pattern.compile("<[^>]+>");
private static Pattern WHITESPACE_PATTERN = Pattern.compile("\t|\n");
@@ -73,8 +79,15 @@ public String getHeadline() {
headline = "";
} else {
int end = raw.indexOf("");
- headline = raw.substring(start + 10, end).trim();
-
+ try {
+ headline = raw.substring(start + 10, end).trim();
+ } catch (Exception e) {
+ LOG.error("exception: " + e);
+ LOG.error("docid: " + getDocid () + ", start: " + start + ", end: " + end);
+ LOG.error(raw);
+ headline = raw.substring(start + 10).trim();
+ LOG.error("headline should be: " + headline);
+ }
headline = TAGS_PATTERN.matcher(headline).replaceAll("");
headline = WHITESPACE_PATTERN.matcher(headline).replaceAll(" ");
}
From a4d39fd7526c06a42940b61bbcb6ec7901aa8063 Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Mon, 24 Oct 2011 23:02:15 -0400
Subject: [PATCH 06/18] reformatted
---
.../edu/umd/cloud9/io/SequenceFileUtils.java | 490 +++++++++---------
1 file changed, 245 insertions(+), 245 deletions(-)
diff --git a/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java b/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java
index 106cb7ec8..5552a04dc 100644
--- a/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java
+++ b/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java
@@ -36,249 +36,249 @@
*/
public class SequenceFileUtils {
- private SequenceFileUtils() {}
-
- public static List> readFile(Path path) {
- FileSystem fs;
- try {
- fs = FileSystem.get(new Configuration());
- } catch (IOException e) {
- throw new RuntimeException("Unable to access the file system!");
- }
-
- return readFile(path, fs, Integer.MAX_VALUE);
- }
-
- public static List> readFile(Path path, int max) {
- FileSystem fs;
- try {
- fs = FileSystem.get(new Configuration());
- } catch (IOException e) {
- throw new RuntimeException("Unable to access the file system!");
- }
-
- return readFile(path, fs, max);
- }
-
- public static List> readFile(Path path, FileSystem fs) {
- return readFile(path, fs, Integer.MAX_VALUE);
- }
-
- /**
- * Reads key-value pairs from a SequenceFile, up to a maximum number.
- *
- * @param path path to file
- * @param max maximum of key-value pairs to read
- * @return list of key-value pairs
- */
- @SuppressWarnings("unchecked")
- public static List> readFile(Path path, FileSystem fs, int max) {
- List> list = new ArrayList>();
-
- try {
- int k = 0;
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
-
- K key = (K) reader.getKeyClass().newInstance();
- V value = (V) reader.getValueClass().newInstance();
-
- while (reader.next(key, value)) {
- k++;
- list.add(new PairOfWritables(key, value));
- if (k >= max) {
- break;
- }
-
- key = (K) reader.getKeyClass().newInstance();
- value = (V) reader.getValueClass().newInstance();
- }
- reader.close();
- } catch (Exception e) {
- throw new RuntimeException("Error reading SequenceFile " + path);
- }
-
- return list;
- }
-
- public static SortedMap readFileIntoMap(Path path) {
- FileSystem fs;
- try {
- fs = FileSystem.get(new Configuration());
- } catch (IOException e) {
- throw new RuntimeException("Unable to access the file system!");
- }
-
- return readFileIntoMap(path, fs, Integer.MAX_VALUE);
- }
-
- public static SortedMap readFileIntoMap(Path path, int max) {
- FileSystem fs;
- try {
- fs = FileSystem.get(new Configuration());
- } catch (IOException e) {
- throw new RuntimeException("Unable to access the file system!");
- }
-
- return readFileIntoMap(path, fs, max);
- }
-
- public static SortedMap readFileIntoMap(Path path, FileSystem fs) {
- return readFileIntoMap(path, fs, Integer.MAX_VALUE);
- }
-
- public static SortedMap readFileIntoMap(Path path, FileSystem fs, int max) {
- SortedMap map = new TreeMap();
-
- for ( PairOfWritables pair : SequenceFileUtils.readFile(path, fs, max)) {
- map.put(pair.getLeftElement(), pair.getRightElement());
- }
- return map;
- }
-
- public static List> readDirectory(Path path) {
- FileSystem fs;
- try {
- fs = FileSystem.get(new Configuration());
- } catch (IOException e) {
- throw new RuntimeException("Unable to access the file system!");
- }
-
- return readDirectory(path, fs, Integer.MAX_VALUE);
- }
-
- /**
- * Reads key-value pairs from a directory containing SequenceFiles. A
- * maximum number of key-value pairs is read from each SequenceFile.
- *
- * @param path path to directory
- * @param max maximum of key-value pairs to read per file
- * @return list of key-value pairs
- */
- public static List> readDirectory(Path path, FileSystem fs, int max) {
- List> list = new ArrayList>();
-
- try {
- FileStatus[] stat = fs.listStatus(path);
- for (int i = 0; i < stat.length; ++i) {
-
- // skip '_log' directory
- if (stat[i].getPath().getName().startsWith("_"))
- continue;
-
- List> pairs = readFile(stat[i].getPath(), fs, max);
- list.addAll(pairs);
- }
- } catch (IOException e) {
- throw new RuntimeException("Error reading the file system!");
- }
-
- return list;
- }
-
- public static List readKeys(Path path) {
- FileSystem fs;
- try {
- fs = FileSystem.get(new Configuration());
- } catch (IOException e) {
- throw new RuntimeException("Unable to access the file system!");
- }
-
- return readKeys(path, fs, Integer.MAX_VALUE);
- }
-
- public static List readKeys(Path path, int max) {
- FileSystem fs;
- try {
- fs = FileSystem.get(new Configuration());
- } catch (IOException e) {
- throw new RuntimeException("Unable to access the file system!");
- }
-
- return readKeys(path, fs, max);
- }
-
- public static List readKeys(Path path, FileSystem fs) {
- return readKeys(path, fs, Integer.MAX_VALUE);
- }
-
- @SuppressWarnings("unchecked")
- public static List readKeys(Path path, FileSystem fs, int max) {
- List list = new ArrayList();
-
- try {
- int k = 0;
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
-
- K key = (K) reader.getKeyClass().newInstance();
- Writable value = (Writable) reader.getValueClass().newInstance();
- while (reader.next(key, value)) {
- k++;
- list.add(key);
- if (k >= max) {
- break;
- }
-
- key = (K) reader.getKeyClass().newInstance();
- }
- reader.close();
- } catch (Exception e) {
- throw new RuntimeException("Error reading SequenceFile " + path);
- }
-
- return list;
- }
-
- public static List readValues(Path path) {
- FileSystem fs;
- try {
- fs = FileSystem.get(new Configuration());
- } catch (IOException e) {
- throw new RuntimeException("Unable to access the file system!");
- }
-
- return readValues(path, fs, Integer.MAX_VALUE);
- }
-
- public static List readValues(Path path, int max) {
- FileSystem fs;
- try {
- fs = FileSystem.get(new Configuration());
- } catch (IOException e) {
- throw new RuntimeException("Unable to access the file system!");
- }
-
- return readValues(path, fs, max);
- }
-
- public static List readValues(Path path, FileSystem fs) {
- return readValues(path, fs, Integer.MAX_VALUE);
- }
-
- @SuppressWarnings("unchecked")
- public static List readValues(Path path, FileSystem fs, int max) {
- List list = new ArrayList();
-
- try {
- int k = 0;
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
-
- Writable key = (Writable) reader.getKeyClass().newInstance();
- V value = (V) reader.getValueClass().newInstance();
-
- while (reader.next(key, value)) {
- k++;
- list.add(value);
- if (k >= max) {
- break;
- }
-
- value = (V) reader.getValueClass().newInstance();
- }
- reader.close();
- } catch (Exception e) {
- throw new RuntimeException("Error reading SequenceFile " + path);
- }
-
- return list;
- }
+ private SequenceFileUtils() {}
+
+ public static List> readFile(Path path) {
+ FileSystem fs;
+ try {
+ fs = FileSystem.get(new Configuration());
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to access the file system!");
+ }
+
+ return readFile(path, fs, Integer.MAX_VALUE);
+ }
+
+ public static List> readFile(Path path, int max) {
+ FileSystem fs;
+ try {
+ fs = FileSystem.get(new Configuration());
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to access the file system!");
+ }
+
+ return readFile(path, fs, max);
+ }
+
+ public static List> readFile(Path path, FileSystem fs) {
+ return readFile(path, fs, Integer.MAX_VALUE);
+ }
+
+ /**
+ * Reads key-value pairs from a SequenceFile, up to a maximum number.
+ *
+ * @param path path to file
+ * @param max maximum of key-value pairs to read
+ * @return list of key-value pairs
+ */
+ @SuppressWarnings("unchecked")
+ public static List> readFile(Path path, FileSystem fs, int max) {
+ List> list = new ArrayList>();
+
+ try {
+ int k = 0;
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
+
+ K key = (K) reader.getKeyClass().newInstance();
+ V value = (V) reader.getValueClass().newInstance();
+
+ while (reader.next(key, value)) {
+ k++;
+ list.add(new PairOfWritables(key, value));
+ if (k >= max) {
+ break;
+ }
+
+ key = (K) reader.getKeyClass().newInstance();
+ value = (V) reader.getValueClass().newInstance();
+ }
+ reader.close();
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading SequenceFile " + path);
+ }
+
+ return list;
+ }
+
+ public static SortedMap readFileIntoMap(Path path) {
+ FileSystem fs;
+ try {
+ fs = FileSystem.get(new Configuration());
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to access the file system!");
+ }
+
+ return readFileIntoMap(path, fs, Integer.MAX_VALUE);
+ }
+
+ public static SortedMap readFileIntoMap(Path path, int max) {
+ FileSystem fs;
+ try {
+ fs = FileSystem.get(new Configuration());
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to access the file system!");
+ }
+
+ return readFileIntoMap(path, fs, max);
+ }
+
+ public static SortedMap readFileIntoMap(Path path, FileSystem fs) {
+ return readFileIntoMap(path, fs, Integer.MAX_VALUE);
+ }
+
+ public static SortedMap readFileIntoMap(Path path, FileSystem fs, int max) {
+ SortedMap map = new TreeMap();
+
+ for ( PairOfWritables pair : SequenceFileUtils.readFile(path, fs, max)) {
+ map.put(pair.getLeftElement(), pair.getRightElement());
+ }
+ return map;
+ }
+
+ public static List> readDirectory(Path path) {
+ FileSystem fs;
+ try {
+ fs = FileSystem.get(new Configuration());
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to access the file system!");
+ }
+
+ return readDirectory(path, fs, Integer.MAX_VALUE);
+ }
+
+ /**
+ * Reads key-value pairs from a directory containing SequenceFiles. A
+ * maximum number of key-value pairs is read from each SequenceFile.
+ *
+ * @param path path to directory
+ * @param max maximum of key-value pairs to read per file
+ * @return list of key-value pairs
+ */
+ public static List> readDirectory(Path path, FileSystem fs, int max) {
+ List> list = new ArrayList>();
+
+ try {
+ FileStatus[] stat = fs.listStatus(path);
+ for (int i = 0; i < stat.length; ++i) {
+
+ // skip '_log' directory
+ if (stat[i].getPath().getName().startsWith("_"))
+ continue;
+
+ List> pairs = readFile(stat[i].getPath(), fs, max);
+ list.addAll(pairs);
+ }
+ } catch (IOException e) {
+ throw new RuntimeException("Error reading the file system!");
+ }
+
+ return list;
+ }
+
+ public static List readKeys(Path path) {
+ FileSystem fs;
+ try {
+ fs = FileSystem.get(new Configuration());
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to access the file system!");
+ }
+
+ return readKeys(path, fs, Integer.MAX_VALUE);
+ }
+
+ public static List readKeys(Path path, int max) {
+ FileSystem fs;
+ try {
+ fs = FileSystem.get(new Configuration());
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to access the file system!");
+ }
+
+ return readKeys(path, fs, max);
+ }
+
+ public static List readKeys(Path path, FileSystem fs) {
+ return readKeys(path, fs, Integer.MAX_VALUE);
+ }
+
+ @SuppressWarnings("unchecked")
+ public static List readKeys(Path path, FileSystem fs, int max) {
+ List list = new ArrayList();
+
+ try {
+ int k = 0;
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
+
+ K key = (K) reader.getKeyClass().newInstance();
+ Writable value = (Writable) reader.getValueClass().newInstance();
+ while (reader.next(key, value)) {
+ k++;
+ list.add(key);
+ if (k >= max) {
+ break;
+ }
+
+ key = (K) reader.getKeyClass().newInstance();
+ }
+ reader.close();
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading SequenceFile " + path);
+ }
+
+ return list;
+ }
+
+ public static List readValues(Path path) {
+ FileSystem fs;
+ try {
+ fs = FileSystem.get(new Configuration());
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to access the file system!");
+ }
+
+ return readValues(path, fs, Integer.MAX_VALUE);
+ }
+
+ public static List readValues(Path path, int max) {
+ FileSystem fs;
+ try {
+ fs = FileSystem.get(new Configuration());
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to access the file system!");
+ }
+
+ return readValues(path, fs, max);
+ }
+
+ public static List readValues(Path path, FileSystem fs) {
+ return readValues(path, fs, Integer.MAX_VALUE);
+ }
+
+ @SuppressWarnings("unchecked")
+ public static List readValues(Path path, FileSystem fs, int max) {
+ List list = new ArrayList();
+
+ try {
+ int k = 0;
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
+
+ Writable key = (Writable) reader.getKeyClass().newInstance();
+ V value = (V) reader.getValueClass().newInstance();
+
+ while (reader.next(key, value)) {
+ k++;
+ list.add(value);
+ if (k >= max) {
+ break;
+ }
+
+ value = (V) reader.getValueClass().newInstance();
+ }
+ reader.close();
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading SequenceFile " + path);
+ }
+
+ return list;
+ }
}
From 5394796772f97ab4dadc8779f8998ec0060f055e Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Thu, 27 Oct 2011 14:53:37 -0400
Subject: [PATCH 07/18] added support to read directory into map
---
.../edu/umd/cloud9/io/SequenceFileUtils.java | 38 +++++++++++++++++++
1 file changed, 38 insertions(+)
diff --git a/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java b/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java
index 5552a04dc..a9464d720 100644
--- a/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java
+++ b/src/dist/edu/umd/cloud9/io/SequenceFileUtils.java
@@ -100,6 +100,7 @@ public static List SortedMap readFileIntoMap(Path path) {
FileSystem fs;
try {
@@ -175,6 +176,43 @@ public static List SortedMap readDirectoryIntoMap(Path path) {
+ FileSystem fs;
+ try {
+ fs = FileSystem.get(new Configuration());
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to access the file system!");
+ }
+
+ return readDirectoryIntoMap(path, fs, Integer.MAX_VALUE);
+ }
+
+ public static SortedMap readDirectoryIntoMap(Path path, int max) {
+ FileSystem fs;
+ try {
+ fs = FileSystem.get(new Configuration());
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to access the file system!");
+ }
+
+ return readDirectoryIntoMap(path, fs, max);
+ }
+
+ public static SortedMap readDirectoryIntoMap(Path path, FileSystem fs) {
+ return readDirectoryIntoMap(path, fs, Integer.MAX_VALUE);
+ }
+
+ public static SortedMap readDirectoryIntoMap(Path path, FileSystem fs, int max) {
+ SortedMap map = new TreeMap();
+
+ for ( PairOfWritables pair : SequenceFileUtils.readDirectory(path, fs, max)) {
+ map.put(pair.getLeftElement(), pair.getRightElement());
+ }
+ return map;
+ }
+
+
public static List readKeys(Path path) {
FileSystem fs;
try {
From 052585285fc7d0581a52334a3a6e4d9522137cae Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Fri, 28 Oct 2011 15:51:40 -0400
Subject: [PATCH 08/18] changes to print statements
---
.../cloud9/collection/aquaint2/Aquaint2DocnoMapping.java | 6 ++++--
.../umd/cloud9/collection/aquaint2/Aquaint2Document.java | 5 +++--
2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
index d5b9d6627..005136fe5 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
@@ -37,8 +37,8 @@
public class Aquaint2DocnoMapping implements DocnoMapping {
private static final Logger LOG = Logger.getLogger(Aquaint2DocnoMapping.class);
- // { LOG.setLevel (Level.TRACE); }
{ LOG.setLevel (Level.INFO); }
+ //{ LOG.setLevel (Level.TRACE); }
private String[] docidEntries;
@@ -123,7 +123,9 @@ public String getDocid(int docno) {
articleNo++;
}
LOG.debug("found articleNo: " + articleNo + ", looking at: " + String.format("%s_%04d%02d%02d.%04d", source, year, month, day, articleNo));
- return String.format ("%s_%04d%02d%02d.%04d", source, year, month, day, articleNo);
+ String result = String.format ("%s_%04d%02d%02d.%04d", source, year, month, day, articleNo);
+ LOG.trace("getDocid returning: " + result);
+ return result;
}
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
index ea7c3089f..0b85cc523 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
@@ -84,9 +84,9 @@ public String getHeadline() {
} catch (Exception e) {
LOG.error("exception: " + e);
LOG.error("docid: " + getDocid () + ", start: " + start + ", end: " + end);
- LOG.error(raw);
+ LOG.error("raw:\n" + raw);
headline = raw.substring(start + 10).trim();
- LOG.error("headline should be: " + headline);
+ LOG.error("updated headline: " + headline);
}
headline = TAGS_PATTERN.matcher(headline).replaceAll("");
headline = WHITESPACE_PATTERN.matcher(headline).replaceAll(" ");
@@ -122,5 +122,6 @@ public static void readDocument(Aquaint2Document doc, String s) {
doc.docid = null;
doc.headline = null;
doc.text = null;
+ LOG.debug("docid: " + doc.getDocid() + " length: " + doc.raw.length());
}
}
From a325b1ae9a81fbe41006d1c16025acdba3ff3fb1 Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Wed, 9 Nov 2011 23:02:20 -0500
Subject: [PATCH 09/18] reformatted, added check to ignore empty files in dir
(e.g. _SUCCESS)
---
.../edu/umd/cloud9/io/ReadSequenceFile.java | 164 +++++++++---------
1 file changed, 83 insertions(+), 81 deletions(-)
diff --git a/src/dist/edu/umd/cloud9/io/ReadSequenceFile.java b/src/dist/edu/umd/cloud9/io/ReadSequenceFile.java
index 6c61c6074..0a3b4337b 100644
--- a/src/dist/edu/umd/cloud9/io/ReadSequenceFile.java
+++ b/src/dist/edu/umd/cloud9/io/ReadSequenceFile.java
@@ -32,7 +32,7 @@
* in the of a directory, the value specifies the number of key-value pairs to
* read per file.
*
- *
+ *
*
* args: [path] [max-num-of-records] (local)
*
@@ -44,84 +44,86 @@
*/
public class ReadSequenceFile {
- private ReadSequenceFile() {}
-
- public static void main(String[] args) throws IOException {
- if (args.length < 1) {
- System.out.println("args: [path] [max-num-of-records-per-file]");
- System.exit(-1);
- }
-
- String f = args[0];
-
- int max = Integer.MAX_VALUE;
- if (args.length >= 2) {
- max = Integer.parseInt(args[1]);
- }
-
- boolean useLocal = args.length >= 3 && args[2].equals("local") ? true : false;
-
- if (useLocal) {
- System.out.println("Reading from local filesystem");
- }
-
- FileSystem fs = useLocal? FileSystem.getLocal(new Configuration()) : FileSystem.get(new Configuration());
- Path p = new Path(f);
-
- if (fs.getFileStatus(p).isDir()) {
- readSequenceFilesInDir(p, fs, max);
- } else {
- readSequenceFile(p, fs, max);
- }
- }
-
- private static int readSequenceFile(Path path, FileSystem fs, int max) throws IOException {
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
-
- System.out.println("Reading " + path + "...\n");
- try {
- System.out.println("Key type: " + reader.getKeyClass().toString());
- System.out.println("Value type: " + reader.getValueClass().toString() + "\n");
- } catch (Exception e) {
- throw new RuntimeException("Error: loading key/value class");
- }
-
- Writable key, value;
- int n = 0;
- try {
- key = (Writable) reader.getKeyClass().newInstance();
- value = (Writable) reader.getValueClass().newInstance();
-
- while (reader.next(key, value)) {
- System.out.println("Record " + n);
- System.out.println("Key: " + key + "\nValue: " + value);
- System.out.println("----------------------------------------");
- n++;
-
- if (n >= max)
- break;
- }
- reader.close();
- System.out.println(n + " records read.\n");
- } catch (Exception e) {
- e.printStackTrace();
- }
-
- return n;
- }
-
- private static int readSequenceFilesInDir(Path path, FileSystem fs, int max) {
- int n = 0;
- try {
- FileStatus[] stat = fs.listStatus(path);
- for (int i = 0; i < stat.length; ++i) {
- n += readSequenceFile(stat[i].getPath(), fs ,max);
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- System.out.println(n + " records read in total.");
- return n;
- }
+ private ReadSequenceFile() {} // private: the methods visible in this class are all static, so no instance is needed
+
+  public static void main(String[] args) throws IOException {
+    // Arguments: path, optional per-file record limit, optional literal "local". Only the path is required.
+    if (args.length < 1) {
+      System.out.println("args: [path] [max-num-of-records-per-file]");
+      System.exit(-1);
+    }
+
+    String pathArg = args[0];
+    // With no second argument the limit defaults to Integer.MAX_VALUE.
+    int limit = (args.length >= 2) ? Integer.parseInt(args[1]) : Integer.MAX_VALUE;
+    boolean localOnly = args.length >= 3 && args[2].equals("local");
+
+    FileSystem fs;
+    if (localOnly) {
+      System.out.println("Reading from local filesystem");
+      fs = FileSystem.getLocal(new Configuration());
+    } else {
+      fs = FileSystem.get(new Configuration());
+    }
+    Path target = new Path(pathArg);
+
+    // A directory is read file by file; anything else is read as a single SequenceFile.
+    if (fs.getFileStatus(target).isDir()) {
+      readSequenceFilesInDir(target, fs, limit);
+    } else {
+      readSequenceFile(target, fs, limit);
+    }
+  }
+
+  private static int readSequenceFile(Path path, FileSystem fs, int max) throws IOException {
+    // Prints the key/value types and then the records of the SequenceFile at path, stopping once max records were printed; returns the count.
+    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
+    int n = 0;
+    try {
+      System.out.println("Reading " + path + "...\n");
+      try {
+        System.out.println("Key type: " + reader.getKeyClass().toString());
+        System.out.println("Value type: " + reader.getValueClass().toString() + "\n");
+      } catch (Exception e) {
+        throw new RuntimeException("Error: loading key/value class", e); // keep the cause
+      }
+
+      try {
+        Writable key = (Writable) reader.getKeyClass().newInstance();
+        Writable value = (Writable) reader.getValueClass().newInstance();
+        while (reader.next(key, value)) {
+          System.out.println("Record " + n);
+          System.out.println("Key: " + key + "\nValue: " + value);
+          System.out.println("----------------------------------------");
+          n++;
+          if (n >= max) {
+            break;
+          }
+        }
+        System.out.println(n + " records read.\n");
+      } catch (Exception e) {
+        e.printStackTrace(); // best effort: report the failure and return the count reached so far
+      }
+    } finally {
+      reader.close(); // release the reader on every path, including the exceptional ones
+    }
+    return n;
+  }
+
+  private static int readSequenceFilesInDir(Path path, FileSystem fs, int max) {
+    // Reads each entry listed under path whose length is non-zero (skipping e.g. _SUCCESS) and returns the summed record count.
+    int total = 0;
+    try {
+      for (FileStatus status : fs.listStatus(path)) {
+        if (status.getLen() > 0) {
+          total += readSequenceFile(status.getPath(), fs, max);
+        }
+      }
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+
+    System.out.println(total + " records read in total.");
+    return total;
+  }
}
From a4c713666500e9e813e4291da45442fdd8e3c3e2 Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Wed, 9 Nov 2011 23:03:18 -0500
Subject: [PATCH 10/18] minor changes to logging
---
.../collection/aquaint2/Aquaint2DocnoMapping.java | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
index 005136fe5..098b893e4 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
@@ -37,8 +37,10 @@
public class Aquaint2DocnoMapping implements DocnoMapping {
private static final Logger LOG = Logger.getLogger(Aquaint2DocnoMapping.class);
- { LOG.setLevel (Level.INFO); }
- //{ LOG.setLevel (Level.TRACE); }
+ {
+ LOG.setLevel(Level.INFO);
+ //LOG.setLevel(Level.TRACE);
+ }
private String[] docidEntries;
@@ -228,7 +230,7 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
prevArticleNo = articleNo;
cnt++;
- if (cnt % 100000 == 0) {
+ if (cnt % (200 * 1000) == 0) {
LOG.info(cnt + " docs");
}
}
@@ -246,7 +248,7 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
for (int i = 0; i < list.size(); i++) {
out.writeUTF(list.get(i));
numEntries++;
- if (numEntries % 10000 == 0) {
+ if (numEntries % (10 * 1000) == 0) {
LOG.info(numEntries + " months of docs");
}
}
From ad21d35cc4b8361ed8999e1c9868ea255660dd33 Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Sun, 13 Nov 2011 23:02:04 -0500
Subject: [PATCH 11/18] formatting changes
---
.../aquaint2/NumberAquaint2Documents2.java | 28 +++++++++++--------
1 file changed, 16 insertions(+), 12 deletions(-)
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java b/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java
index d07643ccc..219e7f559 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java
@@ -35,6 +35,7 @@
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
+
public class NumberAquaint2Documents2 extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(NumberAquaint2Documents2.class);
private static enum Count { DOCS };
@@ -83,22 +84,23 @@ public int run(String[] args) throws Exception {
return -1;
}
- String inputPath = args[0];
- String outputPath = args[1];
- String outputFile = args[2];
+ Path inputDirPath = new Path(args[0]);
+ String outputDirPathname = args[1];
+ Path outputDirPath = new Path(outputDirPathname);
+ Path outputFilePath = new Path(args[2]);
LOG.info("Tool: " + NumberAquaint2Documents2.class.getCanonicalName());
- LOG.info(" - Input path: " + inputPath);
- LOG.info(" - Output path: " + outputPath);
- LOG.info(" - Output file: " + outputFile);
+ LOG.info(" - Input dir path: " + inputDirPath);
+ LOG.info(" - Output dir path: " + outputDirPath);
+ LOG.info(" - Output file path: " + outputFilePath);
Job job = new Job(getConf(), NumberAquaint2Documents2.class.getSimpleName());
job.setJarByClass(NumberAquaint2Documents2.class);
job.setNumReduceTasks(1);
- FileInputFormat.setInputPaths(job, new Path(inputPath));
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
+ FileInputFormat.setInputPaths(job, inputDirPath);
+ FileOutputFormat.setOutputPath(job, outputDirPath);
FileOutputFormat.setCompressOutput(job, false);
job.setInputFormatClass(Aquaint2DocumentInputFormat2.class);
@@ -110,13 +112,15 @@ public int run(String[] args) throws Exception {
job.setReducerClass(MyReducer.class);
// Delete the output directory if it exists already.
- FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);
+ FileSystem.get(job.getConfiguration()).delete(outputDirPath, true);
job.waitForCompletion(true);
- String input = outputPath + (outputPath.endsWith("/") ? "" : "/") + "/part-r-00000";
- Aquaint2DocnoMapping.writeDocnoData(new Path(input), new Path(outputFile),
- FileSystem.get(getConf()));
+ Path inputFilePath = new Path(outputDirPathname
+ + (outputDirPathname.endsWith("/") ? "" : "/")
+ + "/part-r-00000");
+ Aquaint2DocnoMapping.writeDocnoData(inputFilePath, outputFilePath,
+ FileSystem.get(getConf()));
return 0;
}
From b86e608530c845a8479aecbaa1a0f757357ab777 Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Tue, 15 Nov 2011 18:41:11 -0500
Subject: [PATCH 12/18] added support for original Aquaint document format
---
.../aquaint2/Aquaint2DocnoMapping.java | 37 +++++++-------
.../collection/aquaint2/Aquaint2Document.java | 48 +++++++++++++++----
2 files changed, 61 insertions(+), 24 deletions(-)
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
index 098b893e4..a866efb15 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
@@ -48,11 +48,13 @@ public class Aquaint2DocnoMapping implements DocnoMapping {
public int getDocno(String docid) {
LOG.trace("getDocno(docid: " + docid + ")");
Preconditions.checkNotNull(docid);
- String source = docid.substring(0, 7);
- int year = Integer.parseInt(docid.substring (8, 12));
- int month = Integer.parseInt(docid.substring (12, 14));
- int day = Integer.parseInt(docid.substring (14, 16));
- int articleNo = Integer.parseInt(docid.substring (17, 21));
+ int sourceLength = docid.length() - 13;
+ String source = docid.substring(0, sourceLength);
+ int year = Integer.parseInt(docid.substring (sourceLength, sourceLength + 4));
+ int month = Integer.parseInt(docid.substring (sourceLength + 4, sourceLength + 6));
+ int day = Integer.parseInt(docid.substring (sourceLength + 6, sourceLength + 8));
+ int articleNo = Integer.parseInt(docid.substring (sourceLength + 9, sourceLength + 13));
+
// first traverse the entries to find the month entry and get its days
int entryId = findEntryId(source, year, month);
@@ -109,14 +111,14 @@ public String getDocid(int docno) {
String source = entryMetaInfo[1];
int year = Integer.parseInt(entryMetaInfo[2]);
int month = Integer.parseInt(entryMetaInfo[3]);
- LOG.debug("looking at: " + String.format("%s_%04d%02d__.____", source, year, month));
+ LOG.debug("looking at: " + String.format("%s%04d%02d__.____", source, year, month));
// then traverse the days to find the day and skip over missing articles to get the article number
String[] entryEltParts = findEntryEltParts (docno, entryElts);
int offset = Integer.parseInt(entryEltParts[0]);
String[] entryDayParts = entryEltParts[1].split(",");
int day = Integer.parseInt(entryDayParts[0]);
- LOG.debug("found day: " + day + ", looking at: " + String.format("%s_%04d%02d%02d.____", source, year, month, day));
+ LOG.debug("found day: " + day + ", looking at: " + String.format("%s%04d%02d%02d.____", source, year, month, day));
int articleNo = docno - offset;
for (int i = 1; i < entryDayParts.length; i++) {
int missingNo = Integer.parseInt(entryDayParts[i]);
@@ -124,8 +126,8 @@ public String getDocid(int docno) {
LOG.debug("skipping missingNo: " + missingNo);
articleNo++;
}
- LOG.debug("found articleNo: " + articleNo + ", looking at: " + String.format("%s_%04d%02d%02d.%04d", source, year, month, day, articleNo));
- String result = String.format ("%s_%04d%02d%02d.%04d", source, year, month, day, articleNo);
+ LOG.debug("found articleNo: " + articleNo + ", looking at: " + String.format("%s%04d%02d%02d.%04d", source, year, month, day, articleNo));
+ String result = String.format ("%s%04d%02d%02d.%04d", source, year, month, day, articleNo);
LOG.trace("getDocid returning: " + result);
return result;
}
@@ -166,6 +168,7 @@ public void loadMapping(Path p, FileSystem fs) throws IOException {
docidEntries = Aquaint2DocnoMapping.readDocnoData(p, fs);
}
+
static public void writeDocnoData(Path input, Path output, FileSystem fs) throws IOException {
LOG.info("Writing docno data to " + output);
LineReader reader = new LineReader(fs.open(input));
@@ -186,11 +189,12 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
while (reader.readLine(line) > 0) {
String docid = line.toString();
- String source = docid.substring(0, 7);
- int year = Integer.parseInt(docid.substring (8, 12));
- int month = Integer.parseInt(docid.substring (12, 14));
- int day = Integer.parseInt(docid.substring (14, 16));
- int articleNo = Integer.parseInt(docid.substring (17, 21));
+ int sourceLength = docid.indexOf("\t") - 13;
+ String source = docid.substring(0, sourceLength);
+ int year = Integer.parseInt(docid.substring (sourceLength, sourceLength + 4));
+ int month = Integer.parseInt(docid.substring (sourceLength + 4, sourceLength + 6));
+ int day = Integer.parseInt(docid.substring (sourceLength + 6, sourceLength + 8));
+ int articleNo = Integer.parseInt(docid.substring (sourceLength + 9, sourceLength + 13));
LOG.debug("prevSource: " + prevSource + ", prevYear: " + prevYear + ", prevMonth: " + prevMonth + ", prevDay: " + prevDay + ", prevArticleNo: " + prevArticleNo);
LOG.debug("source: " + source + ", year: " + year + ", month: " + month + ", day: " + day + ", articleNo: " + articleNo);
@@ -297,9 +301,10 @@ public static void main(String[] args) throws IOException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
- System.out.println("loading mapping file " + args[1]);
+ Path mappingPath = new Path(args[1]);
+ System.out.println("loading mapping file " + mappingPath);
Aquaint2DocnoMapping mapping = new Aquaint2DocnoMapping();
- mapping.loadMapping(new Path(args[1]), fs);
+ mapping.loadMapping(mappingPath, fs);
if (args[0].equals("list")) {
for (int i = 1; i < mapping.docidEntries.length; i++) {
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
index 0b85cc523..cdeec69eb 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
@@ -30,14 +30,18 @@
public class Aquaint2Document extends Indexable {
private static final Logger LOG = Logger.getLogger(Aquaint2Document.class);
- { LOG.setLevel (Level.INFO); }
+ {
+ LOG.setLevel(Level.INFO);
+ //LOG.setLevel(Level.TRACE);
+ }
private static Pattern TAGS_PATTERN = Pattern.compile("<[^>]+>");
private static Pattern WHITESPACE_PATTERN = Pattern.compile("\t|\n");
- public static final String XML_START_TAG = "");
+ if (start == -1) {
+ docid = "";
+ } else {
+ int end = raw.indexOf("");
+ docid = raw.substring(start + 7, end).trim();
+ }
+ LOG.trace("in setAquaintDocid, docid: " + docid);
+ }
+
+
+ private void setAquaint2Docid() {
+ int start = 9;
+ int end = raw.indexOf("\"", start);
+ docid = raw.substring(start, end).trim();
+ LOG.trace("in setAquaint2Docid, docid: " + docid);
+ }
+
+
public String getHeadline() {
if (headline == null) {
int start = raw.indexOf("");
-
if (start == -1) {
headline = "";
} else {
@@ -95,6 +123,7 @@ public String getHeadline() {
return headline;
}
+
@Override
public String getContent() {
if (text == null) {
@@ -109,10 +138,10 @@ public String getContent() {
text = TAGS_PATTERN.matcher(text).replaceAll("");
}
}
-
return text;
}
+
public static void readDocument(Aquaint2Document doc, String s) {
if (s == null) {
throw new RuntimeException("Error, can't read null string!");
@@ -122,6 +151,9 @@ public static void readDocument(Aquaint2Document doc, String s) {
doc.docid = null;
doc.headline = null;
doc.text = null;
+ //doc.isAquaint2 = (doc.raw.indexOf("\n") == -1);
+ doc.isAquaint2 = (doc.raw.indexOf("") == -1);
+
LOG.debug("docid: " + doc.getDocid() + " length: " + doc.raw.length());
}
}
From 6484c858779c20e20579a352d78213576d30909f Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Thu, 17 Nov 2011 15:08:27 -0500
Subject: [PATCH 13/18] extended Aquaint2 code to support original Aquaint
corpus
---
.../aquaint2/Aquaint2DocnoMapping.java | 40 ++++----
.../collection/aquaint2/Aquaint2Document.java | 95 +++++++++++++------
.../aquaint2/Aquaint2DocumentInputFormat.java | 3 -
.../Aquaint2DocumentInputFormat2.java | 7 +-
.../aquaint2/NumberAquaint2Documents2.java | 17 +++-
5 files changed, 108 insertions(+), 54 deletions(-)
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
index a866efb15..f146b5210 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
@@ -55,27 +55,28 @@ public int getDocno(String docid) {
int day = Integer.parseInt(docid.substring (sourceLength + 6, sourceLength + 8));
int articleNo = Integer.parseInt(docid.substring (sourceLength + 9, sourceLength + 13));
+ LOG.debug("source: " + source + ", year: " + year + ", month: " + month + ", day: " + day + ", articleNo: " + articleNo);
// first traverse the entries to find the month entry and get its days
int entryId = findEntryId(source, year, month);
LOG.debug("entryId: " + entryId);
- String entryElt = docidEntries[entryId].split("\t")[day];
- LOG.debug("entryElt: " + entryElt);
-
- // then traverse the days to find the day and skip over missing articles to get the article number
- String[] entryEltParts = entryElt.split(" ");
- int result = articleNo + Integer.parseInt(entryEltParts[0]);
- String[] entryDayParts = entryEltParts[1].split(",");
- for (int i = 1; i < entryDayParts.length; i++) {
- int missingNo = Integer.parseInt(entryDayParts[i]);
- if (articleNo < missingNo) break;
- LOG.debug("skipping missingNo: " + missingNo);
- result--;
- }
+ String entryElt = docidEntries[entryId].split("\t")[day];
+ LOG.debug("entryElt: " + entryElt);
+
+ // then traverse the days to find the day and skip over missing articles to get the article number
+ String[] entryEltParts = entryElt.split(" ");
+ int result = articleNo + Integer.parseInt(entryEltParts[0]);
+ String[] entryDayParts = entryEltParts[1].split(",");
+ for (int i = 1; i < entryDayParts.length; i++) {
+ int missingNo = Integer.parseInt(entryDayParts[i]);
+ if (articleNo < missingNo) break;
+ LOG.debug("skipping missingNo: " + missingNo);
+ result--;
+ }
- LOG.trace("getDocno returning: " + result);
- return result;
+ LOG.trace("getDocno returning: " + result);
+ return result;
}
private int findEntryId(String source, int year, int month) {
@@ -170,6 +171,7 @@ public void loadMapping(Path p, FileSystem fs) throws IOException {
static public void writeDocnoData(Path input, Path output, FileSystem fs) throws IOException {
+ //LOG.setLevel(Level.TRACE);
LOG.info("Writing docno data to " + output);
LineReader reader = new LineReader(fs.open(input));
List list = Lists.newArrayList();
@@ -189,6 +191,7 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
while (reader.readLine(line) > 0) {
String docid = line.toString();
+ LOG.debug("reading line docid: " + docid);
int sourceLength = docid.indexOf("\t") - 13;
String source = docid.substring(0, sourceLength);
int year = Integer.parseInt(docid.substring (sourceLength, sourceLength + 4));
@@ -201,7 +204,7 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
if (! source.equals(prevSource) ||
year != prevYear ||
month != prevMonth) {
- LOG.debug("currentEntry: " + currentEntry);
+ LOG.debug("diff source, year or month, currentEntry: " + currentEntry);
if (currentEntry != null) {
list.add(currentEntry.toString());
list.add("");
@@ -210,16 +213,19 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
currentEntry = new StringBuilder(cnt + " " + source + " " + year + " " + month);
prevDay = 0;
prevArticleNo = 0;
+ LOG.debug("diff source, year or month, reset currentEntry: " + currentEntry);
}
if (day != prevDay) {
for (int i = prevDay + 1; i <= day; i++) {
currentEntry.append("\t" + cnt + " " + i);
}
+ LOG.debug("diff day, currentEntry: " + currentEntry);
prevArticleNo = 0;
// writeUTF can't write a string longer than 64k, so we output a chunk at a time
// here then concatenate strings between s
list.add(currentEntry.toString());
currentEntry = new StringBuilder ();
+ LOG.debug("diff day, reset currentEntry");
}
if (articleNo != prevArticleNo + 1) {
// we have missing article numbers - gather them
@@ -238,6 +244,7 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
LOG.info(cnt + " docs");
}
}
+ LOG.debug("adding final currentEntry: " + currentEntry);
list.add(currentEntry.toString());
list.add("");
numEntries++;
@@ -261,6 +268,7 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
}
static public String[] readDocnoData(Path p, FileSystem fs) throws IOException {
+ //LOG.setLevel(Level.TRACE);
LOG.trace("readDocnoData (p: " + p + ", fs)");
FSDataInputStream in = fs.open(p);
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
index cdeec69eb..79f7ff8ee 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
@@ -21,7 +21,11 @@
import java.io.IOException;
import java.util.regex.Pattern;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.util.LineReader;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
@@ -38,16 +42,43 @@ public class Aquaint2Document extends Indexable {
private static Pattern TAGS_PATTERN = Pattern.compile("<[^>]+>");
private static Pattern WHITESPACE_PATTERN = Pattern.compile("\t|\n");
- public static final String XML_START_TAG = "");
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ //LOG.info("in getXmlStartTag, isAquaint2: " + isAquaint2);
+ if (isAquaint2) {
+ return AQUAINT2_XML_START_TAG;
+ } else {
+ return AQUAINT_XML_START_TAG;
+ }
+ }
+
+
+ public static String getXmlEndTag() {
+ return XML_END_TAG;
+ }
@Override
@@ -67,10 +98,32 @@ public void readFields(DataInput in) throws IOException {
}
+  public String getElementText(String elementTagName) { // trimmed text of the first <tag>...</tag> element in raw, nested tags removed, tabs/newlines turned into spaces
+    String result = ""; // returned as-is when the opening tag is absent
+    int index = raw.indexOf("<" + elementTagName + ">");
+    if (index != -1) {
+      int start = index + elementTagName.length() + 2; // first character after "<tag>"
+      int end = raw.indexOf("</" + elementTagName + ">", start); // closing tag, searched only after the opening one
+      if (end != -1) {
+        result = raw.substring(start, end).trim();
+      } else {
+        // No closing tag: keep the remainder of raw. Log the docid field, because getDocid() can call back into this method.
+        LOG.error("docid: " + docid + ", index: " + index + ", start: " + start + ", end: " + end);
+        LOG.error("raw:\n" + raw);
+        result = raw.substring(start).trim();
+        LOG.error("found element text: " + result);
+      }
+      result = TAGS_PATTERN.matcher(result).replaceAll("");
+      result = WHITESPACE_PATTERN.matcher(result).replaceAll(" ");
+    }
+    return result;
+  }
+
+
@Override
public String getDocid() {
if (docid == null) {
- if (isAquaint2) {
+ if (isAquaint2Document) {
setAquaint2Docid();
} else {
setAquaintDocid();
@@ -81,18 +134,14 @@ public String getDocid() {
private void setAquaintDocid() {
- int start = raw.indexOf("");
- if (start == -1) {
- docid = "";
- } else {
- int end = raw.indexOf("");
- docid = raw.substring(start + 7, end).trim();
- }
+ LOG.trace("setAquaintDocid()");
+ docid = getElementText("DOCNO");
LOG.trace("in setAquaintDocid, docid: " + docid);
}
private void setAquaint2Docid() {
+ LOG.trace("setAquaint2Docid()");
int start = 9;
int end = raw.indexOf("\"", start);
docid = raw.substring(start, end).trim();
@@ -102,22 +151,9 @@ private void setAquaint2Docid() {
public String getHeadline() {
if (headline == null) {
- int start = raw.indexOf("");
- if (start == -1) {
- headline = "";
- } else {
- int end = raw.indexOf("");
- try {
- headline = raw.substring(start + 10, end).trim();
- } catch (Exception e) {
- LOG.error("exception: " + e);
- LOG.error("docid: " + getDocid () + ", start: " + start + ", end: " + end);
- LOG.error("raw:\n" + raw);
- headline = raw.substring(start + 10).trim();
- LOG.error("updated headline: " + headline);
- }
- headline = TAGS_PATTERN.matcher(headline).replaceAll("");
- headline = WHITESPACE_PATTERN.matcher(headline).replaceAll(" ");
+ headline = getElementText("HEADLINE");
+ if (! isAquaint2Document) {
+ headline = getElementText("SLUG").trim().toLowerCase() + ": " + headline;
}
}
return headline;
@@ -143,17 +179,18 @@ public String getContent() {
public static void readDocument(Aquaint2Document doc, String s) {
+ LOG.trace("readDocument(doc, s), s: \n" + s);
if (s == null) {
throw new RuntimeException("Error, can't read null string!");
}
doc.raw = s;
+ doc.isAquaint2Document = doc.raw.startsWith("\n") == -1);
- doc.isAquaint2 = (doc.raw.indexOf("") == -1);
LOG.debug("docid: " + doc.getDocid() + " length: " + doc.raw.length());
+ LOG.trace("readDocument returning");
}
}
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat.java
index c2f9f4dd5..13be1b9b6 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat.java
@@ -48,9 +48,6 @@ public static class Aquaint2DocumentRecordReader implements
private final LongWritable offset = new LongWritable();
public Aquaint2DocumentRecordReader(FileSplit split, JobConf conf) throws IOException {
- conf.set(XMLInputFormat.START_TAG_KEY, Aquaint2Document.XML_START_TAG);
- conf.set(XMLInputFormat.END_TAG_KEY, Aquaint2Document.XML_END_TAG);
-
reader = new XMLRecordReader(split, conf);
}
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat2.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat2.java
index aacc4e4c1..c7ba65b65 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat2.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocumentInputFormat2.java
@@ -28,6 +28,9 @@
import edu.umd.cloud9.collection.XMLInputFormat;
import edu.umd.cloud9.collection.XMLInputFormat2.XMLRecordReader;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+
public class Aquaint2DocumentInputFormat2 extends
IndexableFileInputFormat2 {
@@ -45,10 +48,6 @@ public static class Aquaint2DocumentRecordReader extends
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
- Configuration conf = context.getConfiguration();
- conf.set(XMLInputFormat.START_TAG_KEY, Aquaint2Document.XML_START_TAG);
- conf.set(XMLInputFormat.END_TAG_KEY, Aquaint2Document.XML_END_TAG);
-
reader.initialize(split, context);
}
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java b/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java
index d07643ccc..812433754 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java
@@ -33,10 +33,16 @@
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
+import org.apache.log4j.Level;
import org.apache.log4j.Logger;
+
public class NumberAquaint2Documents2 extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(NumberAquaint2Documents2.class);
+ {
+ LOG.setLevel(Level.INFO);
+ }
+
private static enum Count { DOCS };
private static class MyMapper extends Mapper {
@@ -49,6 +55,8 @@ public void map(LongWritable key, Aquaint2Document doc, Context context)
context.getCounter(Count.DOCS).increment(1);
docid.set(doc.getDocid());
context.write(docid, one);
+ LOG.setLevel(Level.INFO);
+ LOG.trace("map output (" + docid + ", " + one + ")");
}
}
@@ -60,6 +68,8 @@ public void reduce(Text key, Iterable values, Context context)
throws IOException, InterruptedException {
context.write(key, cnt);
cnt.set(cnt.get() + 1);
+ LOG.setLevel(Level.INFO);
+ LOG.trace("reduce output (" + key + ", " + cnt + ")");
}
}
@@ -92,7 +102,10 @@ public int run(String[] args) throws Exception {
LOG.info(" - Output path: " + outputPath);
LOG.info(" - Output file: " + outputFile);
- Job job = new Job(getConf(), NumberAquaint2Documents2.class.getSimpleName());
+ Configuration conf = getConf();
+ FileSystem fs = FileSystem.get(conf);
+
+ Job job = new Job(conf, NumberAquaint2Documents2.class.getSimpleName());
job.setJarByClass(NumberAquaint2Documents2.class);
job.setNumReduceTasks(1);
@@ -110,7 +123,7 @@ public int run(String[] args) throws Exception {
job.setReducerClass(MyReducer.class);
// Delete the output directory if it exists already.
- FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);
+ fs.delete(new Path(outputPath), true);
job.waitForCompletion(true);
From 0963700ca49af11dcf71dbf7ad4ef8bfdcc9417a Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Thu, 17 Nov 2011 15:42:08 -0500
Subject: [PATCH 14/18] minor fix
---
.../cloud9/collection/aquaint2/NumberAquaint2Documents2.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java b/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java
index 5f9b753d5..737a59bd1 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/NumberAquaint2Documents2.java
@@ -124,7 +124,7 @@ public int run(String[] args) throws Exception {
job.setReducerClass(MyReducer.class);
// Delete the output directory if it exists already.
- fs.delete(new Path(outputDirPath), true);
+ fs.delete(outputDirPath, true);
job.waitForCompletion(true);
From 9da236ed4fa9087f035558b5a0c02146a2c310b2 Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Tue, 22 Nov 2011 01:49:10 -0500
Subject: [PATCH 15/18] fix to allow multiple Aquaint2 DTDs but just one for
Aquaint
---
.../edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
index 79f7ff8ee..567d45d2d 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
@@ -63,7 +63,10 @@ public static String getXmlStartTag(FileSystem fs, String inputFile) {
Text line = new Text();
reader.readLine(line);
reader.readLine(line);
- isAquaint2 = line.toString().endsWith("'a2_newswire_xml.dtd'>");
+ // Aquaint: 'aquaint.dtd'
+ // Aquaint2: 'a2_newswire_xml.dtd'
+ // Gigaword: 'gigaword.dtd'
+ isAquaint2 = ! line.toString().endsWith("'aquaint.dtd'>");
} catch (IOException e) {
e.printStackTrace();
}
From ab4a525af86fe368fd41f38eef77bd9c9db4456d Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Fri, 2 Mar 2012 14:16:13 -0500
Subject: [PATCH 16/18] added sorting entries by ascending value
---
src/dist/edu/umd/cloud9/util/map/HMapIF.java | 1949 +++++++++---------
1 file changed, 1011 insertions(+), 938 deletions(-)
diff --git a/src/dist/edu/umd/cloud9/util/map/HMapIF.java b/src/dist/edu/umd/cloud9/util/map/HMapIF.java
index b7c449fc2..abdaba732 100644
--- a/src/dist/edu/umd/cloud9/util/map/HMapIF.java
+++ b/src/dist/edu/umd/cloud9/util/map/HMapIF.java
@@ -1,5 +1,5 @@
/*
- * @(#)HashMap.java 1.73 07/03/13
+ * @(#)HashMap.java 1.73 07/03/13
*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
@@ -26,338 +26,338 @@
*/
public class HMapIF implements MapIF, Cloneable, Serializable {
- /**
- * The default initial capacity - MUST be a power of two.
- */
- static final int DEFAULT_INITIAL_CAPACITY = 1024;
-
- /**
- * The maximum capacity, used if a higher value is implicitly specified by
- * either of the constructors with arguments. MUST be a power of two <= 1<<30.
- */
- static final int MAXIMUM_CAPACITY = 1 << 30;
-
- /**
- * The load factor used when none specified in constructor.
- */
- static final float DEFAULT_LOAD_FACTOR = 0.75f;
-
- /**
- * The table, resized as necessary. Length MUST Always be a power of two.
- */
- transient Entry[] table;
-
- /**
- * The number of key-value mappings contained in this map.
- */
- transient int size;
-
- /**
- * The next size value at which to resize (capacity * load factor).
- *
- * @serial
- */
- int threshold;
-
- /**
- * The load factor for the hash table.
- *
- * @serial
- */
- final float loadFactor;
-
- /**
- * The number of times this HMapIF has been structurally modified Structural
- * modifications are those that change the number of mappings in the HMapIF
- * or otherwise modify its internal structure (e.g., rehash). This field is
- * used to make iterators on Collection-views of the HMapIF fail-fast. (See
- * ConcurrentModificationException).
- */
- transient volatile int modCount;
-
- /**
- * Constructs an empty HMapIF with the specified initial capacity
- * and load factor.
- *
- * @param initialCapacity
- * the initial capacity
- * @param loadFactor
- * the load factor
- * @throws IllegalArgumentException
- * if the initial capacity is negative or the load factor is
- * nonpositive
- */
- public HMapIF(int initialCapacity, float loadFactor) {
- if (initialCapacity < 0)
- throw new IllegalArgumentException("Illegal initial capacity: " + initialCapacity);
- if (initialCapacity > MAXIMUM_CAPACITY)
- initialCapacity = MAXIMUM_CAPACITY;
- if (loadFactor <= 0 || Float.isNaN(loadFactor))
- throw new IllegalArgumentException("Illegal load factor: " + loadFactor);
-
- // Find a power of 2 >= initialCapacity
- int capacity = 1;
- while (capacity < initialCapacity)
- capacity <<= 1;
-
- this.loadFactor = loadFactor;
- threshold = (int) (capacity * loadFactor);
- table = new Entry[capacity];
- init();
- }
-
- /**
- * Constructs an empty HMapIF with the specified initial capacity
- * and the default load factor (0.75).
- *
- * @param initialCapacity
- * the initial capacity.
- * @throws IllegalArgumentException
- * if the initial capacity is negative.
- */
- public HMapIF(int initialCapacity) {
- this(initialCapacity, DEFAULT_LOAD_FACTOR);
- }
-
- /**
- * Constructs an empty HMapIF with the default initial capacity
- * (1024) and the default load factor (0.75).
- */
- public HMapIF() {
- this.loadFactor = DEFAULT_LOAD_FACTOR;
- threshold = (int) (DEFAULT_INITIAL_CAPACITY * DEFAULT_LOAD_FACTOR);
- table = new Entry[DEFAULT_INITIAL_CAPACITY];
- init();
- }
-
- /**
- * Constructs a new HMapIF with the same mappings as the
- * specified MapIF. The HMapIF is created with default
- * load factor (0.75) and an initial capacity sufficient to hold the
- * mappings in the specified MapIF.
- *
- * @param m
- * the map whose mappings are to be placed in this map
- * @throws NullPointerException
- * if the specified map is null
- */
- public HMapIF(MapIF m) {
- this(Math.max((int) (m.size() / DEFAULT_LOAD_FACTOR) + 1, DEFAULT_INITIAL_CAPACITY),
- DEFAULT_LOAD_FACTOR);
- putAllForCreate(m);
- }
-
- // internal utilities
-
- /**
- * Initialization hook for subclasses. This method is called in all
- * constructors and pseudo-constructors (clone, readObject) after HMapIF has
- * been initialized but before any entries have been inserted. (In the
- * absence of this method, readObject would require explicit knowledge of
- * subclasses.)
- */
- void init() {
- }
-
- /**
- * Applies a supplemental hash function to a given hashCode, which defends
- * against poor quality hash functions. This is critical because HMapIF uses
- * power-of-two length hash tables, that otherwise encounter collisions for
- * hashCodes that do not differ in lower bits. Note: Null keys always map to
- * hash 0, thus index 0.
- */
- static int hash(int h) {
- // This function ensures that hashCodes that differ only by
- // constant multiples at each bit position have a bounded
- // number of collisions (approximately 8 at default load factor).
- h ^= (h >>> 20) ^ (h >>> 12);
- return h ^ (h >>> 7) ^ (h >>> 4);
- }
-
- /**
- * Returns index for hash code h.
- */
- static int indexFor(int h, int length) {
- return h & (length - 1);
- }
-
- // doc copied from interface
- public int size() {
- return size;
- }
-
- // doc copied from interface
- public boolean isEmpty() {
- return size == 0;
- }
-
- // doc copied from interface
- public float get(int key) {
- int hash = hash(key);
- for (Entry e = table[indexFor(hash, table.length)]; e != null; e = e.next) {
- int k;
- if (e.hash == hash && ((k = e.key) == key || key == k))
- return e.value;
- }
-
- return DEFAULT_VALUE;
- }
-
- // doc copied from interface
- public boolean containsKey(int key) {
- return getEntry(key) != null;
- }
-
- /**
- * Returns the entry associated with the specified key in the HMapIF.
- * Returns null if the HMapIF contains no mapping for the key.
- */
- final Entry getEntry(int key) {
- int hash = hash(key);
- for (Entry e = table[indexFor(hash, table.length)]; e != null; e = e.next) {
- int k;
- if (e.hash == hash && ((k = e.key) == key || key == k))
- return e;
- }
- return null;
- }
-
- // doc copied from interface
- public float put(int key, float value) {
- int hash = hash(key);
- int i = indexFor(hash, table.length);
- for (Entry e = table[i]; e != null; e = e.next) {
- int k;
- if (e.hash == hash && ((k = e.key) == key || key == k)) {
- float oldValue = e.value;
- e.value = value;
- e.recordAccess(this);
- return oldValue;
- }
- }
-
- modCount++;
- addEntry(hash, key, value, i);
- return DEFAULT_VALUE;
- }
-
- /**
- * This method is used instead of put by constructors and pseudoconstructors
- * (clone, readObject). It does not resize the table, check for
- * comodification, etc. It calls createEntry rather than addEntry.
- */
- private void putForCreate(int key, float value) {
- int hash = hash(key);
- int i = indexFor(hash, table.length);
-
- /**
- * Look for preexisting entry for key. This will never happen for clone
- * or deserialize. It will only happen for construction if the input Map
- * is a sorted map whose ordering is inconsistent w/ equals.
- */
- for (Entry e = table[i]; e != null; e = e.next) {
- int k;
- if (e.hash == hash && ((k = e.key) == key || key == k)) {
- e.value = value;
- return;
- }
- }
-
- createEntry(hash, key, value, i);
- }
-
- private void putAllForCreate(MapIF m) {
- for (Iterator<? extends MapIF.Entry> i = m.entrySet().iterator(); i.hasNext();) {
- MapIF.Entry e = i.next();
- putForCreate(e.getKey(), e.getValue());
- }
- }
-
- /**
- * Rehashes the contents of this map into a new array with a larger
- * capacity. This method is called automatically when the number of keys in
- * this map reaches its threshold.
- *
- * If current capacity is MAXIMUM_CAPACITY, this method does not resize the
- * map, but sets threshold to Integer.MAX_VALUE. This has the effect of
- * preventing future calls.
- *
- * @param newCapacity
- * the new capacity, MUST be a power of two; must be greater than
- * current capacity unless current capacity is MAXIMUM_CAPACITY
- * (in which case value is irrelevant).
- */
- void resize(int newCapacity) {
- Entry[] oldTable = table;
- int oldCapacity = oldTable.length;
- if (oldCapacity == MAXIMUM_CAPACITY) {
- threshold = Integer.MAX_VALUE;
- return;
- }
-
- Entry[] newTable = new Entry[newCapacity];
- transfer(newTable);
- table = newTable;
- threshold = (int) (newCapacity * loadFactor);
- }
-
- /**
- * Transfers all entries from current table to newTable.
- */
- void transfer(Entry[] newTable) {
- Entry[] src = table;
- int newCapacity = newTable.length;
- for (int j = 0; j < src.length; j++) {
- Entry e = src[j];
- if (e != null) {
- src[j] = null;
- do {
- Entry next = e.next;
- int i = indexFor(e.hash, newCapacity);
- e.next = newTable[i];
- newTable[i] = e;
- e = next;
- } while (e != null);
- }
- }
- }
-
- // doc copied from interface
- public void putAll(MapIF m) {
- int numKeysToBeAdded = m.size();
- if (numKeysToBeAdded == 0)
- return;
-
- /*
- * Expand the map if the map if the number of mappings to be added is
- * greater than or equal to threshold. This is conservative; the obvious
- * condition is (m.size() + size) >= threshold, but this condition could
- * result in a map with twice the appropriate capacity, if the keys to
- * be added overlap with the keys already in this map. By using the
- * conservative calculation, we subject ourself to at most one extra
- * resize.
- */
- if (numKeysToBeAdded > threshold) {
- int targetCapacity = (int) (numKeysToBeAdded / loadFactor + 1);
- if (targetCapacity > MAXIMUM_CAPACITY)
- targetCapacity = MAXIMUM_CAPACITY;
- int newCapacity = table.length;
- while (newCapacity < targetCapacity)
- newCapacity <<= 1;
- if (newCapacity > table.length)
- resize(newCapacity);
- }
-
- for (Iterator<? extends MapIF.Entry> i = m.entrySet().iterator(); i.hasNext();) {
- MapIF.Entry e = i.next();
- put(e.getKey(), e.getValue());
- }
- }
+ /**
+ * The default initial capacity - MUST be a power of two.
+ */
+ static final int DEFAULT_INITIAL_CAPACITY = 1024;
+
+ /**
+ * The maximum capacity, used if a higher value is implicitly specified by
+ * either of the constructors with arguments. MUST be a power of two <= 1<<30.
+ */
+ static final int MAXIMUM_CAPACITY = 1 << 30;
+
+ /**
+ * The load factor used when none specified in constructor.
+ */
+ static final float DEFAULT_LOAD_FACTOR = 0.75f;
+
+ /**
+ * The table, resized as necessary. Length MUST Always be a power of two.
+ */
+ transient Entry[] table;
+
+ /**
+ * The number of key-value mappings contained in this map.
+ */
+ transient int size;
+
+ /**
+ * The next size value at which to resize (capacity * load factor).
+ *
+ * @serial
+ */
+ int threshold;
+
+ /**
+ * The load factor for the hash table.
+ *
+ * @serial
+ */
+ final float loadFactor;
+
+ /**
+ * The number of times this HMapIF has been structurally modified Structural
+ * modifications are those that change the number of mappings in the HMapIF
+ * or otherwise modify its internal structure (e.g., rehash). This field is
+ * used to make iterators on Collection-views of the HMapIF fail-fast. (See
+ * ConcurrentModificationException).
+ */
+ transient volatile int modCount;
+
+ /**
+ * Constructs an empty HMapIF with the specified initial capacity
+ * and load factor.
+ *
+ * @param initialCapacity
+ * the initial capacity
+ * @param loadFactor
+ * the load factor
+ * @throws IllegalArgumentException
+ * if the initial capacity is negative or the load factor is
+ * nonpositive
+ */
+ public HMapIF(int initialCapacity, float loadFactor) {
+ if (initialCapacity < 0)
+ throw new IllegalArgumentException("Illegal initial capacity: " + initialCapacity);
+ if (initialCapacity > MAXIMUM_CAPACITY)
+ initialCapacity = MAXIMUM_CAPACITY;
+ if (loadFactor <= 0 || Float.isNaN(loadFactor))
+ throw new IllegalArgumentException("Illegal load factor: " + loadFactor);
+
+ // Find a power of 2 >= initialCapacity
+ int capacity = 1;
+ while (capacity < initialCapacity)
+ capacity <<= 1;
+
+ this.loadFactor = loadFactor;
+ threshold = (int) (capacity * loadFactor);
+ table = new Entry[capacity];
+ init();
+ }
+
+ /**
+ * Constructs an empty HMapIF with the specified initial capacity
+ * and the default load factor (0.75).
+ *
+ * @param initialCapacity
+ * the initial capacity.
+ * @throws IllegalArgumentException
+ * if the initial capacity is negative.
+ */
+ public HMapIF(int initialCapacity) {
+ this(initialCapacity, DEFAULT_LOAD_FACTOR);
+ }
+
+ /**
+ * Constructs an empty HMapIF with the default initial capacity
+ * (1024) and the default load factor (0.75).
+ */
+ public HMapIF() {
+ this.loadFactor = DEFAULT_LOAD_FACTOR;
+ threshold = (int) (DEFAULT_INITIAL_CAPACITY * DEFAULT_LOAD_FACTOR);
+ table = new Entry[DEFAULT_INITIAL_CAPACITY];
+ init();
+ }
+
+ /**
+ * Constructs a new HMapIF with the same mappings as the
+ * specified MapIF. The HMapIF is created with default
+ * load factor (0.75) and an initial capacity sufficient to hold the
+ * mappings in the specified MapIF.
+ *
+ * @param m
+ * the map whose mappings are to be placed in this map
+ * @throws NullPointerException
+ * if the specified map is null
+ */
+ public HMapIF(MapIF m) {
+ this(Math.max((int) (m.size() / DEFAULT_LOAD_FACTOR) + 1, DEFAULT_INITIAL_CAPACITY),
+ DEFAULT_LOAD_FACTOR);
+ putAllForCreate(m);
+ }
+
+ // internal utilities
+
+ /**
+ * Initialization hook for subclasses. This method is called in all
+ * constructors and pseudo-constructors (clone, readObject) after HMapIF has
+ * been initialized but before any entries have been inserted. (In the
+ * absence of this method, readObject would require explicit knowledge of
+ * subclasses.)
+ */
+ void init() {
+ }
+
+ /**
+ * Applies a supplemental hash function to a given hashCode, which defends
+ * against poor quality hash functions. This is critical because HMapIF uses
+ * power-of-two length hash tables, that otherwise encounter collisions for
+ * hashCodes that do not differ in lower bits. Note: Null keys always map to
+ * hash 0, thus index 0.
+ */
+ static int hash(int h) {
+ // This function ensures that hashCodes that differ only by
+ // constant multiples at each bit position have a bounded
+ // number of collisions (approximately 8 at default load factor).
+ h ^= (h >>> 20) ^ (h >>> 12);
+ return h ^ (h >>> 7) ^ (h >>> 4);
+ }
+
+ /**
+ * Returns index for hash code h.
+ */
+ static int indexFor(int h, int length) {
+ return h & (length - 1);
+ }
+
+ // doc copied from interface
+ public int size() {
+ return size;
+ }
+
+ // doc copied from interface
+ public boolean isEmpty() {
+ return size == 0;
+ }
+
+ // doc copied from interface
+ public float get(int key) {
+ int hash = hash(key);
+ for (Entry e = table[indexFor(hash, table.length)]; e != null; e = e.next) {
+ int k;
+ if (e.hash == hash && ((k = e.key) == key || key == k))
+ return e.value;
+ }
+
+ return DEFAULT_VALUE;
+ }
+
+ // doc copied from interface
+ public boolean containsKey(int key) {
+ return getEntry(key) != null;
+ }
+
+ /**
+ * Returns the entry associated with the specified key in the HMapIF.
+ * Returns null if the HMapIF contains no mapping for the key.
+ */
+ final Entry getEntry(int key) {
+ int hash = hash(key);
+ for (Entry e = table[indexFor(hash, table.length)]; e != null; e = e.next) {
+ int k;
+ if (e.hash == hash && ((k = e.key) == key || key == k))
+ return e;
+ }
+ return null;
+ }
+
+ // doc copied from interface
+ public float put(int key, float value) {
+ int hash = hash(key);
+ int i = indexFor(hash, table.length);
+ for (Entry e = table[i]; e != null; e = e.next) {
+ int k;
+ if (e.hash == hash && ((k = e.key) == key || key == k)) {
+ float oldValue = e.value;
+ e.value = value;
+ e.recordAccess(this);
+ return oldValue;
+ }
+ }
+
+ modCount++;
+ addEntry(hash, key, value, i);
+ return DEFAULT_VALUE;
+ }
+
+ /**
+ * This method is used instead of put by constructors and pseudoconstructors
+ * (clone, readObject). It does not resize the table, check for
+ * comodification, etc. It calls createEntry rather than addEntry.
+ */
+ private void putForCreate(int key, float value) {
+ int hash = hash(key);
+ int i = indexFor(hash, table.length);
+
+ /**
+ * Look for preexisting entry for key. This will never happen for clone
+ * or deserialize. It will only happen for construction if the input Map
+ * is a sorted map whose ordering is inconsistent w/ equals.
+ */
+ for (Entry e = table[i]; e != null; e = e.next) {
+ int k;
+ if (e.hash == hash && ((k = e.key) == key || key == k)) {
+ e.value = value;
+ return;
+ }
+ }
+
+ createEntry(hash, key, value, i);
+ }
+
+ private void putAllForCreate(MapIF m) {
+ for (Iterator<? extends MapIF.Entry> i = m.entrySet().iterator(); i.hasNext();) {
+ MapIF.Entry e = i.next();
+ putForCreate(e.getKey(), e.getValue());
+ }
+ }
+
+ /**
+ * Rehashes the contents of this map into a new array with a larger
+ * capacity. This method is called automatically when the number of keys in
+ * this map reaches its threshold.
+ *
+ * If current capacity is MAXIMUM_CAPACITY, this method does not resize the
+ * map, but sets threshold to Integer.MAX_VALUE. This has the effect of
+ * preventing future calls.
+ *
+ * @param newCapacity
+ * the new capacity, MUST be a power of two; must be greater than
+ * current capacity unless current capacity is MAXIMUM_CAPACITY
+ * (in which case value is irrelevant).
+ */
+ void resize(int newCapacity) {
+ Entry[] oldTable = table;
+ int oldCapacity = oldTable.length;
+ if (oldCapacity == MAXIMUM_CAPACITY) {
+ threshold = Integer.MAX_VALUE;
+ return;
+ }
+
+ Entry[] newTable = new Entry[newCapacity];
+ transfer(newTable);
+ table = newTable;
+ threshold = (int) (newCapacity * loadFactor);
+ }
+
+ /**
+ * Transfers all entries from current table to newTable.
+ */
+ void transfer(Entry[] newTable) {
+ Entry[] src = table;
+ int newCapacity = newTable.length;
+ for (int j = 0; j < src.length; j++) {
+ Entry e = src[j];
+ if (e != null) {
+ src[j] = null;
+ do {
+ Entry next = e.next;
+ int i = indexFor(e.hash, newCapacity);
+ e.next = newTable[i];
+ newTable[i] = e;
+ e = next;
+ } while (e != null);
+ }
+ }
+ }
+
+ // doc copied from interface
+ public void putAll(MapIF m) {
+ int numKeysToBeAdded = m.size();
+ if (numKeysToBeAdded == 0)
+ return;
+
+ /*
+ * Expand the map if the map if the number of mappings to be added is
+ * greater than or equal to threshold. This is conservative; the obvious
+ * condition is (m.size() + size) >= threshold, but this condition could
+ * result in a map with twice the appropriate capacity, if the keys to
+ * be added overlap with the keys already in this map. By using the
+ * conservative calculation, we subject ourself to at most one extra
+ * resize.
+ */
+ if (numKeysToBeAdded > threshold) {
+ int targetCapacity = (int) (numKeysToBeAdded / loadFactor + 1);
+ if (targetCapacity > MAXIMUM_CAPACITY)
+ targetCapacity = MAXIMUM_CAPACITY;
+ int newCapacity = table.length;
+ while (newCapacity < targetCapacity)
+ newCapacity <<= 1;
+ if (newCapacity > table.length)
+ resize(newCapacity);
+ }
+
+ for (Iterator<? extends MapIF.Entry> i = m.entrySet().iterator(); i.hasNext();) {
+ MapIF.Entry e = i.next();
+ put(e.getKey(), e.getValue());
+ }
+ }
/**
* Increments the key by some value. If the key does not exist in the map, its value is
* set to the parameter value.
- *
+ *
* @param key
* key to increment
* @param value
@@ -371,613 +371,686 @@ public void increment(int key, float value) {
}
}
- // doc copied from interface
- public float remove(int key) {
- Entry e = removeEntryForKey(key);
- if (e != null)
- return e.value;
-
- throw new NoSuchElementException();
- }
-
- /**
- * Removes and returns the entry associated with the specified key in the
- * HMapIF. Returns null if the HMapIF contains no mapping for this key.
- */
- final Entry removeEntryForKey(int key) {
- int hash = hash(key);
- int i = indexFor(hash, table.length);
- Entry prev = table[i];
- Entry e = prev;
-
- while (e != null) {
- Entry next = e.next;
- int k;
- if (e.hash == hash && ((k = e.key) == key || key == k)) {
- modCount++;
- size--;
- if (prev == e)
- table[i] = next;
- else
- prev.next = next;
- e.recordRemoval(this);
- return e;
- }
- prev = e;
- e = next;
- }
-
- return e;
- }
-
- /**
- * Special version of remove for EntrySet.
- */
- final Entry removeMapping(Object o) {
- MapII.Entry entry = (MapII.Entry) o;
- Object key = entry.getKey();
- int hash = (key == null) ? 0 : hash(key.hashCode());
- int i = indexFor(hash, table.length);
- Entry prev = table[i];
- Entry e = prev;
-
- while (e != null) {
- Entry next = e.next;
- if (e.hash == hash && e.equals(entry)) {
- modCount++;
- size--;
- if (prev == e)
- table[i] = next;
- else
- prev.next = next;
- e.recordRemoval(this);
- return e;
- }
- prev = e;
- e = next;
- }
-
- return e;
- }
-
- // doc copied from interface
- public void clear() {
- modCount++;
- Entry[] tab = table;
- for (int i = 0; i < tab.length; i++)
- tab[i] = null;
- size = 0;
- }
-
- // doc copied from interface
- public boolean containsValue(float value) {
- Entry[] tab = table;
- for (int i = 0; i < tab.length; i++)
- for (Entry e = tab[i]; e != null; e = e.next)
- if (value == e.value)
- return true;
- return false;
- }
-
- /**
- * Returns a shallow copy of this HMapIF instance: the keys and
- * values themselves are not cloned.
- *
- * @return a shallow copy of this map
- */
- public Object clone() {
- HMapIF result = null;
- try {
- result = (HMapIF) super.clone();
- } catch (CloneNotSupportedException e) {
- // assert false;
- }
- result.table = new Entry[table.length];
- result.entrySet = null;
- result.modCount = 0;
- result.size = 0;
- result.init();
- result.putAllForCreate(this);
-
- return result;
- }
-
- static class Entry implements MapIF.Entry {
- final int key;
- float value;
- Entry next;
- final int hash;
-
- /**
- * Creates new entry.
- */
- Entry(int h, int k, float v, Entry n) {
- value = v;
- next = n;
- key = k;
- hash = h;
- }
-
- public final int getKey() {
- return key;
- }
-
- public final float getValue() {
- return value;
- }
-
- public final float setValue(float newValue) {
- float oldValue = value;
- value = newValue;
- return oldValue;
- }
-
- public final boolean equals(Object o) {
- MapIF.Entry e = (MapIF.Entry) o;
- int k1 = getKey();
- int k2 = e.getKey();
- if (k1 == k2) {
- float v1 = getValue();
- float v2 = e.getValue();
- if (v1 == v2)
- return true;
- }
- return false;
- }
-
- public final int hashCode() {
- return (key) ^ ((int) value);
- }
-
- public final String toString() {
- return getKey() + "=" + getValue();
- }
-
- /**
- * This method is invoked whenever the value in an entry is overwritten
- * by an invocation of put(k,v) for a key k that's already in the
- * HMapIF.
- */
- void recordAccess(HMapIF m) {
- }
-
- /**
- * This method is invoked whenever the entry is removed from the table.
- */
- void recordRemoval(HMapIF m) {
- }
- }
-
- /**
- * Adds a new entry with the specified key, value and hash code to the
- * specified bucket. It is the responsibility of this method to resize the
- * table if appropriate.
- *
- * Subclass overrides this to alter the behavior of put method.
- */
- void addEntry(int hash, int key, float value, int bucketIndex) {
- Entry e = table[bucketIndex];
- table[bucketIndex] = new Entry(hash, key, value, e);
- if (size++ >= threshold)
- resize(2 * table.length);
- }
-
- /**
- * Like addEntry except that this version is used when creating entries as
- * part of Map construction or "pseudo-construction" (cloning,
- * deserialization). This version needn't worry about resizing the table.
- *
- * Subclass overrides this to alter the behavior of HMapIF(Map), clone, and
- * readObject.
- */
- void createEntry(int hash, int key, float value, int bucketIndex) {
- Entry e = table[bucketIndex];
- table[bucketIndex] = new Entry(hash, key, value, e);
- size++;
- }
-
- private abstract class HashIterator implements Iterator {
- Entry next; // next entry to return
- int expectedModCount; // For fast-fail
- int index; // current slot
- Entry current; // current entry
-
- HashIterator() {
- expectedModCount = modCount;
- if (size > 0) { // advance to first entry
- Entry[] t = table;
- while (index < t.length && (next = t[index++]) == null)
- ;
- }
- }
-
- public final boolean hasNext() {
- return next != null;
- }
-
- final Entry nextEntry() {
- if (modCount != expectedModCount)
- throw new ConcurrentModificationException();
- Entry e = next;
- if (e == null)
- throw new NoSuchElementException();
-
- if ((next = e.next) == null) {
- Entry[] t = table;
- while (index < t.length && (next = t[index++]) == null)
- ;
- }
- current = e;
- return e;
- }
-
- public void remove() {
- if (current == null)
- throw new IllegalStateException();
- if (modCount != expectedModCount)
- throw new ConcurrentModificationException();
- int k = current.key;
- current = null;
- HMapIF.this.removeEntryForKey(k);
- expectedModCount = modCount;
- }
-
- }
-
- private final class ValueIterator extends HashIterator {
- public Float next() {
- return nextEntry().value;
- }
- }
-
- private final class KeyIterator extends HashIterator {
- public Integer next() {
- return nextEntry().getKey();
- }
- }
-
- private final class EntryIterator extends HashIterator {
- public MapIF.Entry next() {
- return nextEntry();
- }
- }
-
- // Subclass overrides these to alter behavior of views' iterator() method
- Iterator newKeyIterator() {
- return new KeyIterator();
- }
-
- Iterator newValueIterator() {
- return new ValueIterator();
- }
-
- Iterator newEntryIterator() {
- return new EntryIterator();
- }
-
- // Views
-
- private transient Set entrySet = null;
-
- /**
- * Each of these fields are initialized to contain an instance of the
- * appropriate view the first time this view is requested. The views are
- * stateless, so there's no reason to create more than one of each.
- */
- transient volatile Set keySet = null;
- transient volatile Collection values = null;
-
- // doc copied from interface
- public Set keySet() {
- Set ks = keySet;
- return (ks != null ? ks : (keySet = new KeySet()));
- }
-
- private final class KeySet extends AbstractSet {
- @Override
- public Iterator iterator() {
- return newKeyIterator();
- }
-
- @Override
- public int size() {
- return size;
- }
-
- @Override
- public boolean contains(Object o) {
- return containsKey((Integer) o);
- }
- }
-
- // doc copied from interface
- public Collection values() {
- Collection vs = values;
- return (vs != null ? vs : (values = new Values()));
- }
-
- private final class Values extends AbstractCollection {
- @Override
- public Iterator iterator() {
- return newValueIterator();
- }
-
- @Override
- public int size() {
- return size;
- }
-
- @Override
- public boolean contains(Object o) {
- return containsValue((Float) o);
- }
- }
-
- // doc copied from interface
- public Set entrySet() {
- return entrySet0();
- }
-
- private Set entrySet0() {
- Set es = entrySet;
- return es != null ? es : (entrySet = new EntrySet());
- }
-
- private final class EntrySet extends AbstractSet {
- @Override
- public Iterator iterator() {
- return newEntryIterator();
- }
-
- @Override
- public int size() {
- return size;
- }
-
- @Override
- public boolean contains(Object o) {
- MapIF.Entry e = (MapIF.Entry) o;
- Entry candidate = getEntry(e.getKey());
- return candidate != null && candidate.equals(e);
- }
- }
-
- /**
- * Save the state of the HMapIF instance to a stream (i.e.,
- * serialize it).
- *
- * @serialData The capacity of the HMapIF (the length of the bucket
- * array) is emitted (int), followed by the size (an
- * int, the number of key-value mappings), followed by the key
- * (Object) and value (Object) for each key-value mapping. The
- * key-value mappings are emitted in no particular order.
- */
- private void writeObject(ObjectOutputStream s) throws IOException {
- Iterator i = (size > 0) ? entrySet0().iterator() : null;
-
- // Write out the threshold, loadfactor, and any hidden stuff
- s.defaultWriteObject();
-
- // Write out number of buckets
- s.writeInt(table.length);
-
- // Write out size (number of Mappings)
- s.writeInt(size);
-
- // Write out keys and values (alternating)
- if (i != null) {
- while (i.hasNext()) {
- MapIF.Entry e = i.next();
- s.writeInt(e.getKey());
- s.writeFloat(e.getValue());
- }
- }
- }
-
- private static final long serialVersionUID = 362498820763181265L;
-
- /**
- * Reconstitute the HMapIF instance from a stream (i.e.,
- * deserialize it).
- */
- private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException {
- // Read in the threshold, loadfactor, and any hidden stuff
- s.defaultReadObject();
-
- // Read in number of buckets and allocate the bucket array;
- int numBuckets = s.readInt();
- table = new Entry[numBuckets];
-
- init(); // Give subclass a chance to do its thing.
-
- // Read in size (number of Mappings)
- int size = s.readInt();
-
- // Read the keys and values, and put the mappings in the HMapIF
- for (int i = 0; i < size; i++) {
- int key = s.readInt();
- float value = s.readFloat();
- putForCreate(key, value);
- }
- }
-
- // These methods are used when serializing HashSets
- int capacity() {
- return table.length;
- }
-
- float loadFactor() {
- return loadFactor;
- }
-
- public String toString () {
- return toString (-1);
- }
-
- public String toString (int n) {
- Iterator i = entrySet().iterator();
- if (!i.hasNext() || n == 0)
- return "{}";
-
- StringBuilder sb = new StringBuilder();
- sb.append('{');
- for (int m = 2; ; m++) {
- MapIF.Entry e = i.next();
- int key = e.getKey();
- float value = e.getValue();
- //sb.append("(m: " + m + ", n: " + n + ")");
- sb.append(key);
- sb.append('=');
- sb.append(value);
- if (! i.hasNext() || (m > n && n > 0)) {
- if (i.hasNext()) {
- sb.append (", ...");
- }
- return sb.append('}').toString();
- }
- sb.append(", ");
- }
- }
-
- // methods not part of a standard HashMap
-
- /**
- * Adds values of keys from another map to this map.
- *
- * @param m
- * the other map
- */
- public void plus(MapIF m) {
- for (MapIF.Entry e : m.entrySet()) {
- int key = e.getKey();
-
- if (this.containsKey(key)) {
- this.put(key, this.get(key) + e.getValue());
- } else {
- this.put(key, e.getValue());
- }
- }
- }
-
- /**
- * Computes the dot product of this map with another map.
- *
- * @param m
- * the other map
- */
- public float dot(MapIF m) {
- float s = 0.0f;
-
- for (MapIF.Entry e : m.entrySet()) {
- int key = e.getKey();
-
- if (this.containsKey(key)) {
- s += this.get(key) * e.getValue();
- }
- }
-
- return s;
- }
-
- /**
- * Returns the length of the vector represented by this map.
- *
- * @return length of the vector represented by this map
- */
- public float length() {
- float s = 0.0f;
-
- for (MapIF.Entry e : this.entrySet()) {
- s += e.getValue() * e.getValue();
- }
-
- return (float) Math.sqrt(s);
- }
-
- /**
- * Normalizes values such that the vector represented by this map has unit
- * length.
- */
- public void normalize() {
- float l = this.length();
-
- for (int f : this.keySet()) {
- this.put(f, this.get(f) / l);
- }
-
- }
-
- /**
- * Returns entries sorted by descending value. Ties broken by the key.
- *
- * @return entries sorted by descending value
- */
- public MapIF.Entry[] getEntriesSortedByValue() {
- if (this.size() == 0)
- return null;
-
- // for storing the entries
- MapIF.Entry[] entries = new Entry[this.size()];
- int i = 0;
- Entry next = null;
-
- int index = 0;
- // advance to first entry
- while (index < table.length && (next = table[index++]) == null)
- ;
-
- while (next != null) {
- // current entry
- Entry e = next;
-
- // advance to next entry
- next = e.next;
- if ((next = e.next) == null) {
- while (index < table.length && (next = table[index++]) == null)
- ;
- }
-
- // add entry to array
- entries[i++] = e;
- }
-
- // sort the entries
- Arrays.sort(entries, new Comparator() {
- public int compare(MapIF.Entry e1, MapIF.Entry e2) {
- if (e1.getValue() > e2.getValue()) {
- return -1;
- } else if (e1.getValue() < e2.getValue()) {
- return 1;
- }
-
- if (e1.getKey() == e2.getKey())
- return 0;
-
- return e1.getKey() > e2.getKey() ? 1 : -1;
- }
- });
-
- return entries;
- }
-
- /**
- * Returns top n entries sorted by descending value. Ties broken by
- * the key.
- *
- * @param n
- * number of entries to return
- * @return top n entries sorted by descending value
- */
- public MapIF.Entry[] getEntriesSortedByValue(int n) {
- MapIF.Entry[] entries = getEntriesSortedByValue();
-
- if (entries == null)
- return null;
-
- if (entries.length < n)
- return entries;
-
- return Arrays.copyOfRange(entries, 0, n);
- }
-
+ // doc copied from interface
+ public float remove(int key) {
+ Entry e = removeEntryForKey(key);
+ if (e != null)
+ return e.value;
+
+ throw new NoSuchElementException();
+ }
+
+ /**
+ * Removes and returns the entry associated with the specified key in the
+ * HMapIF. Returns null if the HMapIF contains no mapping for this key.
+ */
+ final Entry removeEntryForKey(int key) {
+ int hash = hash(key);
+ int i = indexFor(hash, table.length);
+ Entry prev = table[i];
+ Entry e = prev;
+
+ while (e != null) {
+ Entry next = e.next;
+ int k;
+ if (e.hash == hash && ((k = e.key) == key || key == k)) {
+ modCount++;
+ size--;
+ if (prev == e)
+ table[i] = next;
+ else
+ prev.next = next;
+ e.recordRemoval(this);
+ return e;
+ }
+ prev = e;
+ e = next;
+ }
+
+ return e;
+ }
+
+  /**
+   * Special version of remove for EntrySet: removes and returns the entry equal to o (same key and value), or returns null.
+   */
+  final Entry removeMapping(Object o) {
+    // Entries of this map are MapIF.Entry (int key, float value); nothing else can match.
+    if (!(o instanceof MapIF.Entry))
+      return null;
+    MapIF.Entry entry = (MapIF.Entry) o;
+    int hash = hash(entry.getKey());
+    int i = indexFor(hash, table.length);
+    Entry prev = table[i];
+    Entry e = prev;
+    while (e != null) {
+      Entry next = e.next;
+      if (e.hash == hash && e.equals(entry)) {
+        modCount++;
+        size--;
+        if (prev == e)
+          table[i] = next; // e was the head of its bucket
+        else
+          prev.next = next; // unlink e from the middle or end of the chain
+        e.recordRemoval(this);
+        return e;
+      }
+      prev = e;
+      e = next;
+    }
+    return null; // no entry in the bucket matched
+  }
+
+ // doc copied from interface
+ public void clear() {
+ modCount++;
+ Entry[] tab = table;
+ for (int i = 0; i < tab.length; i++)
+ tab[i] = null;
+ size = 0;
+ }
+
+ // doc copied from interface
+ public boolean containsValue(float value) {
+ Entry[] tab = table;
+ for (int i = 0; i < tab.length; i++)
+ for (Entry e = tab[i]; e != null; e = e.next)
+ if (value == e.value)
+ return true;
+ return false;
+ }
+
+ /**
+ * Returns a shallow copy of this HMapIF instance: the keys and
+ * values themselves are not cloned.
+ *
+ * @return a shallow copy of this map
+ */
+ public Object clone() {
+ HMapIF result = null;
+ try {
+ result = (HMapIF) super.clone();
+ } catch (CloneNotSupportedException e) {
+ // assert false;
+ }
+ result.table = new Entry[table.length];
+ result.entrySet = null;
+ result.modCount = 0;
+ result.size = 0;
+ result.init();
+ result.putAllForCreate(this);
+
+ return result;
+ }
+
+  static class Entry implements MapIF.Entry {
+    final int key; // key of this mapping; never changes
+    float value; // current value; replaced by setValue
+    Entry next; // next entry in the same bucket's chain, or null
+    final int hash; // hash supplied at construction
+
+    /**
+     * Creates new entry with hash h, key k and value v, linked in front of n.
+     */
+    Entry(int h, int k, float v, Entry n) {
+      value = v;
+      next = n;
+      key = k;
+      hash = h;
+    }
+
+    public final int getKey() {
+      return key;
+    }
+
+    public final float getValue() {
+      return value;
+    }
+
+    public final float setValue(float newValue) { // returns the replaced value
+      float oldValue = value;
+      value = newValue;
+      return oldValue;
+    }
+
+    /**
+     * Returns true only if o is a MapIF.Entry with the same key and value.
+     * Null and non-entry arguments compare unequal rather than throwing.
+     */
+    public final boolean equals(Object o) {
+      if (!(o instanceof MapIF.Entry))
+        return false;
+      MapIF.Entry e = (MapIF.Entry) o;
+      if (getKey() != e.getKey())
+        return false;
+      return getValue() == e.getValue();
+    }
+
+    public final int hashCode() {
+      return (key) ^ ((int) value); // equal entries always produce equal hashes
+    }
+
+    public final String toString() {
+      return getKey() + "=" + getValue();
+    }
+
+    /**
+     * This method is invoked whenever the value in an entry is overwritten
+     * by an invocation of put(k,v) for a key k that's already in the
+     * HMapIF.
+     */
+    void recordAccess(HMapIF m) {
+    }
+
+    /**
+     * This method is invoked whenever the entry is removed from the table.
+     */
+    void recordRemoval(HMapIF m) {
+    }
+  }
+
+ /**
+ * Adds a new entry with the specified key, value and hash code to the
+ * specified bucket. It is the responsibility of this method to resize the
+ * table if appropriate.
+ *
+ * Subclass overrides this to alter the behavior of put method.
+ */
+ void addEntry(int hash, int key, float value, int bucketIndex) {
+ Entry e = table[bucketIndex];
+ table[bucketIndex] = new Entry(hash, key, value, e);
+ if (size++ >= threshold)
+ resize(2 * table.length);
+ }
+
+ /**
+ * Like addEntry except that this version is used when creating entries as
+ * part of Map construction or "pseudo-construction" (cloning,
+ * deserialization). This version needn't worry about resizing the table.
+ *
+ * Subclass overrides this to alter the behavior of HMapIF(Map), clone, and
+ * readObject.
+ */
+ void createEntry(int hash, int key, float value, int bucketIndex) {
+ Entry e = table[bucketIndex];
+ table[bucketIndex] = new Entry(hash, key, value, e);
+ size++;
+ }
+
+ private abstract class HashIterator implements Iterator {
+ Entry next; // next entry to return
+ int expectedModCount; // For fast-fail
+ int index; // current slot
+ Entry current; // current entry
+
+ HashIterator() {
+ expectedModCount = modCount;
+ if (size > 0) { // advance to first entry
+ Entry[] t = table;
+ while (index < t.length && (next = t[index++]) == null)
+ ;
+ }
+ }
+
+ public final boolean hasNext() {
+ return next != null;
+ }
+
+ final Entry nextEntry() {
+ if (modCount != expectedModCount)
+ throw new ConcurrentModificationException();
+ Entry e = next;
+ if (e == null)
+ throw new NoSuchElementException();
+
+ if ((next = e.next) == null) {
+ Entry[] t = table;
+ while (index < t.length && (next = t[index++]) == null)
+ ;
+ }
+ current = e;
+ return e;
+ }
+
+ public void remove() {
+ if (current == null)
+ throw new IllegalStateException();
+ if (modCount != expectedModCount)
+ throw new ConcurrentModificationException();
+ int k = current.key;
+ current = null;
+ HMapIF.this.removeEntryForKey(k);
+ expectedModCount = modCount;
+ }
+
+ }
+
+ private final class ValueIterator extends HashIterator {
+ public Float next() {
+ return nextEntry().value;
+ }
+ }
+
+ private final class KeyIterator extends HashIterator {
+ public Integer next() {
+ return nextEntry().getKey();
+ }
+ }
+
+ private final class EntryIterator extends HashIterator {
+ public MapIF.Entry next() {
+ return nextEntry();
+ }
+ }
+
+ // Subclass overrides these to alter behavior of views' iterator() method
+ Iterator newKeyIterator() {
+ return new KeyIterator();
+ }
+
+ Iterator newValueIterator() {
+ return new ValueIterator();
+ }
+
+ Iterator newEntryIterator() {
+ return new EntryIterator();
+ }
+
+ // Views
+
+ private transient Set entrySet = null;
+
+ /**
+ * Each of these fields are initialized to contain an instance of the
+ * appropriate view the first time this view is requested. The views are
+ * stateless, so there's no reason to create more than one of each.
+ */
+ transient volatile Set keySet = null;
+ transient volatile Collection values = null;
+
+ // doc copied from interface
+ public Set keySet() {
+ Set ks = keySet;
+ return (ks != null ? ks : (keySet = new KeySet()));
+ }
+
+ private final class KeySet extends AbstractSet {
+ @Override
+ public Iterator iterator() {
+ return newKeyIterator();
+ }
+
+ @Override
+ public int size() {
+ return size;
+ }
+
+ @Override
+ public boolean contains(Object o) {
+ return containsKey((Integer) o);
+ }
+ }
+
+ // doc copied from interface
+ public Collection values() {
+ Collection vs = values;
+ return (vs != null ? vs : (values = new Values()));
+ }
+
+ private final class Values extends AbstractCollection {
+ @Override
+ public Iterator iterator() {
+ return newValueIterator();
+ }
+
+ @Override
+ public int size() {
+ return size;
+ }
+
+ @Override
+ public boolean contains(Object o) {
+ return containsValue((Float) o);
+ }
+ }
+
+ // doc copied from interface
+ public Set entrySet() {
+ return entrySet0();
+ }
+
+ private Set entrySet0() {
+ Set es = entrySet;
+ return es != null ? es : (entrySet = new EntrySet());
+ }
+
+ private final class EntrySet extends AbstractSet {
+ @Override
+ public Iterator iterator() {
+ return newEntryIterator();
+ }
+
+ @Override
+ public int size() {
+ return size;
+ }
+
+ @Override
+ public boolean contains(Object o) {
+ MapIF.Entry e = (MapIF.Entry) o;
+ Entry candidate = getEntry(e.getKey());
+ return candidate != null && candidate.equals(e);
+ }
+ }
+
+ /**
+ * Save the state of the HMapIF instance to a stream (i.e.,
+ * serialize it).
+ *
+ * @serialData The capacity of the HMapIF (the length of the bucket
+ * array) is emitted (int), followed by the size (an
+ * int, the number of key-value mappings), followed by the key
+ * (Object) and value (Object) for each key-value mapping. The
+ * key-value mappings are emitted in no particular order.
+ */
+ private void writeObject(ObjectOutputStream s) throws IOException {
+ Iterator i = (size > 0) ? entrySet0().iterator() : null;
+
+ // Write out the threshold, loadfactor, and any hidden stuff
+ s.defaultWriteObject();
+
+ // Write out number of buckets
+ s.writeInt(table.length);
+
+ // Write out size (number of Mappings)
+ s.writeInt(size);
+
+ // Write out keys and values (alternating)
+ if (i != null) {
+ while (i.hasNext()) {
+ MapIF.Entry e = i.next();
+ s.writeInt(e.getKey());
+ s.writeFloat(e.getValue());
+ }
+ }
+ }
+
+ private static final long serialVersionUID = 362498820763181265L;
+
+ /**
+ * Reconstitute the HMapIF instance from a stream (i.e.,
+ * deserialize it).
+ */
+ private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException {
+ // Read in the threshold, loadfactor, and any hidden stuff
+ s.defaultReadObject();
+
+ // Read in number of buckets and allocate the bucket array;
+ int numBuckets = s.readInt();
+ table = new Entry[numBuckets];
+
+ init(); // Give subclass a chance to do its thing.
+
+ // Read in size (number of Mappings)
+ int size = s.readInt();
+
+ // Read the keys and values, and put the mappings in the HMapIF
+ for (int i = 0; i < size; i++) {
+ int key = s.readInt();
+ float value = s.readFloat();
+ putForCreate(key, value);
+ }
+ }
+
+ // These methods are used when serializing HashSets
+ int capacity() {
+ return table.length;
+ }
+
+ float loadFactor() {
+ return loadFactor;
+ }
+
+ public String toString () {
+ return toString (-1);
+ }
+
+ public String toString (int n) {
+ Iterator i = entrySet().iterator();
+ if (!i.hasNext() || n == 0)
+ return "{}";
+
+ StringBuilder sb = new StringBuilder();
+ sb.append('{');
+ for (int m = 2; ; m++) {
+ MapIF.Entry e = i.next();
+ int key = e.getKey();
+ float value = e.getValue();
+ //sb.append("(m: " + m + ", n: " + n + ")");
+ sb.append(key);
+ sb.append('=');
+ sb.append(value);
+ if (! i.hasNext() || (m > n && n > 0)) {
+ if (i.hasNext()) {
+ sb.append (", ...");
+ }
+ return sb.append('}').toString();
+ }
+ sb.append(", ");
+ }
+ }
+
+ // methods not part of a standard HashMap
+
+ /**
+ * Adds values of keys from another map to this map.
+ *
+ * @param m
+ * the other map
+ */
+ public void plus(MapIF m) {
+ for (MapIF.Entry e : m.entrySet()) {
+ int key = e.getKey();
+
+ if (this.containsKey(key)) {
+ this.put(key, this.get(key) + e.getValue());
+ } else {
+ this.put(key, e.getValue());
+ }
+ }
+ }
+
+ /**
+ * Computes the dot product of this map with another map.
+ *
+ * @param m
+ * the other map
+ */
+ public float dot(MapIF m) {
+ float s = 0.0f;
+
+ for (MapIF.Entry e : m.entrySet()) {
+ int key = e.getKey();
+
+ if (this.containsKey(key)) {
+ s += this.get(key) * e.getValue();
+ }
+ }
+
+ return s;
+ }
+
+ /**
+ * Returns the length of the vector represented by this map.
+ *
+ * @return length of the vector represented by this map
+ */
+ public float length() {
+ float s = 0.0f;
+
+ for (MapIF.Entry e : this.entrySet()) {
+ s += e.getValue() * e.getValue();
+ }
+
+ return (float) Math.sqrt(s);
+ }
+
+  /**
+   * Normalizes values such that the vector represented by this map has unit
+   * length. A map whose length is zero (empty or all-zero) is left unchanged.
+   */
+  public void normalize() {
+    float l = this.length();
+    if (l == 0.0f)
+      return; // dividing by zero would turn every value into NaN
+    for (int f : this.keySet()) {
+      this.put(f, this.get(f) / l);
+    }
+  }
+
+ /**
+ * Returns entries sorted by descending value. Ties broken by the key.
+ *
+ * @return entries sorted by descending value
+ */
+ public MapIF.Entry[] getEntriesSortedByValue() {
+ if (this.size() == 0)
+ return null;
+
+ // for storing the entries
+ MapIF.Entry[] entries = new Entry[this.size()];
+ int i = 0;
+ Entry next = null;
+
+ int index = 0;
+ // advance to first entry
+ while (index < table.length && (next = table[index++]) == null)
+ ;
+
+ while (next != null) {
+ // current entry
+ Entry e = next;
+
+ // advance to next entry
+ next = e.next;
+ if ((next = e.next) == null) {
+ while (index < table.length && (next = table[index++]) == null)
+ ;
+ }
+
+ // add entry to array
+ entries[i++] = e;
+ }
+
+ // sort the entries
+ Arrays.sort(entries, new Comparator() {
+ public int compare(MapIF.Entry e1, MapIF.Entry e2) {
+ if (e1.getValue() > e2.getValue()) {
+ return -1;
+ } else if (e1.getValue() < e2.getValue()) {
+ return 1;
+ }
+
+ if (e1.getKey() == e2.getKey())
+ return 0;
+
+ return e1.getKey() > e2.getKey() ? 1 : -1;
+ }
+ });
+
+ return entries;
+ }
+
+ /**
+ * Returns top n entries sorted by descending value. Ties broken by
+ * the key.
+ *
+ * @param n
+ * number of entries to return
+ * @return top n entries sorted by descending value
+ */
+ public MapIF.Entry[] getEntriesSortedByValue(int n) {
+ MapIF.Entry[] entries = getEntriesSortedByValue();
+
+ if (entries == null)
+ return null;
+
+ if (entries.length < n)
+ return entries;
+
+ return Arrays.copyOfRange(entries, 0, n);
+ }
+
+
+ /**
+ * Returns entries sorted by ascending value. Ties broken by the key.
+ *
+ * @return entries sorted by ascending value
+ */
+ public MapIF.Entry[] getEntriesSortedByAscendingValue() {
+ if (this.size() == 0)
+ return null;
+
+ // for storing the entries
+ MapIF.Entry[] entries = new Entry[this.size()];
+ int i = 0;
+ Entry next = null;
+
+ int index = 0;
+ // advance to first entry
+ while (index < table.length && (next = table[index++]) == null)
+ ;
+
+ while (next != null) {
+ // current entry
+ Entry e = next;
+
+ // advance to next entry
+ next = e.next;
+ if ((next = e.next) == null) {
+ while (index < table.length && (next = table[index++]) == null)
+ ;
+ }
+
+ // add entry to array
+ entries[i++] = e;
+ }
+
+ // sort the entries
+ Arrays.sort(entries, new Comparator() {
+ public int compare(MapIF.Entry e1, MapIF.Entry e2) {
+ if (e1.getValue() > e2.getValue()) {
+ return 1;
+ } else if (e1.getValue() < e2.getValue()) {
+ return -1;
+ }
+
+ if (e1.getKey() == e2.getKey())
+ return 0;
+
+ return e1.getKey() > e2.getKey() ? -1 : 1;
+ }
+ });
+
+ return entries;
+ }
+
+ /**
+ * Returns the first n entries sorted by ascending value. Ties broken by
+ * the key.
+ *
+ * @param n
+ * number of entries to return
+ * @return first n entries sorted by ascending value
+ */
+ public MapIF.Entry[] getEntriesSortedByAscendingValue(int n) {
+ MapIF.Entry[] entries = getEntriesSortedByAscendingValue();
+
+ if (entries == null)
+ return null;
+
+ if (entries.length < n)
+ return entries;
+
+ return Arrays.copyOfRange(entries, 0, n);
+ }
}
From 07b6a049e71ec705f09af28de15b1e1513bbfabd Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Mon, 5 Mar 2012 14:06:53 -0500
Subject: [PATCH 17/18] refactored some variable names
---
.../aquaint2/Aquaint2DocnoMapping.java | 45 ++++++++++---------
1 file changed, 23 insertions(+), 22 deletions(-)
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
index f146b5210..80bbdd36b 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2DocnoMapping.java
@@ -45,7 +45,7 @@ public class Aquaint2DocnoMapping implements DocnoMapping {
private String[] docidEntries;
@Override
- public int getDocno(String docid) {
+ public int getDocno(String docid) {
LOG.trace("getDocno(docid: " + docid + ")");
Preconditions.checkNotNull(docid);
int sourceLength = docid.length() - 13;
@@ -61,22 +61,22 @@ public int getDocno(String docid) {
int entryId = findEntryId(source, year, month);
LOG.debug("entryId: " + entryId);
- String entryElt = docidEntries[entryId].split("\t")[day];
- LOG.debug("entryElt: " + entryElt);
-
- // then traverse the days to find the day and skip over missing articles to get the article number
- String[] entryEltParts = entryElt.split(" ");
- int result = articleNo + Integer.parseInt(entryEltParts[0]);
- String[] entryDayParts = entryEltParts[1].split(",");
- for (int i = 1; i < entryDayParts.length; i++) {
- int missingNo = Integer.parseInt(entryDayParts[i]);
- if (articleNo < missingNo) break;
- LOG.debug("skipping missingNo: " + missingNo);
- result--;
- }
+ String entryElt = docidEntries[entryId].split("\t")[day];
+ LOG.debug("entryElt: " + entryElt);
+
+ // then traverse the days to find the day and skip over missing articles to get the article number
+ String[] entryEltParts = entryElt.split(" ");
+ int result = articleNo + Integer.parseInt(entryEltParts[0]);
+ String[] entryDayParts = entryEltParts[1].split(",");
+ for (int i = 1; i < entryDayParts.length; i++) {
+ int missingNo = Integer.parseInt(entryDayParts[i]);
+ if (articleNo < missingNo) break;
+ LOG.debug("skipping missingNo: " + missingNo);
+ result--;
+ }
- LOG.trace("getDocno returning: " + result);
- return result;
+ LOG.trace("getDocno returning: " + result);
+ return result;
}
private int findEntryId(String source, int year, int month) {
@@ -100,7 +100,7 @@ private int findEntryId(String source, int year, int month) {
@Override
- public String getDocid(int docno) {
+ public String getDocid(int docno) {
Preconditions.checkArgument(docno > 0);
LOG.trace("getDocid(docno: " + docno + ")");
@@ -165,7 +165,7 @@ private String[] findEntryEltParts(int docno, String[] entryElts) {
@Override
- public void loadMapping(Path p, FileSystem fs) throws IOException {
+ public void loadMapping(Path p, FileSystem fs) throws IOException {
docidEntries = Aquaint2DocnoMapping.readDocnoData(p, fs);
}
@@ -202,8 +202,8 @@ static public void writeDocnoData(Path input, Path output, FileSystem fs) throws
LOG.debug("source: " + source + ", year: " + year + ", month: " + month + ", day: " + day + ", articleNo: " + articleNo);
if (! source.equals(prevSource) ||
- year != prevYear ||
- month != prevMonth) {
+ year != prevYear ||
+ month != prevMonth) {
LOG.debug("diff source, year or month, currentEntry: " + currentEntry);
if (currentEntry != null) {
list.add(currentEntry.toString());
@@ -322,14 +322,15 @@ public static void main(String[] args) throws IOException {
System.out.println("looking up docno for \"" + args[2] + "\"");
int idx = mapping.getDocno(args[2]);
if (idx > 0) {
- System.out.println(mapping.getDocno(args[2]));
+ System.out.println(idx);
} else {
System.err.print("Invalid docid!");
}
} else if (args[0].equals("getDocid")) {
+ int docno = Integer.parseInt(args[2]);
try {
System.out.println("looking up docid for " + args[2]);
- System.out.println(mapping.getDocid(Integer.parseInt(args[2])));
+ System.out.println(mapping.getDocid(docno));
} catch (Exception e) {
System.err.print("Invalid docno!");
}
From be21d865b4d7671b8722498df946a3dc8d298de8 Mon Sep 17 00:00:00 2001
From: "Earl J. Wagner"
Date: Mon, 5 Mar 2012 14:07:16 -0500
Subject: [PATCH 18/18] catch parse error
---
.../collection/aquaint2/Aquaint2Document.java | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
index 567d45d2d..5e788cfc3 100644
--- a/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
+++ b/src/dist/edu/umd/cloud9/collection/aquaint2/Aquaint2Document.java
@@ -116,8 +116,9 @@ public String getElementText(String elementTagName) {
result = raw.substring(start).trim();
LOG.error("found element text: " + result);
}
- result = TAGS_PATTERN.matcher(result).replaceAll("");
+ result = TAGS_PATTERN.matcher(result).replaceAll("\n");
result = WHITESPACE_PATTERN.matcher(result).replaceAll(" ");
+ //System.out.println(result);
}
return result;
}
@@ -147,7 +148,15 @@ private void setAquaint2Docid() {
LOG.trace("setAquaint2Docid()");
int start = 9;
int end = raw.indexOf("\"", start);
- docid = raw.substring(start, end).trim();
+ try {
+ docid = raw.substring(start, end).trim();
+ } catch (Exception e) {
+ LOG.error("exception: " + e);
+ LOG.error("start: " + start + ", end: " + end);
+ LOG.error("raw:\n" + raw);
+ String result = raw.substring(start).trim();
+ LOG.error("found element text: " + result);
+ }
LOG.trace("in setAquaint2Docid, docid: " + docid);
}