From 5225cc6eeb79340420fc1a8311d187669c9a1d7b Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Fri, 9 May 2025 23:26:31 -0700 Subject: [PATCH 01/42] Update scala and sbt --- build.sbt | 27 ++++++++++++++------------- project/build.properties | 5 ++++- project/plugins.sbt | 2 ++ 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/build.sbt b/build.sbt index 354fc7974..0bc3a2e3c 100644 --- a/build.sbt +++ b/build.sbt @@ -1,23 +1,24 @@ -// These were last checked on 2025-02-19. +// These were last checked on 2025-05-09. val scala211 = "2.11.12" // up to 2.11.12 -val scala212 = "2.12.19" // up to 2.12.20 -val scala213 = "2.13.14" // up to 2.13.16 +val scala212 = "2.12.20" // up to 2.12.20 +val scala213 = "2.13.16" // up to 2.13.16 val scala30 = "3.0.2" // up to 3.0.2 val scala31 = "3.1.3" // up to 3.1.3 val scala32 = "3.2.2" // up to 3.2.2 -val scala33 = "3.3.5" // up to 3.3.5 (LTS) +val scala33 = "3.3.6" // up to 3.3.6 (LTS) val scala34 = "3.4.3" // up to 3.4.3 val scala35 = "3.5.2" // up to 3.5.2 -val scala36 = "3.6.3" // up to 3.6.3 +val scala36 = "3.6.4" // up to 3.6.4 +val scala37 = "3.7.0" // up to 3.7.0 // See https://www.scala-lang.org/blog/2022/08/17/long-term-compatibility-plans.html. // Scala30: "If you are maintaining a library, you should drop Scala 3.0." Dropped. // Scala31: This is a LTS (long term support) version before it was called that. // Scala32: This is for experimentation, as in Scala Next, and not for release. // Scala33: This is the first official LTS, but hold off until necessary. -val scala3 = scala31 +val scala3 = scala33 -ThisBuild / crossScalaVersions := Seq(scala212, scala211, scala213, scala3) +ThisBuild / crossScalaVersions := Seq(scala213, scala3) ThisBuild / scalaVersion := crossScalaVersions.value.head lazy val root = (project in file(".")) @@ -33,17 +34,17 @@ lazy val library = project lazy val apps = project .dependsOn(library % "compile -> compile; test -> test") -lazy val webapp = project - .enablePlugins(PlayScala) - .dependsOn(library % "compile -> compile; test -> test") - .settings( +// lazy val webapp = project + // .enablePlugins(PlayScala) + // .dependsOn(library % "compile -> compile; test -> test") + // .settings( // scala3 doesn't have play (for 2.8.19 as specified by the project) and is ruled out completely. // scala213 has version problems for com.fasterxml.jackson.databind.JsonMappingException. // scala212 works! // scala211 isn't compiling and complains on twirlCompileTemplates. // This isn't a library. Only one version needs to work. We shouldn't use play for this anyway. - crossScalaVersions := Seq(scala212) - ) + // crossScalaVersions := Seq(scala212) + // ) lazy val debugger = project .dependsOn(library % "compile -> compile; test -> test") diff --git a/project/build.properties b/project/build.properties index 11956d958..4c19fc197 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1,6 +1,9 @@ +# This was last checked on 2025-05-09. # Version 1.7.2+ will cause problems when combined with the play plug-in used for the webapp! # [error] * org.scala-lang.modules:scala-xml_2.12:2.1.0 (early-semver) is selected over {1.2.0, 1.1.1} # [error] +- org.scala-lang:scala-compiler:2.12.17 (depends on 2.1.0) # [error] +- com.typesafe.sbt:sbt-native-packager:1.5.2 (scalaVersion=2.12, sbtVersion=1.0) (depends on 1.1.1) # [error] +- com.typesafe.play:twirl-api_2.12:1.5.1 (depends on 1.2.0) -sbt.version = 1.7.2 +# This error is solved by adding a VersionScheme.Always to plugins.sbt. 
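+# For reference, the override mentioned above, added to project/plugins.sbt in this same patch, is:
+#   ThisBuild / libraryDependencySchemes += "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always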
+# up to 1.10.11
+sbt.version = 1.10.11
diff --git a/project/plugins.sbt b/project/plugins.sbt
index 273ee7ce6..417c04d23 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -1,3 +1,5 @@
+ThisBuild / libraryDependencySchemes += "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always
+
 // Latest version numbers were updated on 2024 July 11.
 addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.2-1") // up to 2.2.1 *
 addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3") // up to 3.9.21 *

From 1d18c8a198510f27f271ca23624e30a129a963f4 Mon Sep 17 00:00:00 2001
From: Keith Alcock
Date: Tue, 13 May 2025 16:06:16 -0700
Subject: [PATCH 02/42] Clean up BalaurProcessor

---
 .../processors/clu/BalaurProcessor.scala      | 194 ++++++++++--------
 1 file changed, 112 insertions(+), 82 deletions(-)

diff --git a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala
index b791d181e..dc69b9363 100644
--- a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala
+++ b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala
@@ -2,7 +2,7 @@ package org.clulab.processors.clu
 
 import com.typesafe.config.Config
 import com.typesafe.config.ConfigFactory
-import org.clulab.numeric.{NumericEntityRecognizer, setLabelsAndNorms}
+import org.clulab.numeric.{NumericEntityRecognizer, mkLabelsAndNorms}
 import org.clulab.processors.{Document, Processor, Sentence}
 import org.clulab.processors.clu.tokenizer._
 import org.clulab.scala.WrappedArray._
@@ -13,12 +13,11 @@
 import org.clulab.struct.DirectedGraph
 import org.clulab.struct.GraphMap
 import org.clulab.utils.{Configured, MathUtils, ToEnhancedDependencies}
 import org.slf4j.{Logger, LoggerFactory}
-
 import org.clulab.odin.Mention
-
 import BalaurProcessor._
 import PostProcessor._
 import org.clulab.processors.hexatagging.HexaDecoder
+import org.clulab.struct.GraphMap.GraphMap
 
 class BalaurProcessor protected (
   val config: Config,
@@ -91,29 +90,27 @@ class BalaurProcessor protected (
     throw new RuntimeException("ERROR: cannot call this method on its own in this processor!")
   }
 
-  /** Lematization; modifies the document in place */
-  override def lemmatize(doc: Document): Unit = {
-    for(sent <- doc.sentences) {
-      val lemmas = new Array[String](sent.size)
-      for(i <- sent.words.indices) {
-        lemmas(i) = wordLemmatizer.lemmatizeWord(sent.words(i))
-
-        // a lemma may be empty in some weird Unicode situations
-        if(lemmas(i).isEmpty) {
-          logger.debug(s"""WARNING: Found empty lemma for word #$i "${sent.words(i)}" in sentence: ${sent.words.mkString(" ")}""")
-          lemmas(i) = sent.words(i).toLowerCase()
-        }
-      }
-      sent.lemmas = Some(lemmas)
+  /** Lemmatization; returns the lemmas for the given words */
+  override def lemmatize(words: Array[String]): Array[String] = {
+    val lemmas = words.zipWithIndex.map { case (word, index) =>
+      val lemma = wordLemmatizer.lemmatizeWord(word)
+      // a lemma may be empty in some weird Unicode situations
+      val nonEmptyLemma =
+        if (lemma.isEmpty) {
+          logger.debug(s"""WARNING: Found empty lemma for word #$index "$word" in sentence: ${words.mkString(" ")}""")
+          word.toLowerCase()
+        }
+        else lemma
+
+      nonEmptyLemma
     }
+
+    lemmas
   }
 
   /** Generates cheap lemmas with the word in lower case, for languages where a lemmatizer is not available */
-  def cheapLemmatize(doc:Document): Unit = {
-    for(sent <- doc.sentences) {
-      val lemmas = sent.words.map(_.toLowerCase()).toArray
-      sent.lemmas = Some(lemmas)
-    }
+  def cheapLemmatize(sentence: Sentence): Array[String] = {
+    sentence.words.map(_.toLowerCase())
   }
 
   override def recognizeNamedEntities(doc: Document): Unit = {
@@ -144,64 +141,86 @@ class BalaurProcessor protected (
     throw new RuntimeException("ERROR: functionality not supported in this processor!")
   }
 
-  override def annotate(doc: Document): Document = {
-    val verbose = false
-
-    // lemmas are created deterministically, not through the MTL framework
-    lemmatize(doc)
+  override def annotate(document: Document): Document = {
+    // Process one sentence at a time through the MTL framework.
+    val partlyAnnotatedSentences = document.sentences.map { sentence =>
+      val words = sentence.words
+      // Lemmas are created deterministically, not through the MTL framework.
+      val lemmas = lemmatize(words)
 
-    // process one sentence at a time through the MTL framework
-    for (sent <- doc.sentences) {
       try {
-        val allLabelsAndScores = tokenClassifier.predictWithScores(sent.words)
-        assignPosTags(allLabelsAndScores(TASK_TO_INDEX(POS_TASK)), sent)
-        assignNamedEntityLabels(allLabelsAndScores(TASK_TO_INDEX(NER_TASK)), sent)
-        assignChunkLabels(allLabelsAndScores(TASK_TO_INDEX(CHUNKING_TASK)), sent)
-        assignDependencyLabelsUsingHexaTags(
+        val allLabelsAndScores = tokenClassifier.predictWithScores(words)
+        val tags = mkPosTags(words, allLabelsAndScores(TASK_TO_INDEX(POS_TASK)))
+        val entities = {
+          val optionalEntities = mkOptionalNerLabels(words, sentence.startOffsets, sentence.endOffsets, tags, lemmas)
+
+          mkNamedEntityLabels(words, allLabelsAndScores(TASK_TO_INDEX(NER_TASK)), optionalEntities)
+        }
+        val chunks = mkChunkLabels(words, allLabelsAndScores(TASK_TO_INDEX(CHUNKING_TASK)))
+        val graphs = mkDependencyLabelsUsingHexaTags(
+          words, lemmas, tags,
          allLabelsAndScores(TASK_TO_INDEX(HEXA_TERM_TASK)),
-          allLabelsAndScores(TASK_TO_INDEX(HEXA_NONTERM_TASK)),
-          sent
+          allLabelsAndScores(TASK_TO_INDEX(HEXA_NONTERM_TASK))
+        )
+        // Entities and norms still need to be patched and filled in, so this is only a partly annotated sentence.
+        val partlyAnnotatedSentence = sentence.copy(
+          tags = Some(tags), lemmas = Some(lemmas), entities = Some(entities), chunks = Some(chunks), graphs = graphs
         )
-      } catch {
-        case e: EncoderMaxTokensRuntimeException =>
-          // this sentence exceeds the maximum number of tokens for the encoder
-          // TODO: at some point do something smart here
-          println(s"ERROR: this sentence exceeds the maximum number of tokens for the encoder and will not be annotated: ${sent.words.mkString(" ")}")
+        partlyAnnotatedSentence
+      }
+      catch {
+        // No values, not even lemmas, will be included in the annotation if there was an exception.
+        case e: EncoderMaxTokensRuntimeException =>
+          // TODO: at some point do something smart here
+          println(s"ERROR: This sentence exceeds the maximum number of tokens for the encoder and will not be annotated: ${sentence.words.mkString(" ")}")
+          sentence
+        case e: AssertionError =>
+          println(s"ERROR: The output of predictWithScores does not satisfy assertions.
The sentence will not be annotated: ${sentence.words.mkString(" ")}") + sentence } } + val partlyAnnotatedDocument = document.copy(sentences = partlyAnnotatedSentences) + val fullyAnnotatedDocument = + if (numericEntityRecognizerOpt.nonEmpty) { + val numericMentions = numericEntityRecognizerOpt.get.extractFrom(partlyAnnotatedDocument) + val (newLabels, newNorms) = mkLabelsAndNorms(partlyAnnotatedDocument, numericMentions) + val fullyAnnotatedSentences = partlyAnnotatedDocument.sentences.indices.map { index => + partlyAnnotatedDocument.sentences(index).copy( + entities = Some(newLabels(index)), + norms = Some(newNorms(index)) + ) + }.toArray + + partlyAnnotatedDocument.copy(sentences = fullyAnnotatedSentences) + } + else partlyAnnotatedDocument - // numeric entities using our numeric entity recognizer based on Odin rules - if(numericEntityRecognizerOpt.nonEmpty) { - val numericMentions = extractNumericEntityMentions(doc) - setLabelsAndNorms(doc, numericMentions) - } - - doc + fullyAnnotatedDocument } - def extractNumericEntityMentions(doc:Document): Seq[Mention] = { - numericEntityRecognizerOpt.get.extractFrom(doc) - } + private def mkPosTags(words: Array[String], labels: Array[Array[(String, Float)]]): Array[String] = { + assert(labels.length == words.length) - private def assignPosTags(labels: Array[Array[(String, Float)]], sent: Sentence): Unit = { - assert(labels.length == sent.words.length) - sent.tags = Some(postprocessPartOfSpeechTags(sent.words, labels.map(_.head._1).toArray)) - } + val tags = labels.map(_.head._1).toArray - /** Must be called after assignPosTags and lemmatize because it requires Sentence.tags and Sentence.lemmas */ - private def assignNamedEntityLabels(labels: Array[Array[(String, Float)]], sent: Sentence): Unit = { - assert(labels.length == sent.words.length) + postprocessPartOfSpeechTags(words, tags) + tags + } + private def mkOptionalNerLabels( + words: Array[String], startOffsets: Array[Int], endOffsets: Array[Int], + tags: Array[String], lemmas: Array[String] + ): Option[Array[String]] = { // NER labels from the custom NER - val optionalNERLabels: Option[Array[String]] = optionalNER.map { ner => + optionalNER.map { ner => val sentence = Sentence( - sent.words, - sent.startOffsets, - sent.endOffsets, - sent.words, - sent.tags, - sent.lemmas, + words, // Why isn't this raw? 
+ startOffsets, + endOffsets, + words, + Some(tags), + Some(lemmas), entities = None, norms = None, chunks = None, @@ -212,18 +231,24 @@ class BalaurProcessor protected ( ner.find(sentence) } + } + + /** Must be called after assignPosTags and lemmatize because it requires Sentence.tags and Sentence.lemmas */ + private def mkNamedEntityLabels(words: Array[String], labels: Array[Array[(String, Float)]], optionalNERLabels: Option[Array[String]]): Array[String] = { + assert(labels.length == words.length) val genericLabels = NamedEntity.patch(labels.map(_.head._1).toArray) - if(optionalNERLabels.isEmpty) { - sent.entities = Some(genericLabels) - } else { + if (optionalNERLabels.isEmpty) { + genericLabels + } + else { //println(s"MERGING NE labels for sentence: ${sent.words.mkString(" ")}") //println(s"Generic labels: ${NamedEntity.patch(labels).mkString(", ")}") //println(s"Optional labels: ${optionalNERLabels.get.mkString(", ")}") val mergedLabels = NamedEntity.patch(mergeNerLabels(genericLabels, optionalNERLabels.get)) //println(s"Merged labels: ${mergedLabels.mkString(", ")}") - sent.entities = Some(mergedLabels) + mergedLabels } } @@ -246,9 +271,10 @@ class BalaurProcessor protected ( } } - private def assignChunkLabels(labels: Array[Array[(String, Float)]], sent: Sentence): Unit = { - assert(labels.length == sent.words.length) - sent.chunks = Some(labels.map(_.head._1).toArray) + private def mkChunkLabels(words: Array[String], labels: Array[Array[(String, Float)]]): Array[String] = { + assert(labels.length == words.length) + + labels.map(_.head._1).toArray } // The head has one score, the label has another. Here the two scores are interpolated @@ -286,11 +312,14 @@ class BalaurProcessor protected ( sentDependencies.toArray } - private def assignDependencyLabelsUsingHexaTags( + private def mkDependencyLabelsUsingHexaTags( + words: Array[String], lemmas: Array[String], tags: Array[String], termTags: Array[Array[PredictionScore]], - nonTermTags: Array[Array[PredictionScore]], - sent: Sentence): Unit = { + nonTermTags: Array[Array[PredictionScore]] + ): GraphMap = { val verbose = false + val graphs = GraphMap() + val size = words.length // bht is used just for debugging purposes here val (bht, deps, roots) = hexaDecoder.decode(termTags, nonTermTags, topK = 25, verbose) @@ -301,20 +330,21 @@ class BalaurProcessor protected ( println("Roots: " + roots.get.mkString(", ")) } - if(deps.nonEmpty && roots.nonEmpty) { + if (deps.nonEmpty && roots.nonEmpty) { // basic dependencies that replicate treebank annotations - val depGraph = new DirectedGraph[String](deps.get, Some(sent.size), roots) - sent.graphs += GraphMap.UNIVERSAL_BASIC -> depGraph + val depGraph = new DirectedGraph[String](deps.get, Some(size), roots) + graphs += GraphMap.UNIVERSAL_BASIC -> depGraph // enhanced dependencies as defined by Manning - val enhancedDepGraph = ToEnhancedDependencies.generateUniversalEnhancedDependencies(sent, depGraph) - sent.graphs += GraphMap.UNIVERSAL_ENHANCED -> enhancedDepGraph + val enhancedDepGraph = ToEnhancedDependencies.generateUniversalEnhancedDependencies(words, lemmas, tags, depGraph) + graphs += GraphMap.UNIVERSAL_ENHANCED -> enhancedDepGraph // ideally, hybrid dependencies should contain both syntactic dependencies and semantic roles // however, this processor produces only syntactic dependencies - sent.graphs += GraphMap.HYBRID_DEPENDENCIES -> enhancedDepGraph + graphs += GraphMap.HYBRID_DEPENDENCIES -> enhancedDepGraph } - } + graphs + } } object BalaurProcessor { From 
b540f25bd3bc517e423ed20f2b10aa531a38f246 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Tue, 13 May 2025 17:48:50 -0700 Subject: [PATCH 03/42] Stop assigning to a val in Document --- .../org/clulab/processors/Document.scala | 88 +++++++++++-------- 1 file changed, 50 insertions(+), 38 deletions(-) diff --git a/library/src/main/scala/org/clulab/processors/Document.scala b/library/src/main/scala/org/clulab/processors/Document.scala index 6435ab94c..ab5dbbba6 100644 --- a/library/src/main/scala/org/clulab/processors/Document.scala +++ b/library/src/main/scala/org/clulab/processors/Document.scala @@ -16,21 +16,20 @@ import scala.collection.mutable * Written by: Mihai Surdeanu and Gus Hahn-Powell. * Last Modified: Add apply method to copy Document. */ -class Document(val sentences: Array[Sentence]) extends Serializable { - +class Document( + val sentences: Array[Sentence], /** Unique id for this document, if any */ - var id: Option[String] = None - + val id: Option[String] = None, /** Clusters of coreferent mentions */ - var coreferenceChains: Option[CorefChains] = None - + val coreferenceChains: Option[CorefChains] = None, /** The original text corresponding to this document, if it was preserved by the corresponding processor */ - var text: Option[String] = None - + val text: Option[String] = None, /** Map of any arbitrary document attachments such as document creation time */ - protected var attachments: Option[mutable.HashMap[String, DocumentAttachment]] = None + protected val attachments: Option[mutable.HashMap[String, DocumentAttachment]] = None, + protected val documentCreationTime:Option[String] = None +) extends Serializable { - protected var documentCreationTime:Option[String] = None + def copy(sentences: Array[Sentence]): Document = ??? /** Clears any internal state potentially constructed by the annotators */ def clear(): Unit = { } @@ -67,11 +66,11 @@ class Document(val sentences: Array[Sentence]) extends Serializable { ) /** Adds an attachment to the document's attachment map */ - def addAttachment(name: String, attachment: DocumentAttachment): Unit = { - if (attachments.isEmpty) - attachments = Some(new mutable.HashMap[String, DocumentAttachment]()) - attachments.get += name -> attachment - } +// def addAttachment(name: String, attachment: DocumentAttachment): Unit = { +// if (attachments.isEmpty) +// attachments = Some(new mutable.HashMap[String, DocumentAttachment]()) +// attachments.get += name -> attachment +// } /** Retrieves the attachment with the given name */ def getAttachment(name: String): Option[DocumentAttachment] = attachments.flatMap(_.get(name)) @@ -96,14 +95,13 @@ class Document(val sentences: Array[Sentence]) extends Serializable { * The DCT will impacts how Sentence.norms are generated for DATE expressions * @param dct Document creation time */ - def setDCT(dct:String): Unit = documentCreationTime = Some(dct) +// def setDCT(dct:String): Unit = documentCreationTime = Some(dct) def getDCT: Option[String] = documentCreationTime def prettyPrint(pw: PrintWriter): Unit = { // let's print the sentence-level annotations - var sentenceCount = 0 - for (sentence <- sentences) { + sentences.zipWithIndex.foreach { case (sentence, sentenceCount) => pw.println("Sentence #" + sentenceCount + ":") pw.println("Tokens: " + sentence.words.zipWithIndex.mkString(" ")) pw.println("Start character offsets: " + sentence.startOffsets.mkString(" ")) @@ -157,7 +155,6 @@ class Document(val sentences: Array[Sentence]) extends Serializable { // on syntactic trees, including access to head 
phrases/words }) - sentenceCount += 1 pw.println("\n") } @@ -177,20 +174,18 @@ class Document(val sentences: Array[Sentence]) extends Serializable { }) } - def assimilate(document: Document, textOpt: Option[String]): Document = { - id = document.id - coreferenceChains = document.coreferenceChains - text = textOpt - attachments = document.attachments - documentCreationTime = document.documentCreationTime - this - } - // sentences are a val, so they must be initialized through the construction of a new Document. // Thereafter, the remaining values can be assimilated from the old document. The shortcut // is used so that subclasses don't have to duplicate almost everything in their copy. def copy(sentences: Array[Sentence] = sentences, textOpt: Option[String] = text): Document = { - new Document(sentences).assimilate(this, textOpt) + new Document( + sentences = sentences, // not this + id = this.id, + coreferenceChains = this.coreferenceChains, + text = textOpt, // not this + attachments = this.attachments, + documentCreationTime = this.documentCreationTime + ) } def offset(offset: Int): Document = @@ -202,20 +197,37 @@ class Document(val sentences: Array[Sentence]) extends Serializable { object Document { - def apply(sentences: Array[Sentence]): Document = new Document(sentences) + def apply(sentences: Array[Sentence]): Document = apply(sentences, text = None) + + def apply(sentences: Array[Sentence], text: Option[String]): Document = apply(id = None, sentences, coref = None, text) def apply(id: Option[String], sentences: Array[Sentence], coref: Option[CorefChains], text: Option[String]): Document = { - val d = Document(sentences) - d.id = id - d.coreferenceChains = coref - d.text = text - d + val document = new Document( + sentences, + id = id, + coreferenceChains = coref, + text = text + ) + + document } - /** Return a new Document with relevant fields copied from the given Document. */ - def apply (doc: Document): Document = - Document(doc.id, doc.sentences, doc.coreferenceChains, doc.text) + /** Return a new Document with some relevant fields copied from the given Document. 
*/ + def apply(doc: Document): Document = + apply(doc.id, doc.sentences, doc.coreferenceChains, doc.text) + + def apply(doc: Document, sentences: Array[Sentence]): Document = { + val newDocument = new Document( + sentences, + id = doc.id, + coreferenceChains = doc.coreferenceChains, + text = doc.text, + attachments = doc.attachments, + documentCreationTime = doc.documentCreationTime + ) + newDocument + } } /** From 7543b9c2f4d84e2e5a41c24cc4423a6876d16b6d Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Wed, 14 May 2025 11:02:19 -0700 Subject: [PATCH 04/42] Pass the tests --- .../processors/apps/ColumnsToDocument.scala | 27 +-- .../apps/CommandLineInterface.scala | 2 +- .../apps/NumericEntityRecognizerShell.scala | 4 +- build.sbt | 2 +- .../org/clulab/numeric/EvalTimeNorm.scala | 2 +- .../numeric/NumericEntityRecognizer.scala | 42 ++--- .../scala/org/clulab/numeric/package.scala | 74 ++++---- .../org/clulab/processors/Document.scala | 25 +-- .../org/clulab/processors/Processor.scala | 40 +++-- .../org/clulab/processors/Sentence.scala | 169 ++++++++++++------ .../clulab/processors/clu/DocumentMaker.scala | 15 +- .../org/clulab/processors/clu/Veil.scala | 28 +-- .../serialization/DocumentSerializer.scala | 24 +-- .../serialization/json/JSONSerializer.scala | 66 ++++--- .../clulab/utils/ToEnhancedDependencies.scala | 78 ++++---- .../org/clulab/utils/TestHash.scala | 6 +- .../TestNumericEntityRecognition.scala | 2 +- .../clulab/numeric/TestSeasonNormalizer.scala | 4 +- .../clulab/processors/TestLexiconNER.scala | 4 +- .../org/clulab/processors/TestProcessor.scala | 91 +++++----- .../json/TestJSONSerializer.scala | 15 +- .../struct/TestDocumentAttachment.scala | 56 +++--- .../org/clulab/utils/TestFindHeads.scala | 6 +- 23 files changed, 428 insertions(+), 354 deletions(-) diff --git a/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala b/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala index 2789eb0d1..8822ba993 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala @@ -138,8 +138,10 @@ object ColumnsToDocument { } } if(words.nonEmpty) { - val s = new Sentence(words.toArray, startOffsets.toArray, endOffsets.toArray, words.toArray) - s.tags = Some(labels.toArray) + val s = new Sentence( + words.toArray, startOffsets.toArray, endOffsets.toArray, words.toArray, + tags = Some(labels.toArray) + ) sentences += s } logger.debug(s"Loaded ${sentences.size} sentences.") @@ -151,26 +153,5 @@ object ColumnsToDocument { } - def setTags(s:Sentence, tags:Array[String]): Unit = { - s.tags = Some(tags) - } - - def setChunks(s:Sentence, chunks:Array[String]): Unit = { - s.chunks = Some(chunks) - } - - def setEntities(s:Sentence, entities:Array[String]): Unit = { - s.entities = Some(entities) - } - - def annotateLemmas(doc:Document): Unit = { - proc.lemmatize(doc) // some features use lemmas, which are not available in the CoNLL data - } - - def annotateLemmmaTags(doc:Document): Unit = { - proc.lemmatize(doc) - proc.tagPartsOfSpeech(doc) - } - def annotateNil(doc:Document): Unit = {} } diff --git a/apps/src/main/scala/org/clulab/processors/apps/CommandLineInterface.scala b/apps/src/main/scala/org/clulab/processors/apps/CommandLineInterface.scala index 022a59cc0..0e84c662d 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/CommandLineInterface.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/CommandLineInterface.scala @@ -36,7 +36,7 @@ object 
CommandLineInterface extends App { } else if(props.containsKey(TOKENS)) { // one sentence per line; sentences are tokenized val sents = FileUtils.getLinesFromFile(props.getProperty(INPUT)) - val tokenizedSents = sents.map(_.split("\\s+").toIterable) + val tokenizedSents = sents.map(_.split("\\s+").toSeq) proc.annotateFromTokens(tokenizedSents) } else { // assume raw text diff --git a/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala b/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala index 0009b0f04..47225a369 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala @@ -1,6 +1,6 @@ package org.clulab.processors.apps -import org.clulab.numeric.{displayMentions, setLabelsAndNorms} +import org.clulab.numeric.{displayMentions, mkLabelsAndNorms} import org.clulab.processors.clu.BalaurProcessor import org.clulab.utils.ReloadableProcessor import org.clulab.utils.ReloadableShell @@ -37,7 +37,7 @@ class NumericEntityRecognizerShell(ruleDirOpt: Option[String]) extends Reloadabl val doc = proc.get.annotate(text) val mentions = proc.get.numericEntityRecognizerOpt.map(_.extractFrom(doc)).getOrElse(Seq.empty) - setLabelsAndNorms(doc, mentions) + mkLabelsAndNorms(doc, mentions) displayMentions(mentions, doc) } diff --git a/build.sbt b/build.sbt index 0bc3a2e3c..1ee3d3420 100644 --- a/build.sbt +++ b/build.sbt @@ -18,7 +18,7 @@ val scala37 = "3.7.0" // up to 3.7.0 // Scala33: This is the first official LTS, but hold off until necessary. val scala3 = scala33 -ThisBuild / crossScalaVersions := Seq(scala213, scala3) +ThisBuild / crossScalaVersions := Seq(scala213) // , scala3) ThisBuild / scalaVersion := crossScalaVersions.value.head lazy val root = (project in file(".")) diff --git a/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala b/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala index add58c14d..bf5190dba 100644 --- a/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala +++ b/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala @@ -34,7 +34,7 @@ object EvalTimeNorm { } val doc = proc.annotate(docText) val mentions = ner.extractFrom(doc) - setLabelsAndNorms(doc, mentions) + mkLabelsAndNorms(doc, mentions) val prediction = mentions.collect{ case m: Norm if m.neLabel.equals("DATE") || m.neLabel.equals("DATE-RANGE") => (m.startOffset.toString, m.endOffset.toString, m.neNorm) diff --git a/library/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala b/library/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala index 3d5976a7d..73cc1940d 100644 --- a/library/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala +++ b/library/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala @@ -19,41 +19,29 @@ class NumericEntityRecognizer protected (val lexiconNer: LexiconNER, val actions new NumericEntityRecognizer(lexiconNer, actions, extractorEngine) } - /** Matches the lexicon NER on this document, setting the `entities` field */ - def matchLexiconNer(document: Document): Seq[Option[Array[String]]] = { - val originalEntities = new ArrayBuffer[Option[Array[String]]]() - - for(sent <- document.sentences) { - originalEntities += sent.entities - - val labels = lexiconNer.find(sent) - // this needs to happen in place, otherwise Odin does not see these labels - // we will restore the original Sentence.entities at the end in `extractFrom` - sent.entities = 
Some(labels) - // println(s"ENTITIES: ${sent.entities.get.mkString(" ")}") - } - - originalEntities - } - /** * Entry point for numeric entity recognition * @param doc Input document * @return sets in place the sequence of NER labels and sequence of NER norms (using the TempEval-2 notation) */ - def extractFrom(doc:Document): Seq[Mention] = { - // dictionaries - val originalEntities = matchLexiconNer(doc) - // grammars - var mentions = extractor.extractFrom(doc) + def extractFrom(doc: Document): Seq[Mention] = { + val newSentences = doc.sentences.map { sentence => + val newEntities = lexiconNer.find(sentence) + + sentence.copy(entities = Some(newEntities)) + } + val newDocument = doc.copy(sentences = newSentences) + val mentions = { + val dirtyMentions = extractor.extractFrom(newDocument) + val cleanMentions = actions.cleanupAction(dirtyMentions) - // restore the original entities - for(i <- originalEntities.indices) { - doc.sentences(i).entities = originalEntities(i) + cleanMentions } - // global actions *after* all grammars are done - actions.cleanupAction(mentions) + // These mentions will have doc pointing to the newDocument, + // but sentence will be the index into the new sentences and + // will be valid for the original doc. + mentions } } diff --git a/library/src/main/scala/org/clulab/numeric/package.scala b/library/src/main/scala/org/clulab/numeric/package.scala index 70559d0f9..d41438014 100644 --- a/library/src/main/scala/org/clulab/numeric/package.scala +++ b/library/src/main/scala/org/clulab/numeric/package.scala @@ -70,58 +70,62 @@ package object numeric { * @param doc This document is modified in place * @param mentions The numeric mentions previously extracted */ - def setLabelsAndNorms(doc: Document, mentions: Seq[Mention]): Unit = { - // - // initialize entities and norms - // - for (sentence <- doc.sentences) { - sentence.entities = sentence.entities.orElse(Some(Array.fill(sentence.size)("O"))) - sentence.norms = sentence.norms .orElse(Some(Array.fill(sentence.size)(""))) + def mkLabelsAndNorms(doc: Document, mentions: Seq[Mention]): (Array[Array[String]], Array[Array[String]]) = { + val allEntities = doc.sentences.map { sentence => + sentence.entities.getOrElse(Array.fill(sentence.size)("O")) } + val allNorms = doc.sentences.map { sentence => + sentence.norms.getOrElse(Array.fill(sentence.size)("")) + } + + for (mention <- mentions) { + if (NumericActions.isNumeric(mention) && mention.isInstanceOf[Norm]) { + val sentenceIndex = mention.sentence + val entities = allEntities(sentenceIndex) + val norms = allNorms(sentenceIndex) - // - // convert numeric entities to entity labels and norms - // - for(mention <- mentions) { - if(NumericActions.isNumeric(mention) && mention.isInstanceOf[Norm]) { - addLabelsAndNorms(mention.asInstanceOf[Norm], mention.sentenceObj, mention.tokenInterval) + addLabelsAndNorms(mention.asInstanceOf[Norm], entities, norms, mention.tokenInterval) + removeOneEntityBeforeAnother(entities, norms, "B-LOC", "MEASUREMENT-LENGTH") } } - removeOneEntityBeforeAnother(doc, "B-LOC", "MEASUREMENT-LENGTH") + + (allEntities, allNorms) } - def removeOneEntityBeforeAnother(doc: Document, triggerEntity: String, toBeRemovedShortened: String): Unit = { + def removeOneEntityBeforeAnother(entities: Array[String], norms: Array[String], triggerEntity: String, toBeRemovedShortened: String): Unit = { // removes entities and norms for unallowable entity sequences, e.g., don't extract 'in' as 'inch' before B-LOC in '... 
Sahal 108 in Senegal'
   // toBeRemovedShortened is entity without BIO-
-    for(s <- doc.sentences) {
-      val zippedEntities = s.entities.get.zipWithIndex
-      for ((e, i) <- zippedEntities) {
-        if (i > 0 && e == triggerEntity && s.entities.get(i-1).endsWith(toBeRemovedShortened)) {
-          s.entities.get(i - 1) = "O"
-          // go in reverse replacing indices and norms in the immediate preceding mention
-          breakable {
-            for ((en, j) <- zippedEntities.slice(0, i ).reverse) {
-              if (en.endsWith(toBeRemovedShortened)) {
-                s.entities.get(j) = "O"
-                s.norms.get(j) = ""
-              } else break()
-            }
+    val zippedEntities = entities.zipWithIndex
+
+    zippedEntities.foreach { case (outerEntity, outerIndex) =>
+      if (outerIndex > 0 && outerEntity == triggerEntity && entities(outerIndex - 1).endsWith(toBeRemovedShortened)) {
+        // Go in reverse replacing indices and norms in the immediate preceding mention.
+        breakable { // TODO: rewrite
+          for ((innerEntity, innerIndex) <- zippedEntities.slice(0, outerIndex).reverse) {
+            if (innerEntity.endsWith(toBeRemovedShortened)) {
+              entities(innerIndex) = "O"
+              norms(innerIndex) = ""
+            } else break()
          }
        }
      }
    }
  }
 
-  private def addLabelsAndNorms(m: Norm, s: Sentence, tokenInt: Interval): Unit = {
-    var first = true
+  private def addLabelsAndNorms(m: Norm, entities: Array[String], norms: Array[String], tokenInt: Interval): Unit = {
+    val label = m.neLabel
     val norm = m.neNorm
+
     // careful here: we may override some existing entities and norms
     // but, given that the numeric entity rules tend to be high precision, this is probably Ok...
-    for(i <- tokenInt.indices) {
-      val prefix = if(first) "B-" else "I-"
-      s.entities.get(i) = prefix + m.neLabel
-      s.norms.get(i) = norm
-      first = false
+    tokenInt.headOption.foreach { index =>
+      entities(index) = "B-" + label
+      norms(index) = norm
+    }
+    tokenInt.tail.foreach { index =>
+      entities(index) = "I-" + label
+      norms(index) = norm
    }
  }
}
diff --git a/library/src/main/scala/org/clulab/processors/Document.scala b/library/src/main/scala/org/clulab/processors/Document.scala
index ab5dbbba6..f8d226c56 100644
--- a/library/src/main/scala/org/clulab/processors/Document.scala
+++ b/library/src/main/scala/org/clulab/processors/Document.scala
@@ -29,10 +29,17 @@ class Document(
   protected val documentCreationTime:Option[String] = None
 ) extends Serializable {
 
-  def copy(sentences: Array[Sentence]): Document = ???
+  def copy(
+    sentences: Array[Sentence] = sentences,
+    id: Option[String] = id,
+    coreferenceChains: Option[CorefChains] = coreferenceChains,
+    text: Option[String] = text,
+    attachments: Option[mutable.HashMap[String, DocumentAttachment]] = None,
+    documentCreationTime: Option[String] = documentCreationTime
+  ): Document = new Document(sentences, id, coreferenceChains, text, attachments, documentCreationTime)
 
   /** Clears any internal state potentially constructed by the annotators */
-  def clear(): Unit = { }
+  // def clear(): Unit = { }
 
   /**
    * Used to compare Documents.
@@ -174,20 +181,18 @@ class Document(
     })
   }
 
-  // sentences are a val, so they must be initialized through the construction of a new Document.
-  // Thereafter, the remaining values can be assimilated from the old document. The shortcut
-  // is used so that subclasses don't have to duplicate almost everything in their copy.
-  def copy(sentences: Array[Sentence] = sentences, textOpt: Option[String] = text): Document = {
-    new Document(
-      sentences = sentences, // not this
-      id = this.id,
-      coreferenceChains = this.coreferenceChains,
-      text = textOpt, // not this
-      attachments = this.attachments,
-      documentCreationTime = this.documentCreationTime
-    )
-  }
 
   def offset(offset: Int): Document =
     // If a subclass of Document constructs itself with an attachment or a documentCreationTime that
     // would be overwritten on the copy(), then it should provide its own copy() method(s).
diff --git a/library/src/main/scala/org/clulab/processors/Processor.scala b/library/src/main/scala/org/clulab/processors/Processor.scala
index 9528d613d..00d5fcdf1 100644
--- a/library/src/main/scala/org/clulab/processors/Processor.scala
+++ b/library/src/main/scala/org/clulab/processors/Processor.scala
@@ -2,6 +2,8 @@ package org.clulab.processors
 
 import org.clulab.processors.clu.BalaurProcessor
 
+import scala.collection.mutable
+
 /**
   * User: mihais
   * Date: 3/1/13
@@ -21,31 +23,37 @@ trait Processor {
     require(documents.length > 1)
     val headDocument = documents.head
     val tailDocuments = documents.tail
-    val combinedSentences = documents.flatMap(_.sentences).toArray
-    val combinedDocument = new Document(combinedSentences)
     val headId = headDocument.id
 
     require(tailDocuments.forall(_.id == headId))
-    combinedDocument.id = headId
-
-    require(combinedDocument.text.isEmpty)
-    combinedDocument.text = combinedTextOpt
-
+    val headDctOpt = headDocument.getDCT
+    require(documents.tail.forall(_.getDCT == headDctOpt))
     // Coreference chains involve Mentions that include references to documents. The Mentions are being
     // moved to a new Document and it would be infeasible to move the chains.
-    require(combinedDocument.coreferenceChains.isEmpty)
     require(documents.forall(_.coreferenceChains.isEmpty))
 
+    val attachments = mutable.HashMap[String, DocumentAttachment]()
+
     documents.foreach { document =>
      document.getAttachmentKeys.foreach { attachmentKey =>
-        require(combinedDocument.getAttachment(attachmentKey).forall(_ == document.getAttachment(attachmentKey).get))
-        combinedDocument.addAttachment(attachmentKey, document.getAttachment(attachmentKey).get)
+        val valueOpt = attachments.get(attachmentKey)
+        val isValid = valueOpt.forall(_ == document.getAttachment(attachmentKey).get)
+
+        require(isValid, "The attachments cannot contradict each other.")
+        attachments(attachmentKey) = document.getAttachment(attachmentKey).get
      }
    }
 
-    val headDctOpt = headDocument.getDCT
-    require(documents.tail.forall(_.getDCT == headDctOpt))
-    headDctOpt.foreach(combinedDocument.setDCT)
+    val combinedSentences = documents.flatMap(_.sentences).toArray
+    val combinedDocument = new Document(
+      sentences = combinedSentences,
+      id = headId,
+      coreferenceChains = None,
+      text = combinedTextOpt,
+      attachments = Some(attachments),
+      documentCreationTime = headDctOpt
+    )
+
    combinedDocument
  }
 
@@ -94,10 +102,10 @@ trait Processor {
   // (2) It is more efficient during annotate() where all the possible operations are chained.
 
   /** Part of speech tagging; modifies the document in place. */
-  def tagPartsOfSpeech (doc:Document): Unit
+  def tagPartsOfSpeech(doc: Document): Unit
 
-  /** Lematization; modifies the document in place. */
-  def lemmatize (doc:Document): Unit
+  /** Lemmatization; returns the lemmas for the given words. */
+  def lemmatize(words: Array[String]): Array[String]
 
   /** Named Entity Recognition; modifies the document in place.
*/ def recognizeNamedEntities (doc:Document): Unit diff --git a/library/src/main/scala/org/clulab/processors/Sentence.scala b/library/src/main/scala/org/clulab/processors/Sentence.scala index 0465226c1..7158efecb 100644 --- a/library/src/main/scala/org/clulab/processors/Sentence.scala +++ b/library/src/main/scala/org/clulab/processors/Sentence.scala @@ -8,6 +8,66 @@ import org.clulab.utils.SeqUtils import scala.collection.mutable +case class WordTokenization(raw: String, startOffset: Int, endOffset: Int, word: String) + +// Is this SentenceTokenization, ArraySeq of WordTokenization +// Tokenation, Tokse +// Parseation, Parse +case class Tokenization( + raw: Array[String], + startOffsets: Array[Int], + endOffsets: Array[Int], + words: Array[String] +) { + + def reverse: Tokenization = { + Tokenization( + raw = raw.reverse, + startOffsets = startOffsets.reverse, + endOffsets = endOffsets.reverse, + words = words.reverse + ) + } +} + +// These are by the word ones and then there are relationships between words. +// So parse, might not be a thing that is per word. +//case class WordParse(tag: String, lemma: String, entity: String, norm: String, chunk: String) + +//case class SentenceParse(tags: Array[String], cyntacticTree, graphs, relations) + +// Again is this SentenceParse +case class Parse( + tags: Option[Array[String]] = None, + /** Lemmas */ + lemmas: Option[Array[String]] = None, + /** NE labels */ + entities: Option[Array[String]] = None, + /** Normalized values of named/numeric entities, such as dates */ + norms: Option[Array[String]] = None, + /** Shallow parsing labels */ + chunks: Option[Array[String]] = None, + /** Constituent tree of this sentence; includes head words */ + syntacticTree: Option[Tree] = None, + /** DAG of syntactic and semantic dependencies; word offsets start at 0 */ + graphs: GraphMap = GraphMap(), + /** Relation triples from OpenIE */ + relations:Option[Array[RelationTriple]] = None +) { + + def reverse: Parse = { + Parse( + tags = tags.map(_.reverse), + lemmas = lemmas.map(_.reverse), + entities = entities.map(_.reverse), + norms = norms.map(_.reverse), + chunks = chunks.map(_.reverse) + // TODO: reverse syntacticTree, graphs, and relations! 
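+      // One possible approach for the TODO above (a sketch only, not implemented in this patch):
+      // a DirectedGraph could be reversed by remapping every edge (source, destination, relation)
+      // to (n - 1 - source, n - 1 - destination, relation) and every root r to n - 1 - r,
+      // where n is the number of words in the sentence.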
+ ) + } +} + + /** Stores the annotations for a single sentence */ class Sentence( /** Raw tokens in this sentence; these MUST match the original text */ @@ -24,25 +84,33 @@ class Sentence( * However, the number of raw tokens MUST always equal the number of words, so if the exact text must be recovered, * please use the raw tokens with the same positions */ - val words: Array[String]) extends Serializable { + val words: Array[String], /** POS tags for words */ - var tags: Option[Array[String]] = None + val tags: Option[Array[String]] = None, /** Lemmas */ - var lemmas: Option[Array[String]] = None + val lemmas: Option[Array[String]] = None, /** NE labels */ - var entities: Option[Array[String]] = None + val entities: Option[Array[String]] = None, /** Normalized values of named/numeric entities, such as dates */ - var norms: Option[Array[String]] = None + val norms: Option[Array[String]] = None, /** Shallow parsing labels */ - var chunks: Option[Array[String]] = None + val chunks: Option[Array[String]] = None, /** Constituent tree of this sentence; includes head words */ - var syntacticTree: Option[Tree] = None + val syntacticTree: Option[Tree] = None, /** DAG of syntactic and semantic dependencies; word offsets start at 0 */ - var graphs: GraphMap = GraphMap() + val graphs: GraphMap = GraphMap(), /** Relation triples from OpenIE */ - var relations:Option[Array[RelationTriple]] = None + val relations:Option[Array[RelationTriple]] = None +) extends Serializable { + def getTokenization: Tokenization = { + Tokenization(raw, startOffsets, endOffsets, words) + } + + def getParse: Parse = { + Parse(tags, lemmas, entities, norms, chunks, syntacticTree, graphs, relations) + } def size:Int = raw.length @@ -150,42 +218,47 @@ class Sentence( } /** Reverts the current sentence */ - def revert():Sentence = { - val reverted = new Sentence( - SeqUtils.revert(raw).toArray, - SeqUtils.revert(startOffsets).toArray, - SeqUtils.revert(endOffsets).toArray, - SeqUtils.revert(words).toArray) - if(tags.nonEmpty) - reverted.tags = Some(SeqUtils.revert(tags.get).toArray) - if(lemmas.nonEmpty) - reverted.lemmas = Some(SeqUtils.revert(lemmas.get).toArray) - if(entities.nonEmpty) - reverted.entities = Some(SeqUtils.revert(entities.get).toArray) - if(norms.nonEmpty) - reverted.norms = Some(SeqUtils.revert(norms.get).toArray) - if(chunks.nonEmpty) - reverted.chunks = Some(SeqUtils.revert(chunks.get).toArray) + def revert(): Sentence = { + val reversedTokenization = this.getTokenization.reverse + val reversedParse = this.getParse.reverse + val reversedSentence = Sentence( + reversedTokenization.raw, + reversedTokenization.startOffsets, + reversedTokenization.endOffsets, + reversedTokenization.words + ) + // TODO: Make this work +// reversedSentence.tags = reversedParse.tags +// reversedSentence.lemmas = reversedParse.lemmas +// reversedSentence.entities = reversedParse.entities +// reversedSentence.norms = reversedParse.norms +// reversedSentence.chunks = reversedParse.chunks // TODO: revert syntacticTree and graphs! 
- reverted - } - - def assimilate(sentence: Sentence): Sentence = { - tags = sentence.tags - lemmas = sentence.lemmas - entities = sentence.entities - norms = sentence.norms - chunks = sentence.chunks - syntacticTree = sentence.syntacticTree - graphs = sentence.graphs - relations = sentence.relations - this + reversedSentence } - def copy(raw: Array[String] = raw, startOffsets: Array[Int] = startOffsets, endOffsets: Array[Int] = endOffsets, words: Array[String] = words): Sentence = - new Sentence(raw, startOffsets, endOffsets, words).assimilate(this) + // TODO + def copy( + raw: Array[String] = raw, + startOffsets: Array[Int] = startOffsets, + endOffsets: Array[Int] = endOffsets, + words: Array[String] = words, + + tags: Option[Array[String]] = tags, + lemmas: Option[Array[String]] = lemmas, + entities: Option[Array[String]] = entities, + norms: Option[Array[String]] = norms, + chunks: Option[Array[String]] = chunks, + syntacticTree: Option[Tree] = syntacticTree, + graphs: GraphMap = graphs, + relations: Option[Array[RelationTriple]] = relations + ): Sentence = + new Sentence( + raw, startOffsets, endOffsets, words, + tags, lemmas, entities, norms, chunks, syntacticTree, graphs, relations + ) def offset(offset: Int): Sentence = { if (offset == 0) this @@ -227,17 +300,9 @@ object Sentence { deps: GraphMap, relations: Option[Array[RelationTriple]] ): Sentence = { - val s = Sentence(raw, startOffsets, endOffsets, words) - // update annotations - s.tags = tags - s.lemmas = lemmas - s.entities = entities - s.norms = norms - s.chunks = chunks - s.syntacticTree = tree - s.graphs = deps - s.relations = relations - s + new Sentence( + raw, startOffsets, endOffsets, words, + tags, lemmas, entities, norms, chunks, tree, deps, relations + ) } - } \ No newline at end of file diff --git a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala index 92168b4bd..bde915cfc 100644 --- a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala +++ b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala @@ -17,8 +17,9 @@ object DocumentMaker { text:String, keepText:Boolean): Document = { val sents = tokenizer.tokenize(text) - val doc = new Document(sents) - if(keepText) doc.text = Some(text) + val textOpt = Option.when(keepText)(text) + val doc = Document(sents, textOpt) + doc } @@ -46,8 +47,9 @@ object DocumentMaker { //println("End offsets: " + sent.endOffsets.mkString(", ")) sents += sent } - val doc = new Document(sents.toArray) - if(keepText) doc.text = Some(sentences.mkString(mkSep(charactersBetweenSentences))) + val textOpt = Option.when(keepText)(sentences.mkString(mkSep(charactersBetweenSentences))) + val doc = Document(sents.toArray, textOpt) + doc } @@ -77,8 +79,9 @@ object DocumentMaker { } } - val doc = new Document(sents.toArray) - if(keepText) doc.text = Some(text.toString) + val textOpt = Option.when(keepText)(text.toString) + val doc = Document(sents.toArray, textOpt) + doc } diff --git a/library/src/main/scala/org/clulab/processors/clu/Veil.scala b/library/src/main/scala/org/clulab/processors/clu/Veil.scala index 6e4494ca4..481abf2b9 100644 --- a/library/src/main/scala/org/clulab/processors/clu/Veil.scala +++ b/library/src/main/scala/org/clulab/processors/clu/Veil.scala @@ -48,7 +48,7 @@ class VeiledText(originalText: String, veiledLetters: Seq[Range]) extends Veil { } protected def unveilDocument(veiledDocument: Document): Document = { - val unveiledDocument = 
veiledDocument.copy(textOpt = Some(originalText)) + val unveiledDocument = veiledDocument.copy(text = Some(originalText)) unveiledDocument } @@ -164,21 +164,27 @@ class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) val unveiledStartOffsets = originalSentence.startOffsets val unveiledEndOffsets = originalSentence.endOffsets val unveiledWords = originalSentence.words + val unveiledSentence = veiledSentence.copy(unveiledRaw, unveiledStartOffsets, unveiledEndOffsets, unveiledWords) def unveilStringArray(veiledArrayOpt: Option[Array[String]], veil: String): Option[Array[String]] = this.unveilStringArray(veiledArrayOpt, sentenceIndex, veil) - unveiledSentence.tags = unveilStringArray(unveiledSentence.tags, Veil.veiledTag) - unveiledSentence.lemmas = unveilStringArray(unveiledSentence.lemmas, Veil.veiledLemma) - unveiledSentence.entities = unveilStringArray(unveiledSentence.entities, Veil.veiledEntity) - unveiledSentence.norms = unveilStringArray(unveiledSentence.norms, Veil.veiledNorm) - unveiledSentence.chunks = unveilStringArray(unveiledSentence.chunks, Veil.veiledChunk) - - unveiledSentence.syntacticTree = unveilSyntacticTree(unveiledSentence.syntacticTree) - unveiledSentence.graphs = unveilGraphs(unveiledSentence.graphs, sentenceIndex) - unveiledSentence.relations = unveilRelations(unveiledSentence.relations) - unveiledSentence + val tags = unveilStringArray(unveiledSentence.tags, Veil.veiledTag) + val lemmas = unveilStringArray(unveiledSentence.lemmas, Veil.veiledLemma) + val entities = unveilStringArray(unveiledSentence.entities, Veil.veiledEntity) + val norms = unveilStringArray(unveiledSentence.norms, Veil.veiledNorm) + val chunks = unveilStringArray(unveiledSentence.chunks, Veil.veiledChunk) + + val syntacticTree = unveilSyntacticTree(unveiledSentence.syntacticTree) + val graphs = unveilGraphs(unveiledSentence.graphs, sentenceIndex) + val relations = unveilRelations(unveiledSentence.relations) + + val newSentence = Sentence( + unveiledSentence.raw, unveiledSentence.startOffsets, unveiledSentence.endOffsets, unveiledSentence.words, + tags, lemmas, entities, norms, chunks, syntacticTree, graphs, relations + ) + newSentence } protected def unveilDocument(veiledDocument: Document): Document = { diff --git a/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala b/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala index 8016375ee..7f3103591 100644 --- a/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala +++ b/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala @@ -102,13 +102,9 @@ class DocumentSerializer extends Logging { assert(bits(0) == END_OF_DOCUMENT, s"END_OF_DOCUMENT expected, found ${bits(0)}") - val doc = Document(sents.toArray) - doc.coreferenceChains = coref - doc.text = text - - // TODO: Hack by Enrique to resolve the document object for the relations - for(sen <- doc.sentences){ + /* + val relationsOpt = for(sen <- sents){ sen.relations match { case Some(relations) => val newRelations = relations.map(r => RelationTriple(r.confidence, r.subjectInterval, r.relationInterval, r.objectInterval)) @@ -116,13 +112,21 @@ class DocumentSerializer extends Logging { case None => () } } + */ - namedDocumentAttachmentsOpt.foreach { namedDocumentAttachments => - namedDocumentAttachments.foreach { case (name: String, documentAttachment: DocumentAttachment) => - doc.addAttachment(name, documentAttachment) - } + val attachmentsOpt = namedDocumentAttachmentsOpt.map { 
namedDocumentAttachments =>
+      val attachments = mutable.HashMap[String, DocumentAttachment]()
+
+      attachments.addAll(namedDocumentAttachments)
+      attachments
+    }
+    val doc = new Document(
+      sentences = sents.toArray,
+      coreferenceChains = coref,
+      text = text,
+      attachments = attachmentsOpt
+    )
 
    doc
  }
diff --git a/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala b/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala
index 181400b2d..1ae8f456f 100644
--- a/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala
+++ b/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala
@@ -1,8 +1,7 @@
 package org.clulab.serialization.json
 
 import java.io.File
-import org.clulab.processors.DocumentAttachmentBuilderFromJson
-import org.clulab.processors.{Document, Sentence}
+import org.clulab.processors.{Document, DocumentAttachment, DocumentAttachmentBuilderFromJson, Parse, Sentence}
 import org.clulab.struct.Edge
 import org.clulab.struct.{DirectedGraph, GraphMap}
 import org.clulab.utils.FileUtils
@@ -12,6 +11,8 @@
 import org.json4s._
 import org.json4s.jackson.JsonMethods._
 import org.json4s.jackson.prettyJson
+
+import scala.collection.mutable
 
 /** JSON serialization utilities */
 // This annotation is to avoid "Compiler synthesis of Manifest and OptManifest is deprecated".
@...
 object JSONSerializer {
 
   def jsonAST(f: File): JValue = jsonAST(FileUtils.getTextFromFile(f))
 
-  protected def addDocumentAttachments(doc: Document, jValue: JValue): Unit = {
+  protected def getDocumentAttachments(jValue: JValue): Option[mutable.HashMap[String, DocumentAttachment]] = {
     // See also DocumentSerializer for text version of nearly the same thing.
     (jValue \ DOCUMENT_ATTACHMENTS_KEY) match {
       case jObject: JObject =>
+        val attachments = new mutable.HashMap[String, DocumentAttachment]()
         val keys = jObject.values.keys
         keys.foreach { (key: String) =>
           (jObject \ key) match {
            case jObject: JObject =>
              val documentAttachmentBuilderClassName = (jObject \ DOCUMENT_ATTACHMENTS_BUILDER_KEY).extract[String]
              val documentAttachmentBuilder = obj.asInstanceOf[DocumentAttachmentBuilderFromJson]
              val value = (jObject \ DOCUMENT_ATTACHMENTS_VALUE_KEY)
              val documentAttachment = documentAttachmentBuilder.mkDocumentAttachment(value)
-              doc.addAttachment(key, documentAttachment)
+              attachments(key) = documentAttachment
            case jValue: JValue =>
              val text = prettyJson(jValue)
              throw new RuntimeException(s"ERROR: While deserializing document attachments expected JObject but found this: $text")
            case null => // noop. It should never get here. (Famous last words.) Scala 3 prefers null over _.
} } + Some(attachments) case _ => // Leave documentAttachments as is: None + None } } def toDocument(json: JValue): Document = { // recover sentences val sentences = (json \ "sentences").asInstanceOf[JArray].arr.map(sjson => toSentence(sjson)).toArray + val id = getStringOption(json, "id") + val text = getStringOption(json, "text") // initialize document - val d = Document(sentences) - // update id - d.id = getStringOption(json, "id") - // update text - d.text = getStringOption(json, "text") - addDocumentAttachments(d, json) + val attachments = getDocumentAttachments(json) + val d = new Document( + id = id, + sentences = sentences, + coreferenceChains = None, + text = text, + attachments = attachments + ) + d } def toDocument(docHash: String, djson: JValue): Document = toDocument(djson \ docHash) @@ -73,20 +82,29 @@ object JSONSerializer { case contents => Some(contents.extract[Array[String]]) } - val s = json.extract[Sentence] - val preferredSize = s.words.length - // build dependencies - val graphs = (json \ "graphs").extract[JObject].obj.map { case (key, json) => - key -> toDirectedGraph(json, Some(preferredSize)) - }.toMap - s.graphs = GraphMap(graphs) - // build labels - s.tags = getLabels(json, "tags") - s.lemmas = getLabels(json, "lemmas") - s.entities = getLabels(json, "entities") - s.norms = getLabels(json, "norms") - s.chunks = getLabels(json, "chunks") - s + val tokenizedSentence = json.extract[Sentence] + + val tags = getLabels(json, "tags") + val lemmas = getLabels(json, "lemmas") + val entities = getLabels(json, "entities") + val norms = getLabels(json, "norms") + val chunks = getLabels(json, "chunks") + val syntacticTree = None // TODO: Are these not serialized? + val graphs = { + val preferredSize = tokenizedSentence.words.length + val graphs = (json \ "graphs").extract[JObject].obj.map { case (key, json) => + key -> toDirectedGraph(json, Some(preferredSize)) + }.toMap + + GraphMap(graphs) + } + val relations = None // TODO: Are these not serialized? 
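+    // Note on the two TODOs above: the removed implementation did not restore syntacticTree or
+    // relations from JSON either, so leaving them None preserves the existing deserialization behavior.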
+    val parsedSentence = Sentence(
+      tokenizedSentence.raw, tokenizedSentence.startOffsets, tokenizedSentence.endOffsets, tokenizedSentence.words,
+      tags, lemmas, entities, norms, chunks, syntacticTree, graphs, relations
+    )
+
+    parsedSentence
   }

   def toDirectedGraph(json: JValue, preferredSizeOpt: Option[Int] = None): DirectedGraph[String] = {
diff --git a/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala b/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala
index 3c19d2c1d..63eab7913 100644
--- a/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala
+++ b/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala
@@ -23,29 +23,29 @@
 import scala.collection.mutable.{ArrayBuffer, ListBuffer}

 object ToEnhancedDependencies {
   type EdgeSpec = (Int, Int, String)

-  def generateStanfordEnhancedDependencies(sentence:Sentence, dg:DirectedGraph[String]): DirectedGraph[String] = {
+  def generateStanfordEnhancedDependencies(words: Array[String], tags: Array[String], dg:DirectedGraph[String]): DirectedGraph[String] = {
     val dgi = dg.toDirectedGraphIndex()
-    collapsePrepositionsStanford(sentence, dgi)
+    collapsePrepositionsStanford(words, dgi)
     raiseSubjects(dgi)
-    pushSubjectsObjectsInsideRelativeClauses(sentence, dgi, universal = false)
-    propagateSubjectsAndObjectsInConjVerbs(sentence, dgi, universal = false)
-    propagateConjSubjectsAndObjects(sentence, dgi)
-    dgi.toDirectedGraph(Some(sentence.size))
+    pushSubjectsObjectsInsideRelativeClauses(tags, dgi, universal = false)
+    propagateSubjectsAndObjectsInConjVerbs(tags, dgi, universal = false)
+    propagateConjSubjectsAndObjects(tags, dgi)
+    dgi.toDirectedGraph(Some(words.length))
   }

-  def generateUniversalEnhancedDependencies(sentence:Sentence, dg:DirectedGraph[String]): DirectedGraph[String] = {
+  def generateUniversalEnhancedDependencies(words: Array[String], lemmas: Array[String], tags: Array[String], dg: DirectedGraph[String]): DirectedGraph[String] = {
     val dgi = dg.toDirectedGraphIndex()
-    collapseMWEs(sentence, dgi)
-    val collapsedNmods = collapsePrepositionsUniversal(sentence, dgi)
+    collapseMWEs(lemmas, tags, dgi)
+    val collapsedNmods = collapsePrepositionsUniversal(words, lemmas, tags, dgi)
     replicateCollapsedNmods(collapsedNmods, dgi)
     raiseSubjects(dgi)
-    pushSubjectsObjectsInsideRelativeClauses(sentence, dgi, universal = true)
-    propagateSubjectsAndObjectsInConjVerbs(sentence, dgi, universal = true)
-    propagateConjSubjectsAndObjects(sentence, dgi)
+    pushSubjectsObjectsInsideRelativeClauses(tags, dgi, universal = true)
+    propagateSubjectsAndObjectsInConjVerbs(tags, dgi, universal = true) // requires tags
+    propagateConjSubjectsAndObjects(tags, dgi)
     mergeNsubjXcomp(dgi)
-    replicateCopulativeSubjects(sentence, dgi)
-    expandConj(sentence, dgi) // this must be last because several of the above methods expect "conj" labels
-    dgi.toDirectedGraph(Some(sentence.size))
+    replicateCopulativeSubjects(dgi)
+    expandConj(words, dgi) // this must be last because several of the above methods expect "conj" labels
+    dgi.toDirectedGraph(Some(words.length))
   }

   /**
@@ -66,7 +66,7 @@
    * Replicates copulative subjects across conjunctions
    * It is difficult and expensive => nsubj from 2 to 0 and from 4 to 0
    */
-  def replicateCopulativeSubjects(sentence: Sentence, dgi: DirectedGraphIndex[String]): Unit = {
+  def replicateCopulativeSubjects(dgi: DirectedGraphIndex[String]): Unit = {
     val nsubjs = dgi.findByName("nsubj")
     for(nsubj <- nsubjs) {
       val cops = dgi.findByHeadAndName(nsubj.source, "cop")
@@ -102,13 +102,13 @@
    * @param sentence
    * @param dgi
    */
-  def expandConj(sentence: Sentence, dgi: DirectedGraphIndex[String]): Unit = {
+  def expandConj(words: Array[String], dgi: DirectedGraphIndex[String]): Unit = {
     val toRemove = new ListBuffer[Edge[String]]
     val conjs = dgi.findByName("conj")
     for (conj <- conjs) {
       var shouldRemove = false
       for(cc <- dgi.findByName("cc").filter(_.source == conj.source)) {
-        val ccWord = sentence.words(cc.destination).toLowerCase()
+        val ccWord = words(cc.destination).toLowerCase()
         dgi.addEdge(conj.source, conj.destination, s"conj_$ccWord")
         shouldRemove = true
       }
@@ -125,12 +125,12 @@
    * @param sentence The sentence to operate on
    * @param dgi The directed graph of collapsed dependencies at this stage
    */
-  def collapsePrepositionsStanford(sentence:Sentence, dgi:DirectedGraphIndex[String]): Unit = {
+  def collapsePrepositionsStanford(words: Array[String], dgi:DirectedGraphIndex[String]): Unit = {
     val toRemove = new ListBuffer[Edge[String]]
     val preps = dgi.findByName("prep")
     for(prep <- preps) {
       toRemove += prep
-      val word = sentence.words(prep.destination)
+      val word = words(prep.destination)
       for(pobj <- dgi.findByName("pobj").filter(_.source == prep.destination)) {
         dgi.addEdge(prep.source, pobj.destination, s"prep_$word")
         toRemove += pobj
@@ -140,12 +140,12 @@
   }

   def collapsePrepositionsUniversal(
-    sentence:Sentence,
+    words: Array[String], lemmas: Array[String], tags: Array[String],
     dgi:DirectedGraphIndex[String]): Seq[EdgeSpec] = {
     val collapsedNmods = new ArrayBuffer[EdgeSpec]()
-    collapsePrepositionsUniversalNmodCase(sentence, dgi, collapsedNmods)
-    collapsePrepositionsUniversalDueTo(sentence, dgi, collapsedNmods)
+    collapsePrepositionsUniversalNmodCase(words, dgi, collapsedNmods)
+    collapsePrepositionsUniversalDueTo(lemmas, tags, dgi, collapsedNmods)
     collapsedNmods
   }

@@ -156,7 +156,7 @@
    * @param dgi The directed graph of collapsed dependencies at this stage
    */
   def collapsePrepositionsUniversalNmodCase(
-    sentence:Sentence,
+    words: Array[String],
     dgi:DirectedGraphIndex[String],
     collapsedNmods: ArrayBuffer[EdgeSpec]): Unit = {
@@ -166,9 +166,9 @@
     for(prep <- preps) {
       toRemove += prep
       for(c <- dgi.findByName("case").filter(_.source == prep.destination)) {
-        val word = sentence.words(c.destination).toLowerCase()
+        val word = words(c.destination).toLowerCase()
         // find multi-word prepositions such as "such as"
-        val mwe = findMultiWord(word, c.destination, sentence, dgi)
+        val mwe = findMultiWord(word, c.destination, words, dgi)
         // TODO: add nmod:agent (if word == "by") and passive voice here?
        dgi.addEdge(prep.source, prep.destination, s"nmod_$mwe")
@@ -189,16 +189,15 @@
    * @param dgi The directed graph of collapsed dependencies at this stage
    */
   def collapsePrepositionsUniversalDueTo(
-    sentence:Sentence,
+    lemmas: Array[String], tags: Array[String],
     dgi:DirectedGraphIndex[String],
     collapsedNmods: ArrayBuffer[EdgeSpec]): Unit = {

-    val tags = sentence.tags.get
     val toRemove = new ListBuffer[Edge[String]]
     var shouldRemove = false
     val preps = dgi.findByName("mwe")
     for(prep <- preps) {
-      if(sentence.lemmas.get(prep.source) == "due" && sentence.lemmas.get(prep.destination) == "to") {
+      if(lemmas(prep.source) == "due" && lemmas(prep.destination) == "to") {
         // found a "due to" MWE
         for(leftDep <- dgi.findByModifier(prep.source)) {
           // found the dep from "famine" to "due"
@@ -235,15 +234,15 @@
    * @param dgi
    */
   def collapseMWEs(
-    sentence:Sentence,
+    lemmas: Array[String],
+    tags: Array[String],
     dgi:DirectedGraphIndex[String]): Unit = {

-    val lemmas = sentence.lemmas.get
-    val tags = sentence.tags.get
+    val size = lemmas.length
     val toRemove = new ListBuffer[Edge[String]]
     var shouldRemove = true

-    for(i <- 0 until sentence.size - 1) {
+    for(i <- 0 until size - 1) {
       if(lemmas(i) == "due" && lemmas(i + 1) == "to" && tags(i) == "IN") {
         val toHeads = dgi.findByModifier(i + 1)
         var found = false
@@ -262,7 +261,7 @@
     if(shouldRemove) remove(toRemove, dgi)
   }

-  def findMultiWord(first: String, firstPos: Int, sentence: Sentence, dgi:DirectedGraphIndex[String]): String = {
+  def findMultiWord(first: String, firstPos: Int, words: Array[String], dgi:DirectedGraphIndex[String]): String = {
     val buffer = new StringBuilder
     buffer.append(first)

@@ -273,7 +272,7 @@
       if(mods.isEmpty) {
         done = true
       } else {
-        val word = sentence.words(mods.head.destination).toLowerCase()
+        val word = words(mods.head.destination).toLowerCase()
         buffer.append("_")
         buffer.append(word)
         head = mods.head.destination
@@ -303,9 +302,8 @@
    * @param sentence The sentence to operate on
    * @param dgi The directed graph of collapsed dependencies at this stage
    */
-  def propagateSubjectsAndObjectsInConjVerbs(sentence:Sentence, dgi:DirectedGraphIndex[String], universal:Boolean): Unit = {
+  def propagateSubjectsAndObjectsInConjVerbs(tags: Array[String], dgi:DirectedGraphIndex[String], universal:Boolean): Unit = {
     val conjs = dgi.findByName("conj").sortBy(_.source)
-    val tags = sentence.tags.get
     for(conj <- conjs) {
       val left = math.min(conj.source, conj.destination)
       val right = math.max(conj.source, conj.destination)
@@ -387,9 +385,8 @@
    * @param sentence The sentence to operate on
    * @param dgi The directed graph of collapsed dependencies at this stage
    */
-  def propagateConjSubjectsAndObjects(sentence:Sentence, dgi:DirectedGraphIndex[String]): Unit = {
+  def propagateConjSubjectsAndObjects(tags: Array[String], dgi:DirectedGraphIndex[String]): Unit = {
     val conjs = dgi.findByName("conj").sortBy(_.source)
-    val tags = sentence.tags.get
     for(conj <- conjs) {
       val left = math.min(conj.source, conj.destination)
       val right = math.max(conj.source, conj.destination)
@@ -424,11 +421,10 @@
    * @param sentence The sentence to operate on
    * @param dgi The directed graph of collapsed dependencies at this stage
    */
-  def pushSubjectsObjectsInsideRelativeClauses(sentence:Sentence, dgi:DirectedGraphIndex[String], universal:Boolean): Unit = {
+  def pushSubjectsObjectsInsideRelativeClauses(tags: Array[String], dgi:DirectedGraphIndex[String], universal:Boolean): Unit = {
     val rels =
       if(universal) dgi.findByName("acl:relcl")
       else dgi.findByName("rcmod")
-    val tags = sentence.tags.get

     for(rel <- rels) {
       val head = rel.source
diff --git a/library/src/test/scala-2.13/org/clulab/utils/TestHash.scala b/library/src/test/scala-2.13/org/clulab/utils/TestHash.scala
index 857f10727..31e03d8ec 100644
--- a/library/src/test/scala-2.13/org/clulab/utils/TestHash.scala
+++ b/library/src/test/scala-2.13/org/clulab/utils/TestHash.scala
@@ -34,7 +34,8 @@ class TestHash extends Test {
   behavior of "Hash"

   it should "compute the expected equivalence hash for a Document" in {
-    val expectedHash = 1145238653
+    val expectedHash = -1029127286
+//  val expectedHash = 1145238653
     val actualHash = document.equivalenceHash

     actualHash should be (expectedHash)
@@ -56,7 +57,8 @@
   }

   it should "compute the expected equivalence hashes for Mentions" in {
-    val expectedHashes = Array(1317064233, 418554464, 269168883, 1021871359, 1657321605)
+    val expectedHashes = Array(-674187334, 1183699787, 391766831, -495035159, -2089326276)
+//  val expectedHashes = Array(1317064233, 418554464, 269168883, 1021871359, 1657321605)
     val actualHashes = allMentions.map(getEquivalenceHash)

     actualHashes should be (expectedHashes)
diff --git a/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala b/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala
index 2a9214736..864505e79 100644
--- a/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala
+++ b/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala
@@ -656,7 +656,7 @@
   def numericParse(sentence: String): (Array[String], Array[String], Array[String]) = {
     val doc = proc.annotate(sentence)
     val mentions = ner.extractFrom(doc)
-    setLabelsAndNorms(doc, mentions)
+    mkLabelsAndNorms(doc, mentions)

     // assume 1 sentence per doc
     val sent = doc.sentences.head
diff --git a/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala b/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala
index 2bad85f6e..d1f104f8d 100644
--- a/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala
+++ b/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala
@@ -15,9 +15,9 @@ class TestSeasonNormalizer extends Test {
   def mkEntitiesAndNorms(processor: BalaurProcessor, text: String): (Array[String], Array[String]) = {
     val document = processor.annotate(text)
-    val mentions = processor.extractNumericEntityMentions(document)
+    val mentions = processor.numericEntityRecognizerOpt.get.extractFrom(document)

-    setLabelsAndNorms(document, mentions)
+    mkLabelsAndNorms(document, mentions)
     (document.sentences.head.entities.get, document.sentences.head.norms.get)
   }
diff --git a/library/src/test/scala/org/clulab/processors/TestLexiconNER.scala b/library/src/test/scala/org/clulab/processors/TestLexiconNER.scala
index 4c2fa4c37..48115479c 100644
--- a/library/src/test/scala/org/clulab/processors/TestLexiconNER.scala
+++ b/library/src/test/scala/org/clulab/processors/TestLexiconNER.scala
@@ -24,8 +24,8 @@
 import scala.util.Using

 class TestLexiconNER extends CluTest {
   def mkSentence(text: String): Sentence = {
-    val doc = proc.mkDocument(text)
-    proc.annotate(doc)
+    val simpleDoc = proc.mkDocument(text)
+    val doc = proc.annotate(simpleDoc)
     doc.sentences.head
   }
diff --git a/library/src/test/scala/org/clulab/processors/TestProcessor.scala b/library/src/test/scala/org/clulab/processors/TestProcessor.scala
index 2f57e439f..e6f1e0d1b 100644
--- a/library/src/test/scala/org/clulab/processors/TestProcessor.scala
+++ b/library/src/test/scala/org/clulab/processors/TestProcessor.scala
@@ -9,7 +9,6 @@ class TestProcessor extends CluTest {
   "Processor" should "tokenize raw text correctly" in {
     val doc = proc.mkDocument("John Doe went to China. There, he visited Beijing.")
-    doc.clear()

     doc.sentences(0).words(0) should be ("John")
     doc.sentences(0).words(1) should be ("Doe")
@@ -40,8 +39,8 @@
   }

   it should "POS tag correctly" in {
-    val doc = proc.mkDocument("John Doe went to China. There, he visited Beijing.")
-    proc.annotate(doc)
+    val simpleDoc = proc.mkDocument("John Doe went to China. There, he visited Beijing.")
+    val doc = proc.annotate(simpleDoc)

     doc.sentences(0).tags.get(0) should be ("NNP")
     doc.sentences(0).tags.get(1) should be ("NNP")
@@ -59,17 +58,16 @@
   }

   it should "POS tag parentheses correctly" in {
-    val doc = proc.mkDocument("This is a test (of parentheses).")
-    proc.annotate(doc)
+    val simpleDoc = proc.mkDocument("This is a test (of parentheses).")
+    val doc = proc.annotate(simpleDoc)

     doc.sentences(0).tags.get(4) should be ("-LRB-")
     doc.sentences(0).tags.get(7) should be ("-RRB-")
   }

   it should "recognize syntactic chunks correctly" in {
-    val doc = proc.mkDocument("He reckons the current account deficit will narrow to only 1.8 billion.")
-    proc.annotate(doc)
-    doc.clear()
+    val simpleDoc = proc.mkDocument("He reckons the current account deficit will narrow to only 1.8 billion.")
+    val doc = proc.annotate(simpleDoc)

     doc.sentences(0).chunks.get(0) should be ("B-NP")
     doc.sentences(0).chunks.get(1) should be ("B-VP")
@@ -86,9 +84,8 @@
   }

   it should "lemmatize text correctly" in {
-    val doc = proc.mkDocument("John Doe went to the shops.")
-    proc.annotate(doc)
-    doc.clear()
+    val simpleDoc = proc.mkDocument("John Doe went to the shops.")
+    val doc = proc.annotate(simpleDoc)

     doc.sentences(0).lemmas.get(0) should be ("john")
     doc.sentences(0).lemmas.get(2) should be ("go")
@@ -112,40 +109,44 @@
   }

   it should "parse MWEs correctly" in {
-    var sent = "Foods such as icecream are tasty."
-    var doc = proc.mkDocument(sent)
-    println(s"WORDS: ${doc.sentences.head.words.mkString(", ")}")
-
-    proc.annotate(doc)
-    println(s"Enhanced universal dependencies for sentence: $sent")
-    println(doc.sentences.head.universalEnhancedDependencies.get)
-
-    doc.sentences.head.universalEnhancedDependencies.get.hasEdge(0, 3, "nmod_such_as") should be (true)
-    doc.sentences.head.universalEnhancedDependencies.get.hasEdge(0, 3, "nmod") should be (false)
-
-    sent = "There was famine due to drought."
-    doc = proc.mkDocument(sent)
-    println(s"WORDS: ${doc.sentences.head.words.mkString(", ")}")
-
-    proc.annotate(doc)
-    println(s"Enhanced universal dependencies for sentence: $sent")
-    println(doc.sentences.head.universalEnhancedDependencies.get)
-
-    doc.sentences.head.universalEnhancedDependencies.get.hasEdge(2, 5, "nmod_due_to") should be (true)
-    doc.sentences.head.universalEnhancedDependencies.get.hasEdge(2, 3, "amod") should be (false)
-    doc.sentences.head.universalEnhancedDependencies.get.hasEdge(2, 5, "nmod") should be (false)
-
-    sent = "They ate cake due to hunger."
-    doc = proc.mkDocument(sent)
-    println(s"WORDS: ${doc.sentences.head.words.mkString(", ")}")
-
-    proc.annotate(doc)
-    println(s"Enhanced universal dependencies for sentence: $sent")
-    println(doc.sentences.head.universalEnhancedDependencies.get)
-
-    doc.sentences.head.universalEnhancedDependencies.get.hasEdge(1, 5, "nmod_due_to") should be (true)
-    doc.sentences.head.universalEnhancedDependencies.get.hasEdge(1, 3, "amod") should be (false)
-    doc.sentences.head.universalEnhancedDependencies.get.hasEdge(1, 5, "nmod") should be (false)
+    {
+      val sent = "Foods such as icecream are tasty."
+      val simpleDoc = proc.mkDocument(sent)
+      println(s"WORDS: ${simpleDoc.sentences.head.words.mkString(", ")}")
+
+      val doc = proc.annotate(simpleDoc)
+      println(s"Enhanced universal dependencies for sentence: $sent")
+      println(doc.sentences.head.universalEnhancedDependencies.get)
+
+      doc.sentences.head.universalEnhancedDependencies.get.hasEdge(0, 3, "nmod_such_as") should be(true)
+      doc.sentences.head.universalEnhancedDependencies.get.hasEdge(0, 3, "nmod") should be(false)
+    }
+    {
+      val sent = "There was famine due to drought."
+      val simpleDoc = proc.mkDocument(sent)
+      println(s"WORDS: ${simpleDoc.sentences.head.words.mkString(", ")}")
+
+      val doc = proc.annotate(simpleDoc)
+      println(s"Enhanced universal dependencies for sentence: $sent")
+      println(doc.sentences.head.universalEnhancedDependencies.get)
+
+      doc.sentences.head.universalEnhancedDependencies.get.hasEdge(2, 5, "nmod_due_to") should be(true)
+      doc.sentences.head.universalEnhancedDependencies.get.hasEdge(2, 3, "amod") should be(false)
+      doc.sentences.head.universalEnhancedDependencies.get.hasEdge(2, 5, "nmod") should be(false)
+    }
+    {
+      val sent = "They ate cake due to hunger."
+      val simpleDoc = proc.mkDocument(sent)
+      println(s"WORDS: ${simpleDoc.sentences.head.words.mkString(", ")}")
+
+      val doc = proc.annotate(simpleDoc)
+      println(s"Enhanced universal dependencies for sentence: $sent")
+      println(doc.sentences.head.universalEnhancedDependencies.get)
+
+      doc.sentences.head.universalEnhancedDependencies.get.hasEdge(1, 5, "nmod_due_to") should be(true)
+      doc.sentences.head.universalEnhancedDependencies.get.hasEdge(1, 3, "amod") should be(false)
+      doc.sentences.head.universalEnhancedDependencies.get.hasEdge(1, 5, "nmod") should be(false)
+    }
   }

   it should "parse incomplete sentence without crashing" in {
diff --git a/library/src/test/scala/org/clulab/serialization/json/TestJSONSerializer.scala b/library/src/test/scala/org/clulab/serialization/json/TestJSONSerializer.scala
index 5acf466d4..ceabd13f3 100644
--- a/library/src/test/scala/org/clulab/serialization/json/TestJSONSerializer.scala
+++ b/library/src/test/scala/org/clulab/serialization/json/TestJSONSerializer.scala
@@ -24,8 +24,8 @@ class TestJSONSerializer extends Test {

   "A Document with an ID" should "produce json with an \"id\" field" in {
-    val d = jsonStringToDocument(""" {"sentences":[{"raw":["Gonzo","married","Camilla","."], "words":["Gonzo","married","Camilla","."],"startOffsets":[0,6,14,21],"endOffsets":[5,13,21,22],"tags":["NNP","VBD","NNP","."],"lemmas":["Gonzo","marry","Camilla","."],"entities":["O","O","PERSON","O"],"norms":["O","O","O","O"],"chunks":["B-NP","B-VP","B-NP","O"],"graphs":{"stanford-basic":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]},"stanford-collapsed":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]}}}]} """)
-    d.id = Some("this-is-an-id")
+    val id = "this-is-an-id"
+    val d = jsonStringToDocument(s""" {"id":"$id","sentences":[{"raw":["Gonzo","married","Camilla","."], "words":["Gonzo","married","Camilla","."],"startOffsets":[0,6,14,21],"endOffsets":[5,13,21,22],"tags":["NNP","VBD","NNP","."],"lemmas":["Gonzo","marry","Camilla","."],"entities":["O","O","PERSON","O"],"norms":["O","O","O","O"],"chunks":["B-NP","B-VP","B-NP","O"],"graphs":{"stanford-basic":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]},"stanford-collapsed":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]}}}]} """)

     (d.jsonAST \ "id") should equal (JString("this-is-an-id"))
   }
@@ -35,8 +35,7 @@
   }

   "A Document with text" should "produce json with a \"text\" field" in {
-    val d = jsonStringToDocument(""" {"sentences":[{"raw":["Gonzo","married","Camilla","."], "words":["Gonzo","married","Camilla","."],"startOffsets":[0,6,14,21],"endOffsets":[5,13,21,22],"tags":["NNP","VBD","NNP","."],"lemmas":["Gonzo","marry","Camilla","."],"entities":["O","O","PERSON","O"],"norms":["O","O","O","O"],"chunks":["B-NP","B-VP","B-NP","O"],"graphs":{"stanford-basic":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]},"stanford-collapsed":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]}}}]} """)
-    d.text = Some(text)
+    val d = jsonStringToDocument(s""" {"text":"$text","sentences":[{"raw":["Gonzo","married","Camilla","."], "words":["Gonzo","married","Camilla","."],"startOffsets":[0,6,14,21],"endOffsets":[5,13,21,22],"tags":["NNP","VBD","NNP","."],"lemmas":["Gonzo","marry","Camilla","."],"entities":["O","O","PERSON","O"],"norms":["O","O","O","O"],"chunks":["B-NP","B-VP","B-NP","O"],"graphs":{"stanford-basic":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]},"stanford-collapsed":{"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"},{"source":1,"destination":3,"relation":"punct"}],"roots":[1]}}}]} """)

     (d.jsonAST \ "text") should equal (JString(text))
   }
@@ -61,11 +60,11 @@
     class Scratch(var document: Document) extends JSONSerialization {
       def jsonAST: JValue = document.jsonAST
     }
-
-    doc.text = Some("This is a test") // Original failing test requires text
+
+    val docWithText = doc.copy(sentences = doc.sentences, text = Some("This is a test"))
     val documentSerializer = new DocumentSerializer()
-    val expectedDocAsJSON = new Scratch(doc).json()
-    val docSaved = documentSerializer.save(doc, keepText = true)
+    val expectedDocAsJSON = new Scratch(docWithText).json()
+    val docSaved = documentSerializer.save(docWithText, keepText = true)
     val docLoaded = documentSerializer.load(docSaved)
     val actualDocAsJSON = new Scratch(docLoaded).json()
diff --git a/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala b/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala
index b84e337a3..b34393b2b 100644
--- a/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala
+++ b/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala
@@ -1,7 +1,6 @@
 package org.clulab.struct

-import org.clulab.processors.Document
-import org.clulab.processors.Sentence
+import org.clulab.processors.{Document, DocumentAttachment, Sentence}
 import org.clulab.serialization.DocumentSerializer
 import org.clulab.serialization.json._
 import org.clulab.struct.test.CaseClass
@@ -17,6 +16,7 @@
 import java.io.ByteArrayInputStream
 import java.io.ByteArrayOutputStream
 import java.io.ObjectInputStream
 import java.io.ObjectOutputStream
+import scala.collection.mutable
 import scala.util.Using

 class TestDocumentAttachment extends Test {
@@ -124,12 +124,13 @@
//  }

   "Document with TextNameDocumentAttachment" should "serialize as text" in {
-    val oldDocument = new Document(Array.empty[Sentence])
-
-    oldDocument.addAttachment(FIRST_KEY, new TextNameDocumentAttachment(FIRST_NAME))
-    oldDocument.addAttachment(MIDDLE_KEY, new TextNameDocumentAttachment(MIDDLE_NAME))
-    oldDocument.addAttachment(LAST_KEY, new TextNameDocumentAttachment(LAST_NAME))
-    oldDocument.addAttachment(ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME))
+    val oldAttachments = mutable.HashMap[String, DocumentAttachment](
+      (FIRST_KEY, new TextNameDocumentAttachment(FIRST_NAME)),
+      (MIDDLE_KEY, new TextNameDocumentAttachment(MIDDLE_NAME)),
+      (LAST_KEY, new TextNameDocumentAttachment(LAST_NAME)),
+      (ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME))
+    )
+    val oldDocument = new Document(sentences = Array.empty[Sentence], attachments = Some(oldAttachments))

     val documentSerializer = new DocumentSerializer()
     val documentString = documentSerializer.save(oldDocument)
@@ -146,12 +147,13 @@
   }

   "Document with ObjectNameDocumentAttachment" should "serialize as text" in {
-    val oldDocument = new Document(Array.empty[Sentence])
-
-    oldDocument.addAttachment(FIRST_KEY, new ObjectNameDocumentAttachment(FIRST_NAME))
-    oldDocument.addAttachment(MIDDLE_KEY, new ObjectNameDocumentAttachment(MIDDLE_NAME))
-    oldDocument.addAttachment(LAST_KEY, new ObjectNameDocumentAttachment(LAST_NAME))
-    oldDocument.addAttachment(ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME))
+    val oldAttachments = mutable.HashMap[String, DocumentAttachment](
+      (FIRST_KEY, new ObjectNameDocumentAttachment(FIRST_NAME)),
+      (MIDDLE_KEY, new ObjectNameDocumentAttachment(MIDDLE_NAME)),
+      (LAST_KEY, new ObjectNameDocumentAttachment(LAST_NAME)),
+      (ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME))
+    )
+    val oldDocument = new Document(sentences = Array.empty[Sentence], attachments = Some(oldAttachments))

     val documentSerializer = new DocumentSerializer()
     // This should be a messy string.
@@ -169,12 +171,14 @@
   }

   "Document with TextNameDocumentAttachments" should "serialize as json" in {
-    val oldDocument = new Document(Array.empty[Sentence])
+    val oldAttachments = mutable.HashMap[String, DocumentAttachment](
+      (FIRST_KEY, new TextNameDocumentAttachment(FIRST_NAME)),
+      (MIDDLE_KEY, new TextNameDocumentAttachment(MIDDLE_NAME)),
+      (LAST_KEY, new TextNameDocumentAttachment(LAST_NAME)),
+      (ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME))
+    )
+    val oldDocument = new Document(sentences = Array.empty[Sentence], attachments = Some(oldAttachments))

-    oldDocument.addAttachment(FIRST_KEY, new TextNameDocumentAttachment(FIRST_NAME))
-    oldDocument.addAttachment(MIDDLE_KEY, new TextNameDocumentAttachment(MIDDLE_NAME))
-    oldDocument.addAttachment(LAST_KEY, new TextNameDocumentAttachment(LAST_NAME))
-    oldDocument.addAttachment(ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME))
     // This shouldn't compile.
     /*oldDocument.addAttachment("wrong", new NameMethodAttachment("name"))*/
@@ -193,12 +197,13 @@
   }

   "Document with ObjectNameDocumentAttachment" should "serialize as json" in {
-    val oldDocument = new Document(Array.empty[Sentence])
-
-    oldDocument.addAttachment(FIRST_KEY, new ObjectNameDocumentAttachment(FIRST_NAME))
-    oldDocument.addAttachment(MIDDLE_KEY, new ObjectNameDocumentAttachment(MIDDLE_NAME))
-    oldDocument.addAttachment(LAST_KEY, new ObjectNameDocumentAttachment(LAST_NAME))
-    oldDocument.addAttachment(ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME))
+    val oldAttachments = mutable.HashMap[String, DocumentAttachment](
+      (FIRST_KEY, new ObjectNameDocumentAttachment(FIRST_NAME)),
+      (MIDDLE_KEY, new ObjectNameDocumentAttachment(MIDDLE_NAME)),
+      (LAST_KEY, new ObjectNameDocumentAttachment(LAST_NAME)),
+      (ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME))
+    )
+    val oldDocument = new Document(Array.empty[Sentence], attachments = Some(oldAttachments))

     // This should be a messy string.
     val documentString = prettyJson(renderJValue(oldDocument.jsonAST))
@@ -214,4 +219,3 @@
     /*require(newDocument == oldDocument)*/
   }
 }
-
diff --git a/library/src/test/scala/org/clulab/utils/TestFindHeads.scala b/library/src/test/scala/org/clulab/utils/TestFindHeads.scala
index 13390e71e..13e36fb85 100644
--- a/library/src/test/scala/org/clulab/utils/TestFindHeads.scala
+++ b/library/src/test/scala/org/clulab/utils/TestFindHeads.scala
@@ -9,10 +9,12 @@ class TestFindHeads extends Test {
   def newSentence(words: Array[String], directedGraph: DirectedGraph[String]): Sentence = {
     val startOffsets = Array(0) // unused
     val endOffsets = Array(0) // unused
-    val sentence = new Sentence(words, startOffsets, endOffsets, words)
+    val sentence = new Sentence(
+      words, startOffsets, endOffsets, words,
+      tags = Some(words)
+    )

     sentence.graphs(UNIVERSAL_BASIC) = directedGraph
-    sentence.tags = Some(words)
     sentence
   }

From 9ccca3686e524969295d0ee87cf3132ce0f47153 Mon Sep 17 00:00:00 2001
From: Keith Alcock
Date: Thu, 22 May 2025 22:37:23 -0700
Subject: [PATCH 05/42] Compile for Scala 3

---
 build.sbt                                     |  2 +-
 .../org/clulab/processors/Sentence.scala      | 97 +++----------------
 .../processors/clu/BalaurProcessor.scala      | 13 +--
 .../serialization/json/JSONSerializer.scala   |  2 +-
 .../scala/org/clulab/struct/Annotation.scala  | 39 ++++++++
 .../org/clulab/struct/Tokenization.scala      | 21 ++++
 6 files changed, 82 insertions(+), 92 deletions(-)
 create mode 100644 library/src/main/scala/org/clulab/struct/Annotation.scala
 create mode 100644 library/src/main/scala/org/clulab/struct/Tokenization.scala

diff --git a/build.sbt b/build.sbt
index 1ee3d3420..e7c465370 100644
--- a/build.sbt
+++ b/build.sbt
@@ -18,7 +18,7 @@ val scala37 = "3.7.0"   // up to 3.7.0
 // Scala33: This is the first official LTS, but hold off until necessary.
 val scala3 = scala33

-ThisBuild / crossScalaVersions := Seq(scala213) // , scala3)
+ThisBuild / crossScalaVersions := Seq(scala3, scala213)
 ThisBuild / scalaVersion := crossScalaVersions.value.head

 lazy val root = (project in file("."))
diff --git a/library/src/main/scala/org/clulab/processors/Sentence.scala b/library/src/main/scala/org/clulab/processors/Sentence.scala
index 7158efecb..42ce43b0b 100644
--- a/library/src/main/scala/org/clulab/processors/Sentence.scala
+++ b/library/src/main/scala/org/clulab/processors/Sentence.scala
@@ -1,73 +1,11 @@
 package org.clulab.processors

-import org.clulab.scala.WrappedArray._
 import org.clulab.struct.{DirectedGraph, GraphMap, RelationTriple, Tree}
 import org.clulab.struct.GraphMap._
 import org.clulab.utils.Hash
-import org.clulab.utils.SeqUtils

 import scala.collection.mutable

-case class WordTokenization(raw: String, startOffset: Int, endOffset: Int, word: String)
-
-// Is this SentenceTokenization, ArraySeq of WordTokenization
-// Tokenation, Tokse
-// Parseation, Parse
-case class Tokenization(
-  raw: Array[String],
-  startOffsets: Array[Int],
-  endOffsets: Array[Int],
-  words: Array[String]
-) {
-
-  def reverse: Tokenization = {
-    Tokenization(
-      raw = raw.reverse,
-      startOffsets = startOffsets.reverse,
-      endOffsets = endOffsets.reverse,
-      words = words.reverse
-    )
-  }
-}
-
-// These are by the word ones and then there are relationships between words.
-// So parse, might not be a thing that is per word.
-//case class WordParse(tag: String, lemma: String, entity: String, norm: String, chunk: String)
-
-//case class SentenceParse(tags: Array[String], cyntacticTree, graphs, relations)
-
-// Again is this SentenceParse
-case class Parse(
-  tags: Option[Array[String]] = None,
-  /** Lemmas */
-  lemmas: Option[Array[String]] = None,
-  /** NE labels */
-  entities: Option[Array[String]] = None,
-  /** Normalized values of named/numeric entities, such as dates */
-  norms: Option[Array[String]] = None,
-  /** Shallow parsing labels */
-  chunks: Option[Array[String]] = None,
-  /** Constituent tree of this sentence; includes head words */
-  syntacticTree: Option[Tree] = None,
-  /** DAG of syntactic and semantic dependencies; word offsets start at 0 */
-  graphs: GraphMap = GraphMap(),
-  /** Relation triples from OpenIE */
-  relations:Option[Array[RelationTriple]] = None
-) {
-
-  def reverse: Parse = {
-    Parse(
-      tags = tags.map(_.reverse),
-      lemmas = lemmas.map(_.reverse),
-      entities = entities.map(_.reverse),
-      norms = norms.map(_.reverse),
-      chunks = chunks.map(_.reverse)
-      // TODO: reverse syntacticTree, graphs, and relations!
-    )
-  }
-}
-
-
 /** Stores the annotations for a single sentence */
 class Sentence(
   /** Raw tokens in this sentence; these MUST match the original text */
@@ -104,14 +42,6 @@ class Sentence(
   val relations:Option[Array[RelationTriple]] = None
 ) extends Serializable {

-  def getTokenization: Tokenization = {
-    Tokenization(raw, startOffsets, endOffsets, words)
-  }
-
-  def getParse: Parse = {
-    Parse(tags, lemmas, entities, norms, chunks, syntacticTree, graphs, relations)
-  }
-
   def size:Int = raw.length

   def indices: Range = 0 until size
@@ -219,23 +149,22 @@ class Sentence(

   /** Reverts the current sentence */
   def revert(): Sentence = {
-    val reversedTokenization = this.getTokenization.reverse
-    val reversedParse = this.getParse.reverse
     val reversedSentence = Sentence(
-      reversedTokenization.raw,
-      reversedTokenization.startOffsets,
-      reversedTokenization.endOffsets,
-      reversedTokenization.words
+      raw.reverse,
+      startOffsets.reverse,
+      endOffsets.reverse,
+      words.reverse,
+      tags.map(_.reverse),
+      lemmas.map(_.reverse),
+      entities.map(_.reverse),
+      norms.map(_.reverse),
+      chunks.map(_.reverse),
+      // TODO: revert syntacticTree and graphs!
+      syntacticTree,
+      graphs,
+      relations
     )
-    // TODO: Make this work
-//    reversedSentence.tags = reversedParse.tags
-//    reversedSentence.lemmas = reversedParse.lemmas
-//    reversedSentence.entities = reversedParse.entities
-//    reversedSentence.norms = reversedParse.norms
-//    reversedSentence.chunks = reversedParse.chunks
-    // TODO: revert syntacticTree and graphs!
-
     reversedSentence
   }
diff --git a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala
index dc69b9363..68bd2c6de 100644
--- a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala
+++ b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala
@@ -3,9 +3,11 @@ package org.clulab.processors.clu
 import com.typesafe.config.Config
 import com.typesafe.config.ConfigFactory
 import org.clulab.numeric.{NumericEntityRecognizer, mkLabelsAndNorms}
+import org.clulab.processors.clu.tokenizer.{EnglishLemmatizer, Lemmatizer, OpenDomainEnglishTokenizer, OpenDomainPortugueseTokenizer, OpenDomainSpanishTokenizer, PortugueseLemmatizer, SpanishLemmatizer, Tokenizer}
 import org.clulab.processors.{Document, Processor, Sentence}
-import org.clulab.processors.clu.tokenizer._
-import org.clulab.scala.WrappedArray._
+
+import scala.collection.immutable.ArraySeq
+//import org.clulab.scala.WrappedArray._
 import org.clulab.scala_transformers.encoder.TokenClassifier
 import org.clulab.scala_transformers.encoder.EncoderMaxTokensRuntimeException
 import org.clulab.sequences.{LexiconNER, NamedEntity}
@@ -13,7 +15,6 @@ import org.clulab.struct.DirectedGraph
 import org.clulab.struct.GraphMap
 import org.clulab.utils.{Configured, MathUtils, ToEnhancedDependencies}
 import org.slf4j.{Logger, LoggerFactory}
-import org.clulab.odin.Mention
 import BalaurProcessor._
 import PostProcessor._
 import org.clulab.processors.hexatagging.HexaDecoder
@@ -149,7 +150,7 @@
     val lemmas = lemmatize(words)

     try {
-      val allLabelsAndScores = tokenClassifier.predictWithScores(words)
+      val allLabelsAndScores = tokenClassifier.predictWithScores(ArraySeq.unsafeWrapArray(words))
       val tags = mkPosTags(words, allLabelsAndScores(TASK_TO_INDEX(POS_TASK)))
       val entities = {
         val optionalEntities = mkOptionalNerLabels(words, sentence.startOffsets, sentence.endOffsets, tags, lemmas)
@@ -255,13 +256,13 @@
   private def mergeNerLabels(generic: Array[String], custom: Array[String]): Array[String] = {
     require(generic.length == custom.length)

-    val customNamedEntities = NamedEntity.collect(custom)
+    val customNamedEntities = NamedEntity.collect(ArraySeq.unsafeWrapArray(custom))
     val result = generic.toArray // A copy of the generic labels is created here.
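     // Both label sequences are reduced to NamedEntity spans below so that they
     // can be merged; the custom (lexicon) spans are assumed to take precedence.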
     if (customNamedEntities.isEmpty) result
     else {
-      val genericNamedEntities = NamedEntity.collect(generic)
+      val genericNamedEntities = NamedEntity.collect(ArraySeq.unsafeWrapArray(generic))

       //println(s"Generic NamedEntity: ${genericNamedEntities.mkString(", ")}")
       //println(s"Custom NamedEntity: ${customNamedEntities.mkString(", ")}")
diff --git a/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala b/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala
index 1ae8f456f..903bed37e 100644
--- a/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala
+++ b/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala
@@ -1,7 +1,7 @@
 package org.clulab.serialization.json

 import java.io.File
-import org.clulab.processors.{Document, DocumentAttachment, DocumentAttachmentBuilderFromJson, Parse, Sentence}
+import org.clulab.processors.{Document, DocumentAttachment, DocumentAttachmentBuilderFromJson, Sentence}
 import org.clulab.struct.Edge
 import org.clulab.struct.{DirectedGraph, GraphMap}
 import org.clulab.utils.FileUtils
diff --git a/library/src/main/scala/org/clulab/struct/Annotation.scala b/library/src/main/scala/org/clulab/struct/Annotation.scala
new file mode 100644
index 000000000..9489154f5
--- /dev/null
+++ b/library/src/main/scala/org/clulab/struct/Annotation.scala
@@ -0,0 +1,39 @@
+package org.clulab.struct
+
+import org.clulab.struct.GraphMap.GraphMap
+
+// These are the per-word ones, and then there are relationships between words.
+// So a parse might not be a thing that is per word.
+//case class WordParse(tag: String, lemma: String, entity: String, norm: String, chunk: String)
+
+//case class SentenceParse(tags: Array[String], syntacticTree, graphs, relations)
+
+case class Annotation(
+  tags: Option[Array[String]] = None,
+  /** Lemmas */
+  lemmas: Option[Array[String]] = None,
+  /** NE labels */
+  entities: Option[Array[String]] = None,
+  /** Normalized values of named/numeric entities, such as dates */
+  norms: Option[Array[String]] = None,
+  /** Shallow parsing labels */
+  chunks: Option[Array[String]] = None,
+  /** Constituent tree of this sentence; includes head words */
+  syntacticTree: Option[Tree] = None,
+  /** DAG of syntactic and semantic dependencies; word offsets start at 0 */
+  graphs: GraphMap = GraphMap(),
+  /** Relation triples from OpenIE */
+  relations:Option[Array[RelationTriple]] = None
+) {
+
+  def reverse: Annotation = {
+    Annotation(
+      tags = tags.map(_.reverse),
+      lemmas = lemmas.map(_.reverse),
+      entities = entities.map(_.reverse),
+      norms = norms.map(_.reverse),
+      chunks = chunks.map(_.reverse)
+      // TODO: reverse syntacticTree, graphs, and relations!
+    )
+  }
+}
diff --git a/library/src/main/scala/org/clulab/struct/Tokenization.scala b/library/src/main/scala/org/clulab/struct/Tokenization.scala
new file mode 100644
index 000000000..78e8b21da
--- /dev/null
+++ b/library/src/main/scala/org/clulab/struct/Tokenization.scala
@@ -0,0 +1,21 @@
+package org.clulab.struct
+
+// An alternative design would not use aligned arrays, but an array of structures.
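+// A hypothetical array-of-structures variant would be something like
+//   Array.tabulate(raw.length) { i =>
+//     WordTokenization(raw(i), startOffsets(i), endOffsets(i), words(i))
+//   }
+// at the cost of one object per token; the aligned arrays stay compact.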
+case class WordTokenization(raw: String, startOffset: Int, endOffset: Int, word: String)
+
+case class Tokenization(
+  raw: Array[String],
+  startOffsets: Array[Int],
+  endOffsets: Array[Int],
+  words: Array[String]
+) {
+
+  def reverse: Tokenization = {
+    Tokenization(
+      raw = raw.reverse,
+      startOffsets = startOffsets.reverse,
+      endOffsets = endOffsets.reverse,
+      words = words.reverse
+    )
+  }
+}

From 98c8115d5a842012ee5d60a4e302fce95552934d Mon Sep 17 00:00:00 2001
From: Keith Alcock
Date: Fri, 23 May 2025 00:15:01 -0700
Subject: [PATCH 06/42] Pass Scala3 tests

---
 .../serialization/json/JSONSerializer.scala   | 26 ++++++++++++-------
 .../scala-3/org/clulab/utils/TestHash.scala   |  6 +++--
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala b/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala
index 903bed37e..cac75f40a 100644
--- a/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala
+++ b/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala
@@ -77,21 +77,27 @@ object JSONSerializer {

   def toSentence(json: JValue): Sentence = {

-    def getLabels(json: JValue, k: String): Option[Array[String]] = json \ k match {
+    def getStrings(json: JValue, k: String): Array[String] = (json \ k).extract[Array[String]]
+
+    def getInts(json: JValue, k: String): Array[Int] = (json \ k).extract[Array[Int]]
+
+    def getLabelsOpt(json: JValue, k: String): Option[Array[String]] = json \ k match {
       case JNothing => None
       case contents => Some(contents.extract[Array[String]])
     }

-    val tokenizedSentence = json.extract[Sentence]
-
-    val tags = getLabels(json, "tags")
-    val lemmas = getLabels(json, "lemmas")
-    val entities = getLabels(json, "entities")
-    val norms = getLabels(json, "norms")
-    val chunks = getLabels(json, "chunks")
+    val raw = getStrings(json, "raw")
+    val startOffsets = getInts(json, "startOffsets")
+    val endOffsets = getInts(json, "endOffsets")
+    val words = getStrings(json, "words")
+    val tags = getLabelsOpt(json, "tags")
+    val lemmas = getLabelsOpt(json, "lemmas")
+    val entities = getLabelsOpt(json, "entities")
+    val norms = getLabelsOpt(json, "norms")
+    val chunks = getLabelsOpt(json, "chunks")
     val syntacticTree = None // TODO: Are these not serialized?
     val graphs = {
-      val preferredSize = tokenizedSentence.words.length
+      val preferredSize = words.length
       val graphs = (json \ "graphs").extract[JObject].obj.map { case (key, json) =>
         key -> toDirectedGraph(json, Some(preferredSize))
       }.toMap

       GraphMap(graphs)
     }
     val relations = None // TODO: Are these not serialized?
     val parsedSentence = Sentence(
-      tokenizedSentence.raw, tokenizedSentence.startOffsets, tokenizedSentence.endOffsets, tokenizedSentence.words,
+      raw, startOffsets, endOffsets, words,
       tags, lemmas, entities, norms, chunks, syntacticTree, graphs, relations
     )
diff --git a/library/src/test/scala-3/org/clulab/utils/TestHash.scala b/library/src/test/scala-3/org/clulab/utils/TestHash.scala
index c1dcf17a8..9a08e0ca5 100644
--- a/library/src/test/scala-3/org/clulab/utils/TestHash.scala
+++ b/library/src/test/scala-3/org/clulab/utils/TestHash.scala
@@ -35,7 +35,8 @@ class TestHash extends Test {
   behavior of "Hash"

   it should "compute the expected equivalence hash for a Document" in {
-    val expectedHash = 1145238653
+    val expectedHash = -1029127286
+//  val expectedHash = 1145238653
     val actualHash = document.equivalenceHash

     actualHash should be (expectedHash)
@@ -57,7 +58,8 @@
   }

   it should "compute the expected equivalence hashes for Mentions" in {
-    val expectedHashes = Array(1317064233, 418554464, 269168883, 1021871359, 1657321605)
+    val expectedHashes = Array(-674187334, 1183699787, 391766831, -495035159, -2089326276)
+//  val expectedHashes = Array(1317064233, 418554464, 269168883, 1021871359, 1657321605)
     val actualHashes = allMentions.map(getEquivalenceHash)

     actualHashes should be (expectedHashes)

From 211cd2aede935850649a24f3f4cff2a1115e239f Mon Sep 17 00:00:00 2001
From: Keith Alcock
Date: Fri, 23 May 2025 09:47:17 -0700
Subject: [PATCH 07/42] NumericUtils

---
 .../processors/apps/NumericEntityRecognizerShell.scala |  6 +++---
 .../src/main/scala/org/clulab/numeric/EvalTimeNorm.scala |  2 +-
 .../clulab/numeric/{package.scala => NumericUtils.scala} |  9 +++++----
 .../clulab/numeric/TestNumericEntityRecognition.scala    |  2 +-
 .../scala/org/clulab/numeric/TestSeasonNormalizer.scala  |  2 +-
 5 files changed, 11 insertions(+), 10 deletions(-)
 rename library/src/main/scala/org/clulab/numeric/{package.scala => NumericUtils.scala} (95%)

diff --git a/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala b/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala
index 47225a369..3a93ff4bd 100644
--- a/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala
+++ b/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala
@@ -1,6 +1,6 @@
 package org.clulab.processors.apps

-import org.clulab.numeric.{displayMentions, mkLabelsAndNorms}
+import org.clulab.numeric.NumericUtils
 import org.clulab.processors.clu.BalaurProcessor
 import org.clulab.utils.ReloadableProcessor
 import org.clulab.utils.ReloadableShell
@@ -37,8 +37,8 @@
     val doc = proc.get.annotate(text)
     val mentions = proc.get.numericEntityRecognizerOpt.map(_.extractFrom(doc)).getOrElse(Seq.empty)

-    mkLabelsAndNorms(doc, mentions)
-    displayMentions(mentions, doc)
+    NumericUtils.mkLabelsAndNorms(doc, mentions)
+    NumericUtils.displayMentions(mentions, doc)
   }

   def reload(): Unit = {
diff --git a/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala b/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala
index bf5190dba..08acac195 100644
--- a/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala
+++ b/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala
@@ -34,7 +34,7 @@
     }
     val doc = proc.annotate(docText)
     val mentions = ner.extractFrom(doc)
-    mkLabelsAndNorms(doc, mentions)
+    NumericUtils.mkLabelsAndNorms(doc, mentions)
     val prediction = mentions.collect{
       case m: Norm if m.neLabel.equals("DATE") || m.neLabel.equals("DATE-RANGE") =>
         (m.startOffset.toString, m.endOffset.toString, m.neNorm)
diff --git a/library/src/main/scala/org/clulab/numeric/package.scala b/library/src/main/scala/org/clulab/numeric/NumericUtils.scala
similarity index 95%
rename from library/src/main/scala/org/clulab/numeric/package.scala
rename to library/src/main/scala/org/clulab/numeric/NumericUtils.scala
index d41438014..261b8dc74 100644
--- a/library/src/main/scala/org/clulab/numeric/package.scala
+++ b/library/src/main/scala/org/clulab/numeric/NumericUtils.scala
@@ -1,13 +1,14 @@
-package org.clulab
+package org.clulab.numeric

 import org.clulab.numeric.actions.NumericActions
-import org.clulab.numeric.mentions.{DateMention, DateRangeMention, MeasurementMention, Norm, PercentageMention}
+import org.clulab.numeric.mentions.Norm
 import org.clulab.odin.{EventMention, Mention}
-import org.clulab.processors.{Document, Sentence}
+import org.clulab.processors.Document
 import org.clulab.struct.Interval
+
 import _root_.scala.util.control.Breaks._

-package object numeric {
+object NumericUtils {

   def displayMentions(mentions: Seq[Mention], doc: Document): Unit = {
     val mentionsBySentence = mentions.groupBy(_.sentence).map { case (sentence, mentions) =>
       sentence -> mentions.sortBy(_.start)
diff --git a/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala b/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala
index 864505e79..4bd2bd2ea 100644
--- a/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala
+++ b/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala
@@ -656,7 +656,7 @@
   def numericParse(sentence: String): (Array[String], Array[String], Array[String]) = {
     val doc = proc.annotate(sentence)
     val mentions = ner.extractFrom(doc)
-    mkLabelsAndNorms(doc, mentions)
+    NumericUtils.mkLabelsAndNorms(doc, mentions)

     // assume 1 sentence per doc
     val sent = doc.sentences.head
diff --git a/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala b/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala
index d1f104f8d..8f8fa38ff 100644
--- a/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala
+++ b/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala
@@ -17,7 +17,7 @@
     val document = processor.annotate(text)
     val mentions = processor.numericEntityRecognizerOpt.get.extractFrom(document)

-    mkLabelsAndNorms(document, mentions)
+    NumericUtils.mkLabelsAndNorms(document, mentions)
     (document.sentences.head.entities.get, document.sentences.head.norms.get)
   }

From 8ea1301601cb2e4b3052aac42396e70efc348830 Mon Sep 17 00:00:00 2001
From: Keith Alcock
Date: Fri, 23 May 2025 10:12:38 -0700
Subject: [PATCH 08/42] GraphMapType

---
 .../main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala | 5 +++--
 .../src/main/scala-2.13/org/clulab/struct/GraphMap.scala  | 8 ++++----
 library/src/main/scala-3/org/clulab/struct/GraphMap.scala | 8 ++++----
 .../src/main/scala/org/clulab/processors/Sentence.scala   | 6 +++---
 .../src/main/scala/org/clulab/processors/clu/Veil.scala   | 4 ++--
 .../scala/org/clulab/serialization/json/package.scala     | 2 +-
 library/src/main/scala/org/clulab/struct/Annotation.scala | 4 ++--
 7 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala b/library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala
index 57ad2411e..de5a5472f 100644
--- a/library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala
+++ b/library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala
@@ -7,10 +7,11 @@ class GraphMap protected extends mutable.HashMap[String, DirectedGraph[String]] {
 }

 object GraphMap extends GraphMapNames {
+  type GraphMapType = GraphMap

-  def apply(): GraphMap = new GraphMap()
+  def apply(): GraphMapType = new GraphMap()

-  def apply(existing: Map[String, DirectedGraph[String]]): GraphMap = {
+  def apply(existing: Map[String, DirectedGraph[String]]): GraphMapType = {
     val gm = GraphMap()
     gm ++= existing
   }
diff --git a/library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala b/library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala
index fd1b32794..805226874 100644
--- a/library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala
+++ b/library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala
@@ -6,14 +6,14 @@ object GraphMap extends GraphMapNames {
   // This was previously a class inheriting from HashMap.  However,
   // [warn] ...: inheritance from class HashMap in package mutable is deprecated (since 2.13.0): HashMap will be made final; use .withDefault for the common use case of computing a default value
-  type GraphMap = mutable.HashMap[String, DirectedGraph[String]]
+  type GraphMapType = mutable.HashMap[String, DirectedGraph[String]]

-  def apply(): GraphMap = {
+  def apply(): GraphMapType = {
     // we have very few dependency types, so let's create a small hash to save memory.
-    new GraphMap(2, mutable.HashMap.defaultLoadFactor)
+    new GraphMapType(2, mutable.HashMap.defaultLoadFactor)
   }

-  def apply(existing: scala.collection.Map[String, DirectedGraph[String]]): GraphMap = {
+  def apply(existing: scala.collection.Map[String, DirectedGraph[String]]): GraphMapType = {
     val gm = GraphMap()
     gm ++= existing
   }
diff --git a/library/src/main/scala-3/org/clulab/struct/GraphMap.scala b/library/src/main/scala-3/org/clulab/struct/GraphMap.scala
index fd1b32794..805226874 100644
--- a/library/src/main/scala-3/org/clulab/struct/GraphMap.scala
+++ b/library/src/main/scala-3/org/clulab/struct/GraphMap.scala
@@ -6,14 +6,14 @@ object GraphMap extends GraphMapNames {
   // This was previously a class inheriting from HashMap.  However,
   // [warn] ...: inheritance from class HashMap in package mutable is deprecated (since 2.13.0): HashMap will be made final; use .withDefault for the common use case of computing a default value
-  type GraphMap = mutable.HashMap[String, DirectedGraph[String]]
+  type GraphMapType = mutable.HashMap[String, DirectedGraph[String]]

-  def apply(): GraphMap = {
+  def apply(): GraphMapType = {
     // we have very few dependency types, so let's create a small hash to save memory.
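    // (An initial capacity of 2 presumably suffices because a sentence usually
    // carries only a couple of graphs, e.g. basic and enhanced dependencies.)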
- new GraphMap(2, mutable.HashMap.defaultLoadFactor) + new GraphMapType(2, mutable.HashMap.defaultLoadFactor) } - def apply(existing: scala.collection.Map[String, DirectedGraph[String]]): GraphMap = { + def apply(existing: scala.collection.Map[String, DirectedGraph[String]]): GraphMapType = { val gm = GraphMap() gm ++= existing } diff --git a/library/src/main/scala/org/clulab/processors/Sentence.scala b/library/src/main/scala/org/clulab/processors/Sentence.scala index 42ce43b0b..02396a4af 100644 --- a/library/src/main/scala/org/clulab/processors/Sentence.scala +++ b/library/src/main/scala/org/clulab/processors/Sentence.scala @@ -37,7 +37,7 @@ class Sentence( /** Constituent tree of this sentence; includes head words */ val syntacticTree: Option[Tree] = None, /** DAG of syntactic and semantic dependencies; word offsets start at 0 */ - val graphs: GraphMap = GraphMap(), + val graphs: GraphMapType = GraphMap(), /** Relation triples from OpenIE */ val relations:Option[Array[RelationTriple]] = None ) extends Serializable { @@ -181,7 +181,7 @@ class Sentence( norms: Option[Array[String]] = norms, chunks: Option[Array[String]] = chunks, syntacticTree: Option[Tree] = syntacticTree, - graphs: GraphMap = graphs, + graphs: GraphMapType = graphs, relations: Option[Array[RelationTriple]] = relations ): Sentence = new Sentence( @@ -226,7 +226,7 @@ object Sentence { norms: Option[Array[String]], chunks: Option[Array[String]], tree: Option[Tree], - deps: GraphMap, + deps: GraphMapType, relations: Option[Array[RelationTriple]] ): Sentence = { new Sentence( diff --git a/library/src/main/scala/org/clulab/processors/clu/Veil.scala b/library/src/main/scala/org/clulab/processors/clu/Veil.scala index 481abf2b9..20d136209 100644 --- a/library/src/main/scala/org/clulab/processors/clu/Veil.scala +++ b/library/src/main/scala/org/clulab/processors/clu/Veil.scala @@ -2,7 +2,7 @@ package org.clulab.processors.clu import org.clulab.processors.{Document, Processor, Sentence} import org.clulab.struct.{DirectedGraph, Edge, GraphMap, RelationTriple, Tree} -import org.clulab.struct.GraphMap._ +import org.clulab.struct.GraphMap.GraphMapType import scala.collection.mutable.{Set => MutableSet} @@ -136,7 +136,7 @@ class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) } } - def unveilGraphs(veiledGraphs: GraphMap, sentenceIndex: Int): GraphMap = { + def unveilGraphs(veiledGraphs: GraphMapType, sentenceIndex: Int): GraphMapType = { val unveilArray = unveilArrays(sentenceIndex) val unveiledGraphs = GraphMap() val originalLength = originalDocument.sentences(sentenceIndex).words.length diff --git a/library/src/main/scala/org/clulab/serialization/json/package.scala b/library/src/main/scala/org/clulab/serialization/json/package.scala index 27adb3fd9..06cf6715b 100644 --- a/library/src/main/scala/org/clulab/serialization/json/package.scala +++ b/library/src/main/scala/org/clulab/serialization/json/package.scala @@ -52,7 +52,7 @@ package object json { } } - implicit class GraphMapOps(gm: GraphMap) extends JSONSerialization { + implicit class GraphMapOps(gm: GraphMapType) extends JSONSerialization { def jsonAST: JValue = Extraction.decompose(gm.toMap.map { case (k, v) => k -> v.jsonAST }) // instead of mapValues } diff --git a/library/src/main/scala/org/clulab/struct/Annotation.scala b/library/src/main/scala/org/clulab/struct/Annotation.scala index 9489154f5..4323cecf3 100644 --- a/library/src/main/scala/org/clulab/struct/Annotation.scala +++ b/library/src/main/scala/org/clulab/struct/Annotation.scala @@ -1,6 
+1,6 @@ package org.clulab.struct -import org.clulab.struct.GraphMap.GraphMap +import org.clulab.struct.GraphMap.GraphMapType // These are by the word ones and then there are relationships between words. // So parse, might not be a thing that is per word. @@ -21,7 +21,7 @@ case class Annotation( /** Constituent tree of this sentence; includes head words */ syntacticTree: Option[Tree] = None, /** DAG of syntactic and semantic dependencies; word offsets start at 0 */ - graphs: GraphMap = GraphMap(), + graphs: GraphMapType = GraphMap(), /** Relation triples from OpenIE */ relations:Option[Array[RelationTriple]] = None ) { From 1996cf370c9451ea1bd0ba3661f20a39ce5146f8 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Fri, 23 May 2025 10:36:22 -0700 Subject: [PATCH 09/42] Scala2 --- library/build.sbt | 2 +- .../main/scala/org/clulab/processors/clu/DocumentMaker.scala | 2 ++ .../scala/org/clulab/serialization/DocumentSerializer.scala | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/library/build.sbt b/library/build.sbt index a562a04b5..9b2e770d0 100644 --- a/library/build.sbt +++ b/library/build.sbt @@ -54,7 +54,7 @@ libraryDependencies ++= { "org.scalatest" %% "scalatest" % "3.2.15" % Test, // up to 3.2.19, Apache-2.0 // for odin "org.apache.commons" % "commons-text" % "1.1", // up to 1.12.0, Apache-2.0 - "org.scala-lang.modules" %% "scala-collection-compat" % "2.11.0", // up to 2.12.0 // Apache-2.0 + "org.scala-lang.modules" %% "scala-collection-compat" % "2.13.0", // up to 2.13.0 // Apache-2.0 "org.scala-lang.modules" %% "scala-parser-combinators" % combinatorsVersion, // Apache-2.0 "org.yaml" % "snakeyaml" % "1.14", // up to 2.2, Apache-2.0 // progress bar for training diff --git a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala index bde915cfc..a00fcac83 100644 --- a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala +++ b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala @@ -7,6 +7,8 @@ import org.clulab.processors.Document import scala.collection.mutable.ArrayBuffer import org.clulab.processors.Sentence +import scala.collection.compat._ + class DocumentMaker object DocumentMaker { diff --git a/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala b/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala index 7f3103591..b4ccb0122 100644 --- a/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala +++ b/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala @@ -117,7 +117,7 @@ class DocumentSerializer extends Logging { val attachmentsOpt = namedDocumentAttachmentsOpt.map { namedDocumentAttachments => val attachments = mutable.HashMap[String, DocumentAttachment]() - attachments.addAll(namedDocumentAttachments) + attachments ++= namedDocumentAttachments attachments } From 39a9dca35c34e641b44418934e677cbd570e55ee Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Fri, 23 May 2025 13:35:35 -0700 Subject: [PATCH 10/42] Check in Balaur as well --- .../processors/clu/BalaurProcessor.scala | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala index 68bd2c6de..d33ca6336 100644 --- a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala +++ 
b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala @@ -2,23 +2,24 @@ package org.clulab.processors.clu import com.typesafe.config.Config import com.typesafe.config.ConfigFactory -import org.clulab.numeric.{NumericEntityRecognizer, mkLabelsAndNorms} -import org.clulab.processors.clu.tokenizer.{EnglishLemmatizer, Lemmatizer, OpenDomainEnglishTokenizer, OpenDomainPortugueseTokenizer, OpenDomainSpanishTokenizer, PortugueseLemmatizer, SpanishLemmatizer, Tokenizer} +import org.clulab.numeric.NumericEntityRecognizer +import org.clulab.numeric.NumericUtils import org.clulab.processors.{Document, Processor, Sentence} - -import scala.collection.immutable.ArraySeq +import org.clulab.processors.clu.tokenizer.{EnglishLemmatizer, Lemmatizer, OpenDomainEnglishTokenizer, OpenDomainPortugueseTokenizer, OpenDomainSpanishTokenizer, PortugueseLemmatizer, SpanishLemmatizer, Tokenizer} +import org.clulab.processors.hexatagging.HexaDecoder //import org.clulab.scala.WrappedArray._ -import org.clulab.scala_transformers.encoder.TokenClassifier import org.clulab.scala_transformers.encoder.EncoderMaxTokensRuntimeException +import org.clulab.scala_transformers.encoder.TokenClassifier import org.clulab.sequences.{LexiconNER, NamedEntity} import org.clulab.struct.DirectedGraph import org.clulab.struct.GraphMap +import org.clulab.struct.GraphMap.GraphMapType import org.clulab.utils.{Configured, MathUtils, ToEnhancedDependencies} import org.slf4j.{Logger, LoggerFactory} + +import scala.collection.compat.immutable.ArraySeq + import BalaurProcessor._ -import PostProcessor._ -import org.clulab.processors.hexatagging.HexaDecoder -import org.clulab.struct.GraphMap.GraphMap class BalaurProcessor protected ( val config: Config, @@ -40,8 +41,8 @@ class BalaurProcessor protected ( config, optionalNER, newNumericEntityRecognizerOpt(seasonPathOpt), - mkTokenizer(BalaurProcessor.getArgString(config, s"$prefix.language", Some("EN"))), - mkLemmatizer(BalaurProcessor.getArgString(config, s"$prefix.language", Some("EN"))), + mkTokenizer(getConfigArgString(config, s"$prefix.language", Some("EN"))), + mkLemmatizer(getConfigArgString(config, s"$prefix.language", Some("EN"))), // TokenClassifier.fromFiles(config.getString(s"$prefix.modelName")) TokenClassifier.fromResources(config.getString(s"$prefix.modelName")) ) @@ -185,7 +186,7 @@ class BalaurProcessor protected ( val fullyAnnotatedDocument = if (numericEntityRecognizerOpt.nonEmpty) { val numericMentions = numericEntityRecognizerOpt.get.extractFrom(partlyAnnotatedDocument) - val (newLabels, newNorms) = mkLabelsAndNorms(partlyAnnotatedDocument, numericMentions) + val (newLabels, newNorms) = NumericUtils.mkLabelsAndNorms(partlyAnnotatedDocument, numericMentions) val fullyAnnotatedSentences = partlyAnnotatedDocument.sentences.indices.map { index => partlyAnnotatedDocument.sentences(index).copy( entities = Some(newLabels(index)), @@ -205,7 +206,7 @@ class BalaurProcessor protected ( val tags = labels.map(_.head._1).toArray - postprocessPartOfSpeechTags(words, tags) + PostProcessor.postprocessPartOfSpeechTags(words, tags) tags } @@ -317,7 +318,7 @@ class BalaurProcessor protected ( words: Array[String], lemmas: Array[String], tags: Array[String], termTags: Array[Array[PredictionScore]], nonTermTags: Array[Array[PredictionScore]] - ): GraphMap = { + ): GraphMapType = { val verbose = false val graphs = GraphMap() val size = words.length @@ -391,7 +392,7 @@ object BalaurProcessor { } } - def getArgString (config: Config, argPath: String, defaultValue: Option[String]): 
String = + def getConfigArgString (config: Config, argPath: String, defaultValue: Option[String]): String = if (config.hasPath(argPath)) config.getString(argPath) else if(defaultValue.nonEmpty) defaultValue.get else throw new RuntimeException(s"ERROR: parameter $argPath must be defined!") From cec4087143b099e5b4c18bd0efd4dd0ba1c8306c Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Fri, 23 May 2025 14:03:23 -0700 Subject: [PATCH 11/42] Start with very basic compatibility --- .../processors/clu/BalaurProcessor.scala | 10 ++++----- .../org/clulab/sequences/NamedEntity.scala | 2 +- .../org/clulab/utils/WrappedArraySeq.scala | 21 +++++++++++++++++++ 3 files changed, 26 insertions(+), 7 deletions(-) create mode 100644 library/src/main/scala/org/clulab/utils/WrappedArraySeq.scala diff --git a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala index d33ca6336..62a47bda1 100644 --- a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala @@ -7,7 +7,7 @@ import org.clulab.numeric.NumericUtils import org.clulab.processors.{Document, Processor, Sentence} import org.clulab.processors.clu.tokenizer.{EnglishLemmatizer, Lemmatizer, OpenDomainEnglishTokenizer, OpenDomainPortugueseTokenizer, OpenDomainSpanishTokenizer, PortugueseLemmatizer, SpanishLemmatizer, Tokenizer} import org.clulab.processors.hexatagging.HexaDecoder -//import org.clulab.scala.WrappedArray._ +import org.clulab.utils.WrappedArraySeq import org.clulab.scala_transformers.encoder.EncoderMaxTokensRuntimeException import org.clulab.scala_transformers.encoder.TokenClassifier import org.clulab.sequences.{LexiconNER, NamedEntity} @@ -17,8 +17,6 @@ import org.clulab.struct.GraphMap.GraphMapType import org.clulab.utils.{Configured, MathUtils, ToEnhancedDependencies} import org.slf4j.{Logger, LoggerFactory} -import scala.collection.compat.immutable.ArraySeq - import BalaurProcessor._ class BalaurProcessor protected ( @@ -151,7 +149,7 @@ class BalaurProcessor protected ( val lemmas = lemmatize(words) try { - val allLabelsAndScores = tokenClassifier.predictWithScores(ArraySeq.unsafeWrapArray(words)) + val allLabelsAndScores = tokenClassifier.predictWithScores(WrappedArraySeq(words).toImmutableSeq) val tags = mkPosTags(words, allLabelsAndScores(TASK_TO_INDEX(POS_TASK))) val entities = { val optionalEntities = mkOptionalNerLabels(words, sentence.startOffsets, sentence.endOffsets, tags, lemmas) @@ -257,13 +255,13 @@ class BalaurProcessor protected ( private def mergeNerLabels(generic: Array[String], custom: Array[String]): Array[String] = { require(generic.length == custom.length) - val customNamedEntities = NamedEntity.collect(ArraySeq.unsafeWrapArray(custom)) + val customNamedEntities = NamedEntity.collect(WrappedArraySeq(custom).toImmutableSeq) val result = generic.toArray // A copy of the generic labels is created here. 
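// Aside, a minimal REPL-style sketch (not part of the patch) of what toImmutableSeq
// above relies on: scala-collection-compat's ArraySeq.unsafeWrapArray wraps the
// Array without copying. The upside is a zero-copy Seq; the caveat is that later
// writes to the array remain visible through the immutable-typed wrapper, so the
// wrapped array must not be reused.
import scala.collection.compat.immutable.ArraySeq

val labels = Array("B-PER", "O")
val wrapped: Seq[String] = ArraySeq.unsafeWrapArray(labels) // no copy is made
labels(1) = "I-PER"
assert(wrapped(1) == "I-PER") // the mutation shows through the wrapper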
if (customNamedEntities.isEmpty) result else { - val genericNamedEntities = NamedEntity.collect(ArraySeq.unsafeWrapArray(generic)) + val genericNamedEntities = NamedEntity.collect(WrappedArraySeq(generic).toImmutableSeq) //println(s"Generic NamedEntity: ${genericNamedEntities.mkString(", ")}") //println(s"Custom NamedEntity: ${customNamedEntities.mkString(", ")}") diff --git a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala index 1c2b2bcb9..5e65094f4 100644 --- a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala +++ b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala @@ -23,7 +23,7 @@ object NamedEntity { val INSIDE = "I-" val OUTSIDE = "O" - def collect(bioLabels: IndexedSeq[String]): IndexedSeq[NamedEntity] = { + def collect(bioLabels: Seq[String]): Seq[NamedEntity] = { def mkNamedEntity(label: String, begin: Int): NamedEntity = { // Start looking for the end one after the begin. diff --git a/library/src/main/scala/org/clulab/utils/WrappedArraySeq.scala b/library/src/main/scala/org/clulab/utils/WrappedArraySeq.scala new file mode 100644 index 000000000..a9f13f830 --- /dev/null +++ b/library/src/main/scala/org/clulab/utils/WrappedArraySeq.scala @@ -0,0 +1,21 @@ +package org.clulab.utils + +import scala.collection.mutable +import scala.collection.compat.immutable.ArraySeq + +class WrappedArraySeq[T](array: Array[T]) { + def toSeq: Seq[T] = toImmutableSeq + + def toMutableSeq: mutable.Seq[T] = { + array + } + + def toImmutableSeq: Seq[T] = { + ArraySeq.unsafeWrapArray(array) + } +} + +object WrappedArraySeq { + + def apply[T](array: Array[T]): WrappedArraySeq[T] = new WrappedArraySeq(array) +} From 57d1fa56adbaad12ac3054f85e8bc86aaf10f84e Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Fri, 23 May 2025 23:56:23 -0700 Subject: [PATCH 12/42] Down to last 13 --- build.sbt | 2 +- .../org/clulab/scala/SeqView.scala | 5 ++ .../scala-2.13/org/clulab/scala/SeqView.scala | 5 ++ .../scala-3/org/clulab/scala/SeqView.scala | 5 ++ .../org/clulab/numeric/NumericUtils.scala | 11 +-- .../numeric/actions/NumericActions.scala | 4 +- .../scala/org/clulab/odin/impl/Values.scala | 2 +- .../org/clulab/processors/Document.scala | 12 +-- .../org/clulab/processors/Processor.scala | 2 +- .../org/clulab/processors/Sentence.scala | 78 +++++++++---------- .../processors/clu/BalaurProcessor.scala | 30 +++---- .../clulab/processors/clu/DocumentMaker.scala | 11 +-- .../clulab/processors/clu/PostProcessor.scala | 2 +- .../org/clulab/processors/clu/Veil.scala | 6 +- .../clu/tokenizer/SentenceSplitter.scala | 11 +-- .../clulab/sequences/CombinedLexiconNER.scala | 6 +- .../clulab/sequences/CompactLexiconNER.scala | 14 ++-- .../org/clulab/sequences/LexiconNER.scala | 29 +++---- .../org/clulab/sequences/NamedEntity.scala | 6 +- .../sequences/SeparatedLexiconNER.scala | 2 +- .../scala/org/clulab/sequences/Tagger.scala | 2 +- .../serialization/CoNLLUSerializer.scala | 2 +- .../serialization/DocumentSerializer.scala | 21 ++--- .../serialization/json/JSONSerializer.scala | 2 +- .../clulab/serialization/json/package.scala | 2 +- .../org/clulab/struct/BooleanHashTrie.scala | 4 +- .../scala/org/clulab/struct/IntHashTrie.scala | 4 +- .../clulab/utils/ToEnhancedDependencies.scala | 22 +++--- 28 files changed, 161 insertions(+), 141 deletions(-) create mode 100644 library/src/main/scala-2.11_2.12/org/clulab/scala/SeqView.scala create mode 100644 library/src/main/scala-2.13/org/clulab/scala/SeqView.scala create mode 100644 
library/src/main/scala-3/org/clulab/scala/SeqView.scala diff --git a/build.sbt b/build.sbt index e7c465370..0d5ffcc14 100644 --- a/build.sbt +++ b/build.sbt @@ -18,7 +18,7 @@ val scala37 = "3.7.0" // up to 3.7.0 // Scala33: This is the first official LTS, but hold off until necessary. val scala3 = scala33 -ThisBuild / crossScalaVersions := Seq(scala3, scala213) +ThisBuild / crossScalaVersions := Seq(scala212, scala3, scala213, scala211) ThisBuild / scalaVersion := crossScalaVersions.value.head lazy val root = (project in file(".")) diff --git a/library/src/main/scala-2.11_2.12/org/clulab/scala/SeqView.scala b/library/src/main/scala-2.11_2.12/org/clulab/scala/SeqView.scala new file mode 100644 index 000000000..c49d930cb --- /dev/null +++ b/library/src/main/scala-2.11_2.12/org/clulab/scala/SeqView.scala @@ -0,0 +1,5 @@ +package org.clulab.scala + +object SeqView { + type Immutable[T] = scala.collection.SeqView[T, Seq[T]] +} diff --git a/library/src/main/scala-2.13/org/clulab/scala/SeqView.scala b/library/src/main/scala-2.13/org/clulab/scala/SeqView.scala new file mode 100644 index 000000000..d55c09e97 --- /dev/null +++ b/library/src/main/scala-2.13/org/clulab/scala/SeqView.scala @@ -0,0 +1,5 @@ +package org.clulab.scala + +object SeqView { + type Immutable[T] = scala.collection.View[T] +} diff --git a/library/src/main/scala-3/org/clulab/scala/SeqView.scala b/library/src/main/scala-3/org/clulab/scala/SeqView.scala new file mode 100644 index 000000000..d55c09e97 --- /dev/null +++ b/library/src/main/scala-3/org/clulab/scala/SeqView.scala @@ -0,0 +1,5 @@ +package org.clulab.scala + +object SeqView { + type Immutable[T] = scala.collection.View[T] +} diff --git a/library/src/main/scala/org/clulab/numeric/NumericUtils.scala b/library/src/main/scala/org/clulab/numeric/NumericUtils.scala index 261b8dc74..80149f1ad 100644 --- a/library/src/main/scala/org/clulab/numeric/NumericUtils.scala +++ b/library/src/main/scala/org/clulab/numeric/NumericUtils.scala @@ -71,12 +71,12 @@ object NumericUtils { * @param doc This document is modified in place * @param mentions The numeric mentions previously extracted */ - def mkLabelsAndNorms(doc: Document, mentions: Seq[Mention]): (Array[Array[String]], Array[Array[String]]) = { + def mkLabelsAndNorms(doc: Document, mentions: Seq[Mention]): (Seq[Seq[String]], Seq[Seq[String]]) = { val allEntities = doc.sentences.map { sentence => - sentence.entities.getOrElse(Array.fill(sentence.size)("O")) + sentence.entities.getOrElse(Seq.fill(sentence.size)("O")) } val allNorms = doc.sentences.map { sentence => - sentence.norms.getOrElse(Array.fill(sentence.size)("")) + sentence.norms.getOrElse(Seq.fill(sentence.size)("")) } for (mention <- mentions) { @@ -93,7 +93,7 @@ object NumericUtils { (allEntities, allNorms) } - def removeOneEntityBeforeAnother(entities: Array[String], norms: Array[String], triggerEntity: String, toBeRemovedShortened: String): Unit = { + def removeOneEntityBeforeAnother(entities: Seq[String], norms: Seq[String], triggerEntity: String, toBeRemovedShortened: String): Unit = { // removes entities and norms for unallowable entity sequences, e.g., don't extract 'in' as 'inch' before B-LOC in '... 
Sahal 108 in Senegal' // toBeRemovedShortened is entity without BIO- val zippedEntities = entities.zipWithIndex @@ -114,7 +114,8 @@ object NumericUtils { } } - private def addLabelsAndNorms(m: Norm, entities: Array[String], norms: Array[String], tokenInt: Interval): Unit = { + // TODO: These need to be mutable + private def addLabelsAndNorms(m: Norm, entities: Seq[String], norms: Seq[String], tokenInt: Interval): Unit = { val label = m.neLabel val norm = m.neNorm diff --git a/library/src/main/scala/org/clulab/numeric/actions/NumericActions.scala b/library/src/main/scala/org/clulab/numeric/actions/NumericActions.scala index e2c3fcf97..5d686c2ba 100644 --- a/library/src/main/scala/org/clulab/numeric/actions/NumericActions.scala +++ b/library/src/main/scala/org/clulab/numeric/actions/NumericActions.scala @@ -252,14 +252,14 @@ class NumericActions(seasonNormalizer: SeasonNormalizer, unitNormalizer: UnitNor /** filter out season homonyms (fall, spring) **/ def postprocessNumericEntities(mentions: Seq[Mention]): Seq[Mention] = { - def prevWordsMatch(words: Array[String], wordIndex: Int): Boolean = { + def prevWordsMatch(words: Seq[String], wordIndex: Int): Boolean = { val prevWords = words.slice(wordIndex - 2, wordIndex).map(_.toLowerCase) prevWords.exists(NumericActions.preSeasons) || prevWords.containsSlice(NumericActions.inThe) } - def contextWordsMatch(words: Array[String], wordIndex: Int): Boolean = { + def contextWordsMatch(words: Seq[String], wordIndex: Int): Boolean = { val window = 5 val contextWords = words.slice(wordIndex - window, wordIndex + window).map(_.toLowerCase) diff --git a/library/src/main/scala/org/clulab/odin/impl/Values.scala b/library/src/main/scala/org/clulab/odin/impl/Values.scala index c2e03bbd5..0b78e7f45 100644 --- a/library/src/main/scala/org/clulab/odin/impl/Values.scala +++ b/library/src/main/scala/org/clulab/odin/impl/Values.scala @@ -3,7 +3,7 @@ package org.clulab.odin.impl import org.clulab.processors.Document trait Values { - def values(strings: Option[Array[String]], msg: String): Array[String] = + def values(strings: Option[Seq[String]], msg: String): Seq[String] = strings match { case None => sys.error(msg) case Some(strings) => strings diff --git a/library/src/main/scala/org/clulab/processors/Document.scala b/library/src/main/scala/org/clulab/processors/Document.scala index f8d226c56..1cae6a826 100644 --- a/library/src/main/scala/org/clulab/processors/Document.scala +++ b/library/src/main/scala/org/clulab/processors/Document.scala @@ -17,7 +17,7 @@ import scala.collection.mutable * Last Modified: Add apply method to copy Document. 
*/ class Document( - val sentences: Array[Sentence], + val sentences: Seq[Sentence], /** Unique id for this document, if any */ val id: Option[String] = None, /** Clusters of coreferent mentions */ @@ -30,7 +30,7 @@ class Document( ) extends Serializable { def copy( - sentences: Array[Sentence] = sentences, + sentences: Seq[Sentence] = sentences, id: Option[String] = id, coreferenceChains: Option[CorefChains] = coreferenceChains, text: Option[String] = text, @@ -190,11 +190,11 @@ class Document( object Document { - def apply(sentences: Array[Sentence]): Document = apply(sentences, text = None) + def apply(sentences: Seq[Sentence]): Document = apply(sentences, text = None) - def apply(sentences: Array[Sentence], text: Option[String]): Document = apply(id = None, sentences, coref = None, text) + def apply(sentences: Seq[Sentence], text: Option[String]): Document = apply(id = None, sentences, coref = None, text) - def apply(id: Option[String], sentences: Array[Sentence], coref: Option[CorefChains], text: Option[String]): Document = { + def apply(id: Option[String], sentences: Seq[Sentence], coref: Option[CorefChains], text: Option[String]): Document = { val document = new Document( sentences, id = id, @@ -209,7 +209,7 @@ object Document { def apply(doc: Document): Document = apply(doc.id, doc.sentences, doc.coreferenceChains, doc.text) - def apply(doc: Document, sentences: Array[Sentence]): Document = { + def apply(doc: Document, sentences: Seq[Sentence]): Document = { val newDocument = new Document( sentences, id = doc.id, diff --git a/library/src/main/scala/org/clulab/processors/Processor.scala b/library/src/main/scala/org/clulab/processors/Processor.scala index 00d5fcdf1..e3df1e506 100644 --- a/library/src/main/scala/org/clulab/processors/Processor.scala +++ b/library/src/main/scala/org/clulab/processors/Processor.scala @@ -105,7 +105,7 @@ trait Processor { def tagPartsOfSpeech(doc: Document): Unit /** Lemmatization; modifies the document in place. */ - def lemmatize(words: Array[String]): Array[String] + def lemmatize(words: Seq[String]): Seq[String] /** Named Entity Recognition; modifies the document in place. 
*/ def recognizeNamedEntities (doc:Document): Unit diff --git a/library/src/main/scala/org/clulab/processors/Sentence.scala b/library/src/main/scala/org/clulab/processors/Sentence.scala index 02396a4af..c5d74a2a0 100644 --- a/library/src/main/scala/org/clulab/processors/Sentence.scala +++ b/library/src/main/scala/org/clulab/processors/Sentence.scala @@ -9,11 +9,11 @@ import scala.collection.mutable /** Stores the annotations for a single sentence */ class Sentence( /** Raw tokens in this sentence; these MUST match the original text */ - val raw: Array[String], + val raw: Seq[String], /** Start character offsets for the raw tokens; start at 0 */ - val startOffsets: Array[Int], + val startOffsets: Seq[Int], /** End character offsets for the raw tokens; start at 0 */ - val endOffsets: Array[Int], + val endOffsets: Seq[Int], /** * Words produced from raw tokens, closer to what the downstream components expect @@ -22,24 +22,24 @@ class Sentence( * However, the number of raw tokens MUST always equal the number of words, so if the exact text must be recovered, * please use the raw tokens with the same positions */ - val words: Array[String], + val words: Seq[String], /** POS tags for words */ - val tags: Option[Array[String]] = None, + val tags: Option[Seq[String]] = None, /** Lemmas */ - val lemmas: Option[Array[String]] = None, + val lemmas: Option[Seq[String]] = None, /** NE labels */ - val entities: Option[Array[String]] = None, + val entities: Option[Seq[String]] = None, /** Normalized values of named/numeric entities, such as dates */ - val norms: Option[Array[String]] = None, + val norms: Option[Seq[String]] = None, /** Shallow parsing labels */ - val chunks: Option[Array[String]] = None, + val chunks: Option[Seq[String]] = None, /** Constituent tree of this sentence; includes head words */ val syntacticTree: Option[Tree] = None, /** DAG of syntactic and semantic dependencies; word offsets start at 0 */ val graphs: GraphMapType = GraphMap(), /** Relation triples from OpenIE */ - val relations:Option[Array[RelationTriple]] = None + val relations:Option[Seq[RelationTriple]] = None ) extends Serializable { def size:Int = raw.length @@ -64,7 +64,7 @@ class Sentence( def equivalenceHash: Int = { val stringCode = "org.clulab.processors.Sentence" - def getAnnotationsHash(labelsOpt: Option[Array[_]]): Int = labelsOpt + def getAnnotationsHash(labelsOpt: Option[Seq[_]]): Int = labelsOpt .map { labels => val hs = labels.map(_.hashCode) val result = Hash.withLast(labels.length)( @@ -170,19 +170,19 @@ class Sentence( // TODO def copy( - raw: Array[String] = raw, - startOffsets: Array[Int] = startOffsets, - endOffsets: Array[Int] = endOffsets, - words: Array[String] = words, - - tags: Option[Array[String]] = tags, - lemmas: Option[Array[String]] = lemmas, - entities: Option[Array[String]] = entities, - norms: Option[Array[String]] = norms, - chunks: Option[Array[String]] = chunks, + raw: Seq[String] = raw, + startOffsets: Seq[Int] = startOffsets, + endOffsets: Seq[Int] = endOffsets, + words: Seq[String] = words, + + tags: Option[Seq[String]] = tags, + lemmas: Option[Seq[String]] = lemmas, + entities: Option[Seq[String]] = entities, + norms: Option[Seq[String]] = norms, + chunks: Option[Seq[String]] = chunks, syntacticTree: Option[Tree] = syntacticTree, graphs: GraphMapType = graphs, - relations: Option[Array[RelationTriple]] = relations + relations: Option[Seq[RelationTriple]] = relations ): Sentence = new Sentence( raw, startOffsets, endOffsets, words, @@ -203,31 +203,31 @@ class Sentence( object 
Sentence { def apply( - raw:Array[String], - startOffsets: Array[Int], - endOffsets: Array[Int]): Sentence = + raw:Seq[String], + startOffsets: Seq[Int], + endOffsets: Seq[Int]): Sentence = new Sentence(raw, startOffsets, endOffsets, raw) // words are identical to raw tokens (a common situation) def apply( - raw:Array[String], - startOffsets: Array[Int], - endOffsets: Array[Int], - words: Array[String]): Sentence = + raw:Seq[String], + startOffsets: Seq[Int], + endOffsets: Seq[Int], + words: Seq[String]): Sentence = new Sentence(raw, startOffsets, endOffsets, words) def apply( - raw: Array[String], - startOffsets: Array[Int], - endOffsets: Array[Int], - words: Array[String], - tags: Option[Array[String]], - lemmas: Option[Array[String]], - entities: Option[Array[String]], - norms: Option[Array[String]], - chunks: Option[Array[String]], + raw: Seq[String], + startOffsets: Seq[Int], + endOffsets: Seq[Int], + words: Seq[String], + tags: Option[Seq[String]], + lemmas: Option[Seq[String]], + entities: Option[Seq[String]], + norms: Option[Seq[String]], + chunks: Option[Seq[String]], tree: Option[Tree], deps: GraphMapType, - relations: Option[Array[RelationTriple]] + relations: Option[Seq[RelationTriple]] ): Sentence = { new Sentence( raw, startOffsets, endOffsets, words, diff --git a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala index 62a47bda1..df968610b 100644 --- a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala @@ -91,7 +91,7 @@ class BalaurProcessor protected ( } /** Lemmatization; modifies the document in place */ - override def lemmatize(words: Array[String]): Array[String] = { + override def lemmatize(words: Seq[String]): Seq[String] = { val lemmas = words.zipWithIndex.map { case (word, index) => val lemma = wordLemmatizer.lemmatizeWord(word) // a lemma may be empty in some weird Unicode situations @@ -109,7 +109,7 @@ class BalaurProcessor protected ( } /** Generates cheap lemmas with the word in lower case, for languages where a lemmatizer is not available */ - def cheapLemmatize(sentence: Sentence): Array[String] = { + def cheapLemmatize(sentence: Sentence): Seq[String] = { sentence.words.map(_.toLowerCase()) } @@ -149,7 +149,7 @@ class BalaurProcessor protected ( val lemmas = lemmatize(words) try { - val allLabelsAndScores = tokenClassifier.predictWithScores(WrappedArraySeq(words).toImmutableSeq) + val allLabelsAndScores = tokenClassifier.predictWithScores(words) val tags = mkPosTags(words, allLabelsAndScores(TASK_TO_INDEX(POS_TASK))) val entities = { val optionalEntities = mkOptionalNerLabels(words, sentence.startOffsets, sentence.endOffsets, tags, lemmas) @@ -199,7 +199,7 @@ class BalaurProcessor protected ( fullyAnnotatedDocument } - private def mkPosTags(words: Array[String], labels: Array[Array[(String, Float)]]): Array[String] = { + private def mkPosTags(words: Seq[String], labels: Seq[Array[(String, Float)]]): Seq[String] = { assert(labels.length == words.length) val tags = labels.map(_.head._1).toArray @@ -209,9 +209,9 @@ class BalaurProcessor protected ( } private def mkOptionalNerLabels( - words: Array[String], startOffsets: Array[Int], endOffsets: Array[Int], - tags: Array[String], lemmas: Array[String] - ): Option[Array[String]] = { + words: Seq[String], startOffsets: Seq[Int], endOffsets: Seq[Int], + tags: Seq[String], lemmas: Seq[String] + ): Option[Seq[String]] = { // NER 
labels from the custom NER optionalNER.map { ner => val sentence = Sentence( @@ -234,10 +234,10 @@ class BalaurProcessor protected ( } /** Must be called after assignPosTags and lemmatize because it requires Sentence.tags and Sentence.lemmas */ - private def mkNamedEntityLabels(words: Array[String], labels: Array[Array[(String, Float)]], optionalNERLabels: Option[Array[String]]): Array[String] = { + private def mkNamedEntityLabels(words: Seq[String], labels: Array[Array[(String, Float)]], optionalNERLabels: Option[Seq[String]]): Seq[String] = { assert(labels.length == words.length) - val genericLabels = NamedEntity.patch(labels.map(_.head._1).toArray) + val genericLabels = NamedEntity.patch(labels.map(_.head._1)) if (optionalNERLabels.isEmpty) { genericLabels @@ -252,16 +252,16 @@ class BalaurProcessor protected ( } } - private def mergeNerLabels(generic: Array[String], custom: Array[String]): Array[String] = { + private def mergeNerLabels(generic: Seq[String], custom: Seq[String]): Seq[String] = { require(generic.length == custom.length) - val customNamedEntities = NamedEntity.collect(WrappedArraySeq(custom).toImmutableSeq) + val customNamedEntities = NamedEntity.collect(custom) val result = generic.toArray // A copy of the generic labels is created here. if (customNamedEntities.isEmpty) result else { - val genericNamedEntities = NamedEntity.collect(WrappedArraySeq(generic).toImmutableSeq) + val genericNamedEntities = NamedEntity.collect(generic) //println(s"Generic NamedEntity: ${genericNamedEntities.mkString(", ")}") //println(s"Custom NamedEntity: ${customNamedEntities.mkString(", ")}") @@ -271,10 +271,10 @@ class BalaurProcessor protected ( } } - private def mkChunkLabels(words: Array[String], labels: Array[Array[(String, Float)]]): Array[String] = { + private def mkChunkLabels(words: Seq[String], labels: Array[Array[(String, Float)]]): Seq[String] = { assert(labels.length == words.length) - labels.map(_.head._1).toArray + labels.map(_.head._1) } // The head has one score, the label has another. Here the two scores are interpolated @@ -313,7 +313,7 @@ class BalaurProcessor protected ( } private def mkDependencyLabelsUsingHexaTags( - words: Array[String], lemmas: Array[String], tags: Array[String], + words: Seq[String], lemmas: Seq[String], tags: Seq[String], termTags: Array[Array[PredictionScore]], nonTermTags: Array[Array[PredictionScore]] ): GraphMapType = { diff --git a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala index a00fcac83..6fd8eaa76 100644 --- a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala +++ b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala @@ -1,13 +1,14 @@ package org.clulab.processors.clu -import org.slf4j.LoggerFactory -import org.slf4j.Logger -import org.clulab.processors.clu.tokenizer.Tokenizer import org.clulab.processors.Document -import scala.collection.mutable.ArrayBuffer import org.clulab.processors.Sentence +import org.clulab.processors.clu.tokenizer.Tokenizer +import org.clulab.scala.WrappedArrayBuffer._ +import org.slf4j.Logger +import org.slf4j.LoggerFactory import scala.collection.compat._ +import scala.collection.mutable.ArrayBuffer class DocumentMaker @@ -73,7 +74,7 @@ object DocumentMaker { charOffset += charactersBetweenTokens } // note: NO postprocessing happens in this case, so use it carefully! 
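// Aside: a sketch of the new side-effect-free lemmatization flow introduced above.
// Lemmas are now computed from the words alone and attached by copying the
// immutable Sentence, rather than written into the Document in place.
// (Illustrative only, assuming any concrete Processor instance.)
import org.clulab.processors.{Processor, Sentence}

def withLemmas(proc: Processor, sentence: Sentence): Sentence = {
  val lemmas = proc.lemmatize(sentence.words)
  sentence.copy(lemmas = Some(lemmas))
}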
- sents += new Sentence(sentence.toArray, startOffsets.toArray, endOffsets.toArray, sentence.toArray) + sents += new Sentence(sentence.toSeq, startOffsets, endOffsets, sentence.toSeq) charOffset += charactersBetweenSentences - charactersBetweenTokens if(keepText) { text.append(sentence.mkString(mkSep(charactersBetweenTokens))) diff --git a/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala index 8de6a5be2..5e3001f86 100644 --- a/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala @@ -15,7 +15,7 @@ object PostProcessor { val WET_OR_DRY_SEASON = Pattern.compile("""(?i)[0-9]+(ds|ws)""") /** POS tag corrections, in place */ - def postprocessPartOfSpeechTags(words: Array[String], tags: Array[String]): Array[String] = { + def postprocessPartOfSpeechTags(words: Seq[String], tags: Seq[String]): Seq[String] = { // unigram patterns words.indices.foreach { index => diff --git a/library/src/main/scala/org/clulab/processors/clu/Veil.scala b/library/src/main/scala/org/clulab/processors/clu/Veil.scala index 20d136209..aca16ee42 100644 --- a/library/src/main/scala/org/clulab/processors/clu/Veil.scala +++ b/library/src/main/scala/org/clulab/processors/clu/Veil.scala @@ -122,7 +122,7 @@ class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) originalDocument.copy(veiledSentences) } - def unveilStringArray(veiledArrayOpt: Option[Array[String]], sentenceIndex: Int, veil: String): Option[Array[String]] = { + def unveilStringArray(veiledArrayOpt: Option[Seq[String]], sentenceIndex: Int, veil: String): Option[Seq[String]] = { val unveilArray = unveilArrays(sentenceIndex) val originalLength = originalDocument.sentences(sentenceIndex).words.length @@ -156,7 +156,7 @@ class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) def unveilSyntacticTree(syntacticTreeOpt: Option[Tree]): Option[Tree] = syntacticTreeOpt // TODO - def unveilRelations(relations: Option[Array[RelationTriple]]): Option[Array[RelationTriple]] = relations + def unveilRelations(relations: Option[Seq[RelationTriple]]): Option[Seq[RelationTriple]] = relations protected def unveilSentence(veiledSentence: Sentence, sentenceIndex: Int): Sentence = { val originalSentence = originalDocument.sentences(sentenceIndex) @@ -167,7 +167,7 @@ class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) val unveiledSentence = veiledSentence.copy(unveiledRaw, unveiledStartOffsets, unveiledEndOffsets, unveiledWords) - def unveilStringArray(veiledArrayOpt: Option[Array[String]], veil: String): Option[Array[String]] = + def unveilStringArray(veiledArrayOpt: Option[Seq[String]], veil: String): Option[Seq[String]] = this.unveilStringArray(veiledArrayOpt, sentenceIndex, veil) val tags = unveilStringArray(unveiledSentence.tags, Veil.veiledTag) diff --git a/library/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala b/library/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala index f644da4f0..2b9dd435e 100644 --- a/library/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala +++ b/library/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala @@ -1,6 +1,7 @@ package org.clulab.processors.clu.tokenizer import org.clulab.processors.Sentence +import org.clulab.scala.WrappedArrayBuffer._ import java.io.{BufferedReader, InputStreamReader} import 
scala.collection.mutable.ArrayBuffer @@ -18,7 +19,7 @@ abstract class RuleBasedSentenceSplitter extends SentenceSplitter { * Sentence splitting over a stream of tokens * This includes detection of abbreviations as well **/ - override def split(tokens:Array[RawToken], sentenceSplit:Boolean):Array[Sentence] = { + override def split(tokens:Array[RawToken], sentenceSplit:Boolean):Seq[Sentence] = { val sentences = new ArrayBuffer[Sentence]() var raw = new ArrayBuffer[String]() var words = new ArrayBuffer[String]() @@ -67,7 +68,7 @@ abstract class RuleBasedSentenceSplitter extends SentenceSplitter { // found a valid end of sentence; start an empty one if (isEos) { - sentences += Sentence(raw.toArray, beginPositions.toArray, endPositions.toArray, words.toArray) + sentences += Sentence(raw.toSeq, beginPositions.toSeq, endPositions.toSeq, words.toSeq) raw = new ArrayBuffer[String]() words = new ArrayBuffer[String]() beginPositions = new ArrayBuffer[Int]() @@ -104,7 +105,7 @@ abstract class RuleBasedSentenceSplitter extends SentenceSplitter { // // create the current sentence // - sentences += Sentence(raw.toArray, beginPositions.toArray, endPositions.toArray, words.toArray) + sentences += Sentence(raw, beginPositions, endPositions, words) raw = new ArrayBuffer[String]() words = new ArrayBuffer[String]() beginPositions = new ArrayBuffer[Int]() @@ -130,10 +131,10 @@ abstract class RuleBasedSentenceSplitter extends SentenceSplitter { // a few words left over at the end if (words.nonEmpty) { - sentences += Sentence(raw.toArray, beginPositions.toArray, endPositions.toArray, words.toArray) + sentences += Sentence(raw, beginPositions, endPositions, words) } - sentences.toArray + sentences } def isAbbreviation(word:String):Boolean diff --git a/library/src/main/scala/org/clulab/sequences/CombinedLexiconNER.scala b/library/src/main/scala/org/clulab/sequences/CombinedLexiconNER.scala index cbc12a745..9c12ab411 100644 --- a/library/src/main/scala/org/clulab/sequences/CombinedLexiconNER.scala +++ b/library/src/main/scala/org/clulab/sequences/CombinedLexiconNER.scala @@ -64,7 +64,7 @@ class CombinedLexiconNER ( * @param sentence The input sentence * @return An array of BIO notations the store the outcome of the matches */ - def find(sentence: Sentence): Array[String] = { + def find(sentence: Sentence): Seq[String] = { val caseSensitiveTokens = getTokens(sentence) val caseInsensitiveTokens = if (hasCaseInsensitive) caseSensitiveTokens.map(_.toLowerCase) else caseSensitiveTokens val seq = findLongestMatch(sentence, caseSensitiveTokens, caseInsensitiveTokens) @@ -79,7 +79,7 @@ class CombinedLexiconNER ( * This means that the longest match is always chosen, even if coming from a matcher with lower priority * Only ties are disambiguated according to the order provided in the constructor */ - protected def findLongestMatch(sentence: Sentence, caseSensitiveTokens: Array[String], caseInsensitiveTokens: Array[String]): Array[String] = { + protected def findLongestMatch(sentence: Sentence, caseSensitiveTokens: Seq[String], caseInsensitiveTokens: Seq[String]): Seq[String] = { val labels = new Array[String](caseSensitiveTokens.length) val length = labels.length var offset = 0 @@ -91,7 +91,7 @@ class CombinedLexiconNER ( def getSpanAndIndex: CombinedLexiconNER.SpanAndIndex = { - def innerGetSpanAndIndex(condition: Boolean, intHashTrie: IntHashTrie, tokens: => Array[String]): CombinedLexiconNER.SpanAndIndex = { + def innerGetSpanAndIndex(condition: Boolean, intHashTrie: IntHashTrie, tokens: => Seq[String]): 
CombinedLexiconNER.SpanAndIndex = { if (condition) { val intTrieNodeMatch = intHashTrie.findAt(tokens, offset) CombinedLexiconNER.SpanAndIndex(intTrieNodeMatch.length, intTrieNodeMatch.completePath) diff --git a/library/src/main/scala/org/clulab/sequences/CompactLexiconNER.scala b/library/src/main/scala/org/clulab/sequences/CompactLexiconNER.scala index 924c8688f..08bee6769 100644 --- a/library/src/main/scala/org/clulab/sequences/CompactLexiconNER.scala +++ b/library/src/main/scala/org/clulab/sequences/CompactLexiconNER.scala @@ -56,7 +56,7 @@ class CompactLexiconNER( def getLabels: Seq[String] = labels - def find(sentence: Sentence): Array[String] = { + def find(sentence: Sentence): Seq[String] = { val caseSensitiveTokens = getTokens(sentence) val caseInsensitiveTokens = if (hasCaseInsensitive) caseSensitiveTokens.map(_.toLowerCase) @@ -66,14 +66,14 @@ class CompactLexiconNER( seq } - protected def findLongestMatch(sentence: Sentence, caseSensitiveTokens: Array[String], - caseInsensitiveTokens: Array[String]): Array[String] = { + protected def findLongestMatch(sentence: Sentence, caseSensitiveTokens: Seq[String], + caseInsensitiveTokens: Seq[String]): Seq[String] = { val labels = new Array[String](caseSensitiveTokens.length) val length = labels.length var offset = 0 - val caseSensitiveStringIds = if (hasCaseSensitive) caseSensitiveTokens.map( caseSensitiveCompactTrie.stringIds) else Array.empty[Int] - val caseInsensitiveStringIds = if (hasCaseInsensitive) caseInsensitiveTokens.map(caseInsensitiveCompactTrie.stringIds) else Array.empty[Int] + val caseSensitiveStringIds = if (hasCaseSensitive) caseSensitiveTokens.map( caseSensitiveCompactTrie.stringIds) else Seq.empty[Int] + val caseInsensitiveStringIds = if (hasCaseInsensitive) caseInsensitiveTokens.map(caseInsensitiveCompactTrie.stringIds) else Seq.empty[Int] // These are intended to cut down on the number of objects created. // It worked better when there was only one setting for case. 
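// Aside: a sketch of the by-name parameter idiom used by the matchers above
// (`tokens: => Seq[String]`). The argument is evaluated only on the branch that
// needs it, so the lower-cased token copy costs nothing when a matcher is case
// sensitive. (Names below are illustrative.)
def demo(condition: Boolean, tokens: => Seq[String]): Int =
  if (condition) tokens.length else 0

val words = Seq("Only", "If", "Needed")
demo(condition = false, { println("lowercasing"); words.map(_.toLowerCase) }) // prints nothing
demo(condition = true, { println("lowercasing"); words.map(_.toLowerCase) })  // prints "lowercasing" once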
@@ -88,7 +88,7 @@ class CompactLexiconNER( def updateSpanAndIndex(): Unit = { - def innerGetSpanAndIndex(condition: Boolean, stringIds: Array[Int], spanAndIndex: SpanAndIndex, + def innerGetSpanAndIndex(condition: Boolean, stringIds: Seq[Int], spanAndIndex: SpanAndIndex, compactTrie: CompactTrie): SpanAndIndex = { if (condition) { val id = stringIds(offset) @@ -136,7 +136,7 @@ class CompactLexiconNER( labels } - def findAt(ids: Array[Int], wordIndex: Int, nodeMatch: SpanAndIndex, compactTrie: CompactTrie): Unit = { + def findAt(ids: Seq[Int], wordIndex: Int, nodeMatch: SpanAndIndex, compactTrie: CompactTrie): Unit = { def linearSearch(value: Int, left: Int, right: Int): Int = { var index = left diff --git a/library/src/main/scala/org/clulab/sequences/LexiconNER.scala b/library/src/main/scala/org/clulab/sequences/LexiconNER.scala index b1c643fd3..cc8bebf16 100644 --- a/library/src/main/scala/org/clulab/sequences/LexiconNER.scala +++ b/library/src/main/scala/org/clulab/sequences/LexiconNER.scala @@ -1,6 +1,7 @@ package org.clulab.sequences import org.clulab.processors.Sentence +import org.clulab.scala.SeqView import org.clulab.scala.WrappedArray._ import org.clulab.struct.{EntityValidator, TrueEntityValidator} import org.clulab.utils.ArrayView @@ -55,7 +56,7 @@ abstract class LexiconNER(val knownCaseInsensitives: Set[String], val useLemmas: * @param sentence The input sentence * @return An array of BIO notations the store the outcome of the matches */ - def find(sentence: Sentence): Array[String] + def find(sentence: Sentence): Seq[String] def getLabels: Seq[String] /** @@ -74,49 +75,49 @@ abstract class LexiconNER(val knownCaseInsensitives: Set[String], val useLemmas: } } - def hasCondition(wordsView: ArrayView[String], condition: Char => Boolean): Boolean = + def hasCondition(wordsView: SeqView.Immutable[String], condition: Char => Boolean): Boolean = wordsView.exists(_.exists(condition)) - def hasLetter(wordsView: ArrayView[String]): Boolean = + def hasLetter(wordsView: SeqView.Immutable[String]): Boolean = hasCondition(wordsView, Character.isLetter) - def hasDigit(wordsView: ArrayView[String]): Boolean = + def hasDigit(wordsView: SeqView.Immutable[String]): Boolean = hasCondition(wordsView, Character.isDigit) - def hasUpperCaseLetters(wordsView: ArrayView[String]): Boolean = + def hasUpperCaseLetters(wordsView: SeqView.Immutable[String]): Boolean = hasCondition(wordsView, Character.isUpperCase) - def hasSpace(wordsView: ArrayView[String]): Boolean = wordsView.length > 1 + def hasSpace(wordsView: SeqView.Immutable[String]): Boolean = wordsView.size > 1 - def countCharacters(wordsView: ArrayView[String]): Int = + def countCharacters(wordsView: SeqView.Immutable[String]): Int = // Go ahead and calculate them all even though we only need to know if they exceed a value. wordsView.foldLeft(0) { (sum, word) => sum + word.length } - val contentQualifiers: Array[ArrayView[String] => Boolean] = Array( + val contentQualifiers: Array[SeqView.Immutable[String] => Boolean] = Array( // Start with the quick and easy ones. 
hasSpace, - { wordsView => countCharacters(wordsView) > LexiconNER.KNOWN_CASE_INSENSITIVE_LENGTH }, + { (wordsView: SeqView.Immutable[String]) => countCharacters(wordsView) > LexiconNER.KNOWN_CASE_INSENSITIVE_LENGTH }, hasDigit, hasUpperCaseLetters, - { wordsView => knownCaseInsensitives.contains(wordsView.head) } + { (wordsView: SeqView.Immutable[String]) => knownCaseInsensitives.contains(wordsView.head) } ) protected def contentfulSpan(sentence: Sentence, start: Int, length: Int): Boolean = { - val wordsView = ArrayView(sentence.words, start, start + length) + val wordsView = sentence.words.view(start, start + length) // A valid view/span must have a letter and at least one of the other qualifiers. val contentful = hasLetter(wordsView) && contentQualifiers.exists(_(wordsView)) contentful } - protected val getTokens: Sentence => Array[String] = + protected val getTokens: Sentence => Seq[String] = // Decide this once and for all and don't revisit it each time getTokens is called. if (useLemmas) getLemmas else getWords - protected def getLemmas(sentence: Sentence): Array[String] = sentence.lemmas.get + protected def getLemmas(sentence: Sentence): Seq[String] = sentence.lemmas.get - protected def getWords(sentence: Sentence): Array[String] = sentence.words + protected def getWords(sentence: Sentence): Seq[String] = sentence.words } object LexiconNER { diff --git a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala index 5e65094f4..3ad767ca0 100644 --- a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala +++ b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala @@ -57,7 +57,7 @@ object NamedEntity { bioLabels } - def isValid(bioLabels: Array[String], index: Int): Boolean = { + def isValid(bioLabels: Seq[String], index: Int): Boolean = { val currBioLabel = bioLabels(index) !currBioLabel.startsWith(INSIDE) || { 0 < index && { @@ -69,7 +69,7 @@ object NamedEntity { } } - def isValid(bioLabels: Array[String]): Boolean = + def isValid(bioLabels: Seq[String]): Boolean = bioLabels.indices.forall(isValid(bioLabels, _)) // Only INSIDEs can be invalid and they are made valid by @@ -78,7 +78,7 @@ object NamedEntity { BEGIN + bioLabel.drop(INSIDE.length) // Note that this patches the array in place! 
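// Aside: a sketch of how the version-specific SeqView.Immutable alias from the
// three scala-* source trees above is meant to unify view-based code such as
// contentfulSpan. The one name resolves to scala.collection.SeqView[T, Seq[T]]
// on 2.11/2.12 and to scala.collection.View[T] on 2.13 and 3, so the same
// helper cross-builds everywhere.
import org.clulab.scala.SeqView

def hasDigitInWindow(words: Seq[String], start: Int, end: Int): Boolean = {
  val view: SeqView.Immutable[String] = words.view.slice(start, end)
  view.exists(_.exists(Character.isDigit))
}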
- def patch(bioLabels: Array[String]): Array[String] = { + def patch(bioLabels: Seq[String]): Seq[String] = { bioLabels.indices.foreach { index => if (!isValid(bioLabels, index)) bioLabels(index) = toBegin(bioLabels(index)) diff --git a/library/src/main/scala/org/clulab/sequences/SeparatedLexiconNER.scala b/library/src/main/scala/org/clulab/sequences/SeparatedLexiconNER.scala index 435b91b5d..ac3053997 100644 --- a/library/src/main/scala/org/clulab/sequences/SeparatedLexiconNER.scala +++ b/library/src/main/scala/org/clulab/sequences/SeparatedLexiconNER.scala @@ -110,7 +110,7 @@ class SeparatedLexiconNER( labels } - protected def findAt(tokens: Array[String], caseInsensitiveTokens: Array[String], offset: Int): (Int, Int) = { + protected def findAt(tokens: Seq[String], caseInsensitiveTokens: Seq[String], offset: Int): (Int, Int) = { def findAt(matcher: BooleanHashTrie): Int = matcher.findAt(if (matcher.caseInsensitive) caseInsensitiveTokens else tokens, offset).length diff --git a/library/src/main/scala/org/clulab/sequences/Tagger.scala b/library/src/main/scala/org/clulab/sequences/Tagger.scala index 973e4dba2..8e3f54e5c 100644 --- a/library/src/main/scala/org/clulab/sequences/Tagger.scala +++ b/library/src/main/scala/org/clulab/sequences/Tagger.scala @@ -8,5 +8,5 @@ import org.clulab.processors.Sentence * Date: 10/12/17 */ trait Tagger[L] { - def find(sentence:Sentence):Array[L] + def find(sentence:Sentence):Seq[L] } diff --git a/library/src/main/scala/org/clulab/serialization/CoNLLUSerializer.scala b/library/src/main/scala/org/clulab/serialization/CoNLLUSerializer.scala index 523873e65..9508b133f 100644 --- a/library/src/main/scala/org/clulab/serialization/CoNLLUSerializer.scala +++ b/library/src/main/scala/org/clulab/serialization/CoNLLUSerializer.scala @@ -8,7 +8,7 @@ object CoNLLUSerializer { val UNDEF = "_" val ROOT = "root" - def getOrElseUndef(stringsOpt: Option[Array[String]], i: Int): String = + def getOrElseUndef(stringsOpt: Option[Seq[String]], i: Int): String = stringsOpt.map(_(i)).getOrElse(UNDEF) /** diff --git a/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala b/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala index b4ccb0122..c2f3f885c 100644 --- a/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala +++ b/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala @@ -3,6 +3,7 @@ package org.clulab.serialization import org.clulab.processors.DocumentAttachment import org.clulab.processors.DocumentAttachmentBuilderFromText import org.clulab.processors.{Document, Sentence} +import org.clulab.scala.WrappedArrayBuffer._ import org.clulab.struct._ import org.clulab.utils.Logging import org.json4s.DefaultFormats @@ -122,7 +123,7 @@ class DocumentSerializer extends Logging { } val doc = new Document( - sentences = sents.toArray, + sentences = sents, text = text, attachments = attachmentsOpt ) @@ -170,7 +171,7 @@ class DocumentSerializer extends Logging { Interval(t(0), t(1)) } - private def loadRelations(r: BufferedReader, sz: Int):Option[Array[RelationTriple]] = { + private def loadRelations(r: BufferedReader, sz: Int):Option[Seq[RelationTriple]] = { val ret = (0 until sz) map { _ => val line = r.readLine() @@ -178,7 +179,7 @@ class DocumentSerializer extends Logging { val relInterval = tokens(2) match { case "N" => None; case s => Some(mkRelationInterval(s)) } RelationTriple(tokens(0).toFloat, mkRelationInterval(tokens(1)), relInterval, mkRelationInterval(tokens(3))) } - Some(ret.toArray) + Some(ret) 
} private def loadSentence(r:BufferedReader): Sentence = { @@ -236,7 +237,7 @@ class DocumentSerializer extends Logging { var deps = GraphMap() var tree:Option[Tree] = None - var relations:Option[Array[RelationTriple]] = None + var relations:Option[Seq[RelationTriple]] = None while ({ bits = read(r) if (bits(0) == START_DEPENDENCIES) { @@ -256,10 +257,10 @@ class DocumentSerializer extends Logging { }) () Sentence( - rawBuffer.toArray, - startOffsetBuffer.toArray, - endOffsetBuffer.toArray, - wordBuffer.toArray, + rawBuffer, + startOffsetBuffer, + endOffsetBuffer, + wordBuffer, bufferOption(tagBuffer, nilTags), bufferOption(lemmaBuffer, nilLemmas), bufferOption(entityBuffer, nilEntities), @@ -292,10 +293,10 @@ class DocumentSerializer extends Logging { dg } - private def bufferOption[T: ClassTag](b:ArrayBuffer[T], allNils:Boolean): Option[Array[T]] = { + private def bufferOption[T: ClassTag](b:ArrayBuffer[T], allNils:Boolean): Option[Seq[T]] = { if (b.isEmpty) None else if (allNils) None - else Some(b.toArray) + else Some(b) } def save(doc:Document, os:PrintWriter): Unit = save(doc, os, keepText = false) diff --git a/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala b/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala index cac75f40a..26853d11b 100644 --- a/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala +++ b/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala @@ -81,7 +81,7 @@ object JSONSerializer { def getInts(json: JValue, k: String): Array[Int] = (json \ k).extract[Array[Int]] - def getLabelsOpt(json: JValue, k: String): Option[Array[String]] = json \ k match { + def getLabelsOpt(json: JValue, k: String): Option[Seq[String]] = json \ k match { case JNothing => None case contents => Some(contents.extract[Array[String]]) } diff --git a/library/src/main/scala/org/clulab/serialization/json/package.scala b/library/src/main/scala/org/clulab/serialization/json/package.scala index 06cf6715b..a27c14174 100644 --- a/library/src/main/scala/org/clulab/serialization/json/package.scala +++ b/library/src/main/scala/org/clulab/serialization/json/package.scala @@ -22,7 +22,7 @@ package object json { } // Arrays cannot be directly converted to JValue - implicit class ArrayOps(s: Option[Array[String]]) { + implicit class ArrayOps(s: Option[Seq[String]]) { def toSerializableJSON: Option[List[String]] = s match { case Some(s) => Some(s.toList) case None => None diff --git a/library/src/main/scala/org/clulab/struct/BooleanHashTrie.scala b/library/src/main/scala/org/clulab/struct/BooleanHashTrie.scala index fd0b586fa..dc4bed380 100644 --- a/library/src/main/scala/org/clulab/struct/BooleanHashTrie.scala +++ b/library/src/main/scala/org/clulab/struct/BooleanHashTrie.scala @@ -80,7 +80,7 @@ class BooleanHashTrie(val label: String, val caseInsensitive: Boolean = true) ex * When multiple paths are found, the longest one is kept * Text must be normalized (i.e., case folding) BEFORE this call, if necessary! 
*/ - def findAt(sequenceNormalized: Array[String], offset: Int): BooleanTrieNode.Match = { + def findAt(sequenceNormalized: Seq[String], offset: Int): BooleanTrieNode.Match = { val longestMatch = new BooleanTrieNode.Match() entries.get(sequenceNormalized(offset)).map { tree => @@ -129,7 +129,7 @@ case class BooleanTrieNode(token: String, var completePath: Boolean, var childre * @param longestMatch The value of the longest match interval * @return true if search should stop here; false otherwise */ - def find(sequence: Array[String], + def find(sequence: Seq[String], startOffset: Int, currentSpanLength: Int, longestMatch: BooleanTrieNode.Match): Boolean = { diff --git a/library/src/main/scala/org/clulab/struct/IntHashTrie.scala b/library/src/main/scala/org/clulab/struct/IntHashTrie.scala index 70a22984e..9b3403cc5 100644 --- a/library/src/main/scala/org/clulab/struct/IntHashTrie.scala +++ b/library/src/main/scala/org/clulab/struct/IntHashTrie.scala @@ -82,7 +82,7 @@ class IntHashTrie(val caseInsensitive: Boolean = true) extends Serializable { * When multiple paths are found, the longest one is kept * Text must be normalized (i.e., case folding) BEFORE this call, if necessary! */ - def findAt(sequenceNormalized: Array[String], offset: Int): IntTrieNode.Match = { + def findAt(sequenceNormalized: Seq[String], offset: Int): IntTrieNode.Match = { val longestMatch = new IntTrieNode.Match() entries.get(sequenceNormalized(offset)).map { tree => @@ -134,7 +134,7 @@ case class IntTrieNode(token:String, var completePath: Int, var children: Option * @param longestMatch The value of the longest match interval * @return true if search should stop here; false otherwise */ - def find(sequence: Array[String], + def find(sequence: Seq[String], startOffset: Int, currentSpanLength: Int, longestMatch: IntTrieNode.Match): Boolean = { diff --git a/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala b/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala index 63eab7913..e6da8e3b7 100644 --- a/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala +++ b/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala @@ -33,7 +33,7 @@ object ToEnhancedDependencies { dgi.toDirectedGraph(Some(words.length)) } - def generateUniversalEnhancedDependencies(words: Array[String], lemmas: Array[String], tags: Array[String], dg: DirectedGraph[String]): DirectedGraph[String] = { + def generateUniversalEnhancedDependencies(words: Seq[String], lemmas: Seq[String], tags: Seq[String], dg: DirectedGraph[String]): DirectedGraph[String] = { val dgi = dg.toDirectedGraphIndex() collapseMWEs(lemmas, tags, dgi) val collapsedNmods = collapsePrepositionsUniversal(words, lemmas, tags, dgi) @@ -102,7 +102,7 @@ object ToEnhancedDependencies { * @param sentence * @param dgi */ - def expandConj(words: Array[String], dgi: DirectedGraphIndex[String]): Unit = { + def expandConj(words: Seq[String], dgi: DirectedGraphIndex[String]): Unit = { val toRemove = new ListBuffer[Edge[String]] val conjs = dgi.findByName("conj") for (conj <- conjs) { @@ -140,7 +140,7 @@ object ToEnhancedDependencies { } def collapsePrepositionsUniversal( - words: Array[String], lemmas: Array[String], tags: Array[String], + words: Seq[String], lemmas: Seq[String], tags: Seq[String], dgi:DirectedGraphIndex[String]): Seq[EdgeSpec] = { val collapsedNmods = new ArrayBuffer[EdgeSpec]() @@ -156,7 +156,7 @@ object ToEnhancedDependencies { * @param dgi The directed graph of collapsed dependencies at this stage */ def 
collapsePrepositionsUniversalNmodCase( - words: Array[String], + words: Seq[String], dgi:DirectedGraphIndex[String], collapsedNmods: ArrayBuffer[EdgeSpec]): Unit = { @@ -189,7 +189,7 @@ object ToEnhancedDependencies { * @param dgi The directed graph of collapsed dependencies at this stage */ def collapsePrepositionsUniversalDueTo( - lemmas: Array[String], tags: Array[String], + lemmas: Seq[String], tags: Seq[String], dgi:DirectedGraphIndex[String], collapsedNmods: ArrayBuffer[EdgeSpec]): Unit = { @@ -234,8 +234,8 @@ object ToEnhancedDependencies { * @param dgi */ def collapseMWEs( - lemmas: Array[String], - tags: Array[String], + lemmas: Seq[String], + tags: Seq[String], dgi:DirectedGraphIndex[String]): Unit = { val size = lemmas.length @@ -261,7 +261,7 @@ object ToEnhancedDependencies { if(shouldRemove) remove(toRemove, dgi) } - def findMultiWord(first: String, firstPos: Int, words: Array[String], dgi:DirectedGraphIndex[String]): String = { + def findMultiWord(first: String, firstPos: Int, words: Seq[String], dgi:DirectedGraphIndex[String]): String = { val buffer = new StringBuilder buffer.append(first) @@ -302,7 +302,7 @@ object ToEnhancedDependencies { * @param sentence The sentence to operate on * @param dgi The directed graph of collapsed dependencies at this stage */ - def propagateSubjectsAndObjectsInConjVerbs(tags: Array[String], dgi:DirectedGraphIndex[String], universal:Boolean): Unit = { + def propagateSubjectsAndObjectsInConjVerbs(tags: Seq[String], dgi:DirectedGraphIndex[String], universal:Boolean): Unit = { val conjs = dgi.findByName("conj").sortBy(_.source) for(conj <- conjs) { val left = math.min(conj.source, conj.destination) @@ -385,7 +385,7 @@ object ToEnhancedDependencies { * @param sentence The sentence to operate on * @param dgi The directed graph of collapsed dependencies at this stage */ - def propagateConjSubjectsAndObjects(tags: Array[String], dgi:DirectedGraphIndex[String]): Unit = { + def propagateConjSubjectsAndObjects(tags: Seq[String], dgi:DirectedGraphIndex[String]): Unit = { val conjs = dgi.findByName("conj").sortBy(_.source) for(conj <- conjs) { val left = math.min(conj.source, conj.destination) @@ -421,7 +421,7 @@ object ToEnhancedDependencies { * @param sentence The sentence to operate on * @param dgi The directed graph of collapsed dependencies at this stage */ - def pushSubjectsObjectsInsideRelativeClauses(tags: Array[String], dgi:DirectedGraphIndex[String], universal:Boolean): Unit = { + def pushSubjectsObjectsInsideRelativeClauses(tags: Seq[String], dgi:DirectedGraphIndex[String], universal:Boolean): Unit = { val rels = if(universal) dgi.findByName("acl:relcl") else dgi.findByName("rcmod") From ed80611107350a5de81ce4a6791a11b5249999e2 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Sat, 24 May 2025 20:02:52 -0700 Subject: [PATCH 13/42] Finish compiling library --- .../org/clulab/numeric/NumericUtils.scala | 94 +++++++++++++++---- .../clulab/processors/clu/DocumentMaker.scala | 36 +++---- .../clulab/processors/clu/PostProcessor.scala | 44 ++++++++- .../clu/tokenizer/SentenceSplitter.scala | 76 +++++++-------- .../processors/clu/tokenizer/Tokenizer.scala | 4 +- .../org/clulab/sequences/NamedEntity.scala | 46 +++++++-- .../sequences/SeparatedLexiconNER.scala | 2 +- .../org/clulab/sequences/SequenceTagger.scala | 2 +- 8 files changed, 210 insertions(+), 94 deletions(-) diff --git a/library/src/main/scala/org/clulab/numeric/NumericUtils.scala b/library/src/main/scala/org/clulab/numeric/NumericUtils.scala index 80149f1ad..e60bb225d 100644 --- 
a/library/src/main/scala/org/clulab/numeric/NumericUtils.scala
+++ b/library/src/main/scala/org/clulab/numeric/NumericUtils.scala
@@ -5,7 +5,9 @@ import org.clulab.numeric.mentions.Norm
 import org.clulab.odin.{EventMention, Mention}
 import org.clulab.processors.Document
 import org.clulab.struct.Interval
+import org.clulab.utils.WrappedArraySeq
 
+import scala.collection.mutable
 import _root_.scala.util.control.Breaks._
 
 object NumericUtils {
@@ -72,36 +74,95 @@
    * @param mentions The numeric mentions previously extracted
    */
   def mkLabelsAndNorms(doc: Document, mentions: Seq[Mention]): (Seq[Seq[String]], Seq[Seq[String]]) = {
-    val allEntities = doc.sentences.map { sentence =>
-      sentence.entities.getOrElse(Seq.fill(sentence.size)("O"))
+    val pertinentMentions = mentions.collect {
+      case mention: Norm if NumericActions.isNumeric(mention) => mention
     }
-    val allNorms = doc.sentences.map { sentence =>
-      sentence.norms.getOrElse(Seq.fill(sentence.size)(""))
+    val mentionsBySentenceIndex = pertinentMentions.groupBy { mention => mention.sentence }
+    val zippedLabelsAndNorms = doc.sentences.zipWithIndex.map { case (sentence, index) =>
+      val mentions = mentionsBySentenceIndex.getOrElse(index, Seq.empty)
+
+      if (mentions.isEmpty) {
+        val entities = sentence.entities.getOrElse(WrappedArraySeq(Array.fill(sentence.size)("O")).toImmutableSeq)
+        val norms = sentence.norms.getOrElse(WrappedArraySeq(Array.fill(sentence.size)("")).toImmutableSeq)
+
+        (entities, norms)
+      }
+      else {
+        val mutableEntities = sentence.entities
+          .map { entities => Array(entities: _*) }
+          .getOrElse(Array.fill(sentence.size)("O"))
+        val mutableNorms = sentence.norms
+          .map { norms => Array(norms: _*) }
+          .getOrElse(Array.fill(sentence.size)(""))
+
+        mentions.foreach { mention =>
+          addLabelsAndNorms(mention.neLabel, mention.neNorm, mention.tokenInterval, mutableEntities, mutableNorms)
+        }
+        removeOneEntityBeforeAnother(mutableEntities, mutableNorms, "B-LOC", "MEASUREMENT-LENGTH")
+
+        val immutableEntities = WrappedArraySeq(mutableEntities).toImmutableSeq
+        val immutableNorms = WrappedArraySeq(mutableNorms).toImmutableSeq
+        (immutableEntities, immutableNorms)
+      }
     }
+    val unzippedLabelsAndNorms = zippedLabelsAndNorms.unzip
+
+    unzippedLabelsAndNorms
+  }
+
+  def removeOneEntityBeforeAnother(entities: mutable.Seq[String], norms: mutable.Seq[String], triggerEntity: String, toBeRemovedShortened: String): Unit = {
+    var triggered = false
 
-    for (mention <- mentions) {
-      if (NumericActions.isNumeric(mention) && mention.isInstanceOf[Norm]) {
-        val sentenceIndex = mention.sentence
-        val entities = allEntities(sentenceIndex)
-        val norms = allNorms(sentenceIndex)
+    entities.indices.reverse.foreach { index =>
+      val entity = entities(index)
 
-        addLabelsAndNorms(mention.asInstanceOf[Norm], entities, norms, mention.tokenInterval)
-        removeOneEntityBeforeAnother(entities, norms, "B-LOC", "MEASUREMENT-LENGTH")
+      if (entity == triggerEntity)
+        triggered = true
+      else {
+        if (triggered)
+          if (entity.endsWith(toBeRemovedShortened)) {
+            entities(index) = "O"
+            norms(index) = ""
+          }
+          else
+            triggered = false
       }
     }
-    (allEntities, allNorms)
+
+    // removes entities and norms for unallowable entity sequences, e.g., don't extract 'in' as 'inch' before B-LOC in '... Sahal 108 in Senegal'
+    // toBeRemovedShortened is entity without BIO-
+    val zippedEntities = entities.zipWithIndex
+
+    // So remove all consecutive MEASUREMENT-LENGTH in front of a B-LOC
+    // Can it just be done backwards in one pass in a state machine?
+
+    zippedEntities.foreach { case (outerEntity, outerIndex) =>
+      if (outerIndex > 0 && outerEntity == triggerEntity && entities(outerIndex - 1).endsWith(toBeRemovedShortened)) {
+        // Go in reverse replacing indices and norms in the immediate preceding mention.
+        breakable { // TODO: rewrite
+          for ((innerEntity, innerIndex) <- zippedEntities.slice(0, outerIndex).reverse) {
+            if (innerEntity.endsWith(toBeRemovedShortened)) {
+              entities(innerIndex) = "O"
+              norms(innerIndex) = ""
+            } else break()
+          }
+        }
+      }
+    }
   }
 
-  def removeOneEntityBeforeAnother(entities: Seq[String], norms: Seq[String], triggerEntity: String, toBeRemovedShortened: String): Unit = {
+  def removeOneEntityBeforeAnother2(entities: mutable.Seq[String], norms: mutable.Seq[String], triggerEntity: String, toBeRemovedShortened: String): Unit = {
     // removes entities and norms for unallowable entity sequences, e.g., don't extract 'in' as 'inch' before B-LOC in '... Sahal 108 in Senegal'
     // toBeRemovedShortened is entity without BIO-
     val zippedEntities = entities.zipWithIndex
 
+    // So remove all consecutive MEASUREMENT-LENGTH in front of a B-LOC
+    // Can it just be done backwards in one pass in a state machine?
+
     zippedEntities.foreach { case (outerEntity, outerIndex) =>
       if (outerIndex > 0 && outerEntity == triggerEntity && entities(outerIndex - 1).endsWith(toBeRemovedShortened)) {
         // Go in reverse replacing indices and norms in the immediate preceding mention.
-        zippedEntities.slice(0, outerIndex).reverse
         breakable { // TODO: rewrite
           for ((innerEntity, innerIndex) <- zippedEntities.slice(0, outerIndex).reverse) {
             if (innerEntity.endsWith(toBeRemovedShortened)) {
@@ -115,10 +176,7 @@
     }
   }
 
   // TODO: These need to be mutable
-  private def addLabelsAndNorms(m: Norm, entities: Seq[String], norms: Seq[String], tokenInt: Interval): Unit = {
-    val label = m.neLabel
-    val norm = m.neNorm
-
+  private def addLabelsAndNorms(label: String, norm: String, tokenInt: Interval, entities: mutable.Seq[String], norms: mutable.Seq[String]): Unit = {
     // careful here: we may override some existing entities and norms
     // but, given that the numeric entity rules tend to be high precision, this is probably Ok...
     tokenInt.headOption.foreach { index =>
diff --git a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala
index a00fcac83..e37f32109 100644
--- a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala
+++ b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala
@@ -27,33 +27,23 @@ object DocumentMaker {
   }
 
   /** Constructs a document of tokens from an array of untokenized sentences */
-  def mkDocumentFromSentences(tokenizer:Tokenizer,
-                              sentences:Iterable[String],
-                              keepText:Boolean,
-                              charactersBetweenSentences:Int): Document = {
-    val sents = new ArrayBuffer[Sentence]()
+  def mkDocumentFromSentences(
+    tokenizer: Tokenizer,
+    texts: Iterable[String],
+    keepText: Boolean,
+    charactersBetweenSentences: Int
+  ): Document = {
     var characterOffset = 0
-    for(text <- sentences) {
-      val sent = tokenizer.tokenize(text, sentenceSplit = false).head // we produce a single sentence here!
+    val sentences = texts.map { text =>
+      val sentence = tokenizer.tokenize(text, sentenceSplit = false, characterOffset).head // We produce a single sentence here!
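// Aside: an illustrative trace of the single backward pass in the reworked
// removeOneEntityBeforeAnother above (answering the "state machine" question).
// Scanning right to left, a B-LOC arms the trigger; each immediately preceding
// MEASUREMENT-LENGTH label is blanked until a non-matching label disarms it.
import org.clulab.numeric.NumericUtils
import scala.collection.mutable

val entities = mutable.Seq("O", "B-MEASUREMENT-LENGTH", "B-LOC", "O")
val norms = mutable.Seq("", "2.74", "", "")
NumericUtils.removeOneEntityBeforeAnother(entities, norms, "B-LOC", "MEASUREMENT-LENGTH")
// entities is now ("O", "O", "B-LOC", "O") and norms ("", "", "", "")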
- // update character offsets between sentences - for(i <- 0 until sent.size) { - sent.startOffsets(i) += characterOffset - sent.endOffsets(i) += characterOffset - } - - // move the character offset after the current sentence - characterOffset = sent.endOffsets.last + charactersBetweenSentences - - //println("SENTENCE: " + sent.words.mkString(", ")) - //println("Start offsets: " + sent.startOffsets.mkString(", ")) - //println("End offsets: " + sent.endOffsets.mkString(", ")) - sents += sent - } + characterOffset = sentence.endOffsets.last + charactersBetweenSentences + sentence + }.toVector // TODO: What is the best concrete collection to use? val textOpt = Option.when(keepText)(sentences.mkString(mkSep(charactersBetweenSentences))) - val doc = Document(sents.toArray, textOpt) + val document = Document(sentences, textOpt) - doc + document } /** Constructs a document of tokens from an array of tokenized sentences */ diff --git a/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala index 5e3001f86..9761eb6d2 100644 --- a/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala @@ -1,9 +1,7 @@ package org.clulab.processors.clu -import org.clulab.processors.Sentence - import java.util.regex.Pattern -import org.clulab.struct.Edge +import scala.collection.mutable object PostProcessor { // @@ -15,7 +13,7 @@ object PostProcessor { val WET_OR_DRY_SEASON = Pattern.compile("""(?i)[0-9]+(ds|ws)""") /** POS tag corrections, in place */ - def postprocessPartOfSpeechTags(words: Seq[String], tags: Seq[String]): Seq[String] = { + def postprocessPartOfSpeechTags2(words: Seq[String], tags: mutable.Seq[String]): Seq[String] = { // unigram patterns words.indices.foreach { index => @@ -45,4 +43,42 @@ object PostProcessor { tags } + /** POS tag corrections */ + def postprocessPartOfSpeechTags1(words: Seq[String], tags: Seq[String]): Seq[String] = { + val newTags = words.indices.map { index => + val word = words(index) + val oldTag = tags(index) + val newTag = { + // unigram patterns + if (VERSUS_PATTERN.matcher(word).matches) + "CC" // "versus" seems like a CC to me. but maybe not... 
+ else if (WET_OR_DRY_SEASON.matcher(word).matches) + "CD" // such years should be CDs because our grammars expect it + // bigram patterns + else if (word.equalsIgnoreCase("due")) { + if (words.lift(index + 1).map(_.toLowerCase).contains("to")) "IN" // "due" in "due to" must be a preposition + else oldTag + } + else if (word.equalsIgnoreCase("fall")) { + if (tags.lift(index + 1).contains("CD")) "NN" // "fall" followed by a CD must be NN + else oldTag + } + else oldTag + } + + newTag + } + + newTags + } + + def postprocessPartOfSpeechTags(words: Seq[String], tags: Seq[String]): Seq[String] = { + val result1 = postprocessPartOfSpeechTags1(words, tags) + val result2 = postprocessPartOfSpeechTags2(words, mutable.Seq(tags: _*)) + + if (result1 != result2) + println("It went awry!") + + result1 + } } diff --git a/library/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala b/library/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala index 2b9dd435e..8a4790246 100644 --- a/library/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala +++ b/library/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala @@ -4,6 +4,7 @@ import org.clulab.processors.Sentence import org.clulab.scala.WrappedArrayBuffer._ import java.io.{BufferedReader, InputStreamReader} +import scala.collection.compat._ import scala.collection.mutable.ArrayBuffer import scala.util.matching.Regex import scala.util.Using @@ -11,15 +12,17 @@ import scala.util.Using import SentenceSplitter._ trait SentenceSplitter { - def split(tokens:Array[RawToken], sentenceSplit:Boolean):Array[Sentence] + def split(tokens:Array[RawToken], sentenceSplit:Boolean, characterOffset: Int = 0):Seq[Sentence] } abstract class RuleBasedSentenceSplitter extends SentenceSplitter { /** * Sentence splitting over a stream of tokens - * This includes detection of abbreviations as well + * This includes detection of abbreviations as well. + * The characterOffset is included so that Sentences + * in a longer text need not be edited afterward. **/ - override def split(tokens:Array[RawToken], sentenceSplit:Boolean):Seq[Sentence] = { + override def split(tokens: Array[RawToken], sentenceSplit: Boolean, characterOffset: Int): Seq[Sentence] = { val sentences = new ArrayBuffer[Sentence]() var raw = new ArrayBuffer[String]() var words = new ArrayBuffer[String]() @@ -27,49 +30,46 @@ abstract class RuleBasedSentenceSplitter extends SentenceSplitter { var endPositions = new ArrayBuffer[Int]() for (i <- tokens.indices) { - val crt = tokens(i) - + val curr: RawToken = tokens(i) // next and previous tokens. We need these to detect proper ends of sentences - var next: Option[RawToken] = None - if (i < tokens.length - 1) next = Some(tokens(i + 1)) - var prev: Option[RawToken] = None - if (i > 0) prev = Some(tokens(i - 1)) + val nextOpt: Option[RawToken] = Option.when(i < tokens.length - 1)(tokens(i + 1)) + val prevOpt: Option[RawToken] = Option.when(i > 0)(tokens(i - 1)) // // we handle end-of-sentence markers (periods, etc.) here // this includes detecting if a period belongs to the previous token (if it's an abbreviation) // and understanding if this token actually marks the end of a sentence // - if (EOS.findFirstIn(crt.word).isDefined) { + if (EOS.findFirstIn(curr.word).isDefined) { // found a token that normally indicates end of sentence var isEos = sentenceSplit // period that probably belongs to an abbreviation and should not be marked as EOS - if (crt.word == "." 
&& prev.isDefined && isAbbreviation(prev.get.word) && crt.beginPosition == prev.get.endPosition) { + if (curr.word == "." && prevOpt.isDefined && isAbbreviation(prevOpt.get.word) && curr.beginPosition == prevOpt.get.endPosition) { // found a period that should be attached to the previous abbreviation - endPositions(endPositions.size - 1) = crt.endPosition - words(words.size - 1) = words.last + crt.word - raw(raw.size - 1) = raw.last + crt.raw + endPositions(endPositions.size - 1) = curr.endPosition + characterOffset + words(words.size - 1) = words.last + curr.word + raw(raw.size - 1) = raw.last + curr.raw // this is not an end of sentence if the next token does NOT look like the start of a sentence // TODO: maybe this should be handled with a binary classifier instead? - if (isEos && next.isDefined && !isSentStart(next.get.word)) { + if (isEos && nextOpt.isDefined && !isSentStart(nextOpt.get.word)) { isEos = false } } // regular end-of-sentence marker; treat it as a distinct token else { - raw += crt.raw - words += crt.word - beginPositions += crt.beginPosition - endPositions += crt.endPosition + raw += curr.raw + words += curr.word + beginPositions += curr.beginPosition + characterOffset + endPositions += curr.endPosition + characterOffset } // found a valid end of sentence; start an empty one if (isEos) { - sentences += Sentence(raw.toSeq, beginPositions.toSeq, endPositions.toSeq, words.toSeq) - raw = new ArrayBuffer[String]() + sentences += Sentence(raw, beginPositions, endPositions, words) + raw = new ArrayBuffer[String]() // TODO: Check whether clear() is sufficient. words = new ArrayBuffer[String]() beginPositions = new ArrayBuffer[Int]() endPositions = new ArrayBuffer[Int]() @@ -77,27 +77,27 @@ abstract class RuleBasedSentenceSplitter extends SentenceSplitter { } // found a period *inside* a token; sometimes this is an EOS - else if(EOS_FOLLOWEDBY_BULLET.findFirstIn(crt.raw).isDefined && - crt.raw.lastIndexOf('.') > 0 && - next.isDefined && isSentStart(next.get.word)) { + else if(EOS_FOLLOWEDBY_BULLET.findFirstIn(curr.raw).isDefined && + curr.raw.lastIndexOf('.') > 0 && + nextOpt.isDefined && isSentStart(nextOpt.get.word)) { //println(s"FOUND EOS INSIDE TOKEN: ${crt.raw}") // // create the last token from the token fragment before the period, and the period itself // - val dotRawPosition = crt.raw.lastIndexOf('.') + val dotRawPosition = curr.raw.lastIndexOf('.') assert(dotRawPosition > 0) - val dotWordPosition = crt.word.lastIndexOf('.') + val dotWordPosition = curr.word.lastIndexOf('.') assert(dotWordPosition > 0) - raw += crt.raw.substring(0, dotRawPosition) - words += crt.word.substring(0, dotWordPosition) - beginPositions += crt.beginPosition - endPositions += crt.beginPosition + dotRawPosition + raw += curr.raw.substring(0, dotRawPosition) + words += curr.word.substring(0, dotWordPosition) + beginPositions += curr.beginPosition + characterOffset + endPositions += curr.beginPosition + dotRawPosition + characterOffset // This is just for the period with length of 1.
- raw += crt.raw.substring(dotRawPosition, dotRawPosition + 1) - words += crt.word.substring(dotWordPosition, dotWordPosition + 1) + raw += curr.raw.substring(dotRawPosition, dotRawPosition + 1) + words += curr.word.substring(dotWordPosition, dotWordPosition + 1) beginPositions += endPositions.last endPositions += beginPositions.last + 1 val lastPosition = endPositions.last @@ -114,18 +114,18 @@ abstract class RuleBasedSentenceSplitter extends SentenceSplitter { // // add the part of the token after the period to the new sentence // - raw += crt.raw.substring(dotRawPosition + 1) - words += crt.word.substring(dotWordPosition + 1) + raw += curr.raw.substring(dotRawPosition + 1) + words += curr.word.substring(dotWordPosition + 1) beginPositions += lastPosition endPositions += lastPosition + raw.head.length } else { // just a regular token - raw += crt.raw - words += crt.word - beginPositions += crt.beginPosition - endPositions += crt.endPosition + raw += curr.raw + words += curr.word + beginPositions += curr.beginPosition + characterOffset + endPositions += curr.endPosition + characterOffset } } diff --git a/library/src/main/scala/org/clulab/processors/clu/tokenizer/Tokenizer.scala b/library/src/main/scala/org/clulab/processors/clu/tokenizer/Tokenizer.scala index 85c6a09bc..11fbff7fb 100644 --- a/library/src/main/scala/org/clulab/processors/clu/tokenizer/Tokenizer.scala +++ b/library/src/main/scala/org/clulab/processors/clu/tokenizer/Tokenizer.scala @@ -70,7 +70,7 @@ class Tokenizer( } /** Tokenization and sentence splitting */ - def tokenize(text: String, sentenceSplit: Boolean = true): Array[Sentence] = { + def tokenize(text: String, sentenceSplit: Boolean = true, characterOffset: Int = 0): Seq[Sentence] = { // raw tokenization, using the antlr grammar val rawTokens = readTokens(text) // now apply all the additional non-Antlr steps such as solving contractions, normalization, post-processing @@ -78,7 +78,7 @@ class Tokenizer( step.process(rawTokens) } // sentence splitting, including detection of abbreviations - val sentences = sentenceSplitter.split(stepTokens, sentenceSplit) + val sentences = sentenceSplitter.split(stepTokens, sentenceSplit, characterOffset) sentences } diff --git a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala index 3ad767ca0..231f57096 100644 --- a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala +++ b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala @@ -1,5 +1,7 @@ package org.clulab.sequences +import scala.collection.mutable + // This is definitely not the most efficient as far as number of objects // created, but there should be a NamedEntity thing to hold and not just // shadows of it projected onto the BIO notation in an array of strings. @@ -57,11 +59,18 @@ object NamedEntity { bioLabels } + // Only INSIDEs can be invalid and they are made valid by + // converting them into a BEGIN. 
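// Editor's note: a worked example of the intended repair (hypothetical labels). An I- label
// is valid only when the previous label is the same I- label or its matching B- label; any
// other I- is rewritten to a B-:
//
//   patch(Seq("I-PER", "I-PER", "O", "I-LOC")) // yields Seq("B-PER", "I-PER", "O", "B-LOC")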
+ def toBegin(bioLabel: String): String = BEGIN + bioLabel.drop(INSIDE.length) + + def isValid(bioLabels: Seq[String]): Boolean = bioLabels.indices.forall(isValid(bioLabels, _)) + def isValid(bioLabels: Seq[String], index: Int): Boolean = { val currBioLabel = bioLabels(index) !currBioLabel.startsWith(INSIDE) || { 0 < index && { val prevBioLabel = bioLabels(index - 1) + prevBioLabel == currBioLabel || { prevBioLabel == toBegin(currBioLabel) } @@ -69,20 +78,43 @@ } } - def isValid(bioLabels: Seq[String]): Boolean = - bioLabels.indices.forall(isValid(bioLabels, _)) + def isValid2(currBioLabel: String, prevBioLabelOpt: Option[String]): Boolean = { + !currBioLabel.startsWith(INSIDE) || prevBioLabelOpt.forall { prevBioLabel => + prevBioLabel == currBioLabel || prevBioLabel == toBegin(currBioLabel) + } + } - // Only INSIDEs can be invalid and they are made valid by - // converting them into a BEGIN. - def toBegin(bioLabel: String): String = - BEGIN + bioLabel.drop(INSIDE.length) // Note that this patches the array in place! - def patch(bioLabels: Seq[String]): Seq[String] = { + def patch2(bioLabels: mutable.Seq[String]): Seq[String] = { bioLabels.indices.foreach { index => if (!isValid(bioLabels, index)) bioLabels(index) = toBegin(bioLabels(index)) } bioLabels } + + def patch1(bioLabels: Seq[String]): Seq[String] = { + var prevBioLabelOpt = bioLabels.lift(-1) + val newBioLabels = bioLabels.indices.map { index => + val oldBioLabel = bioLabels(index) + val newBioLabel = + if (!isValid2(oldBioLabel, prevBioLabelOpt)) toBegin(oldBioLabel) + else oldBioLabel + + prevBioLabelOpt = Some(newBioLabel) + newBioLabel + } + + newBioLabels + } + + def patch(bioLabels: Seq[String]): Seq[String] = { + val result1 = patch1(bioLabels) + val result2 = patch2(mutable.Seq(bioLabels: _*)) + + if (result1 != result2) + println("This went awry!") + result1 + } } diff --git a/library/src/main/scala/org/clulab/sequences/SeparatedLexiconNER.scala b/library/src/main/scala/org/clulab/sequences/SeparatedLexiconNER.scala index ac3053997..852ba1d69 100644 --- a/library/src/main/scala/org/clulab/sequences/SeparatedLexiconNER.scala +++ b/library/src/main/scala/org/clulab/sequences/SeparatedLexiconNER.scala @@ -63,7 +63,7 @@ class SeparatedLexiconNER( * @param sentence The input sentence * @return An array of BIO notations that store the outcome of the matches */ - def find(sentence: Sentence): Array[String] = { + def find(sentence: Sentence): Seq[String] = { val seq = findLongestMatch(sentence) seq } diff --git a/library/src/main/scala/org/clulab/sequences/SequenceTagger.scala b/library/src/main/scala/org/clulab/sequences/SequenceTagger.scala index 6c902e89f..93fd32c5b 100644 --- a/library/src/main/scala/org/clulab/sequences/SequenceTagger.scala +++ b/library/src/main/scala/org/clulab/sequences/SequenceTagger.scala @@ -23,7 +23,7 @@ trait SequenceTagger[L, F] extends Tagger[L] { /** Abstract method that extracts the training labels for a given sentence */ def labelExtractor(sentence:Sentence): Array[L] - override def find(sentence: Sentence): Array[L] = classesOf(sentence) + override def find(sentence: Sentence): Seq[L] = classesOf(sentence) def save(fn:File): Unit From 741307c3cda458cd3bee3706e921763c152b535e Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Sat, 24 May 2025 20:43:26 -0700 Subject: [PATCH 14/42] Compile for other Scala versions --- .../scala/org/clulab/processors/Processor.scala | 2 +- .../scala/org/clulab/processors/Sentence.scala | 4 ++-- .../clulab/processors/clu/BalaurProcessor.scala |
17 ++++++++++------- .../clulab/processors/clu/DocumentMaker.scala | 2 +- .../clulab/processors/clu/PostProcessor.scala | 2 +- .../scala/org/clulab/processors/clu/Veil.scala | 5 +++-- .../org/clulab/sequences/NamedEntity.scala | 14 ++++++++------ 7 files changed, 26 insertions(+), 20 deletions(-) diff --git a/library/src/main/scala/org/clulab/processors/Processor.scala b/library/src/main/scala/org/clulab/processors/Processor.scala index e3df1e506..b7cab3423 100644 --- a/library/src/main/scala/org/clulab/processors/Processor.scala +++ b/library/src/main/scala/org/clulab/processors/Processor.scala @@ -44,7 +44,7 @@ trait Processor { } } - val combinedSentences = documents.flatMap(_.sentences).toArray + val combinedSentences = documents.flatMap(_.sentences) val combinedDocument = new Document( sentences = combinedSentences, id = headId, diff --git a/library/src/main/scala/org/clulab/processors/Sentence.scala b/library/src/main/scala/org/clulab/processors/Sentence.scala index c5d74a2a0..c7b5a6f20 100644 --- a/library/src/main/scala/org/clulab/processors/Sentence.scala +++ b/library/src/main/scala/org/clulab/processors/Sentence.scala @@ -192,8 +192,8 @@ class Sentence( def offset(offset: Int): Sentence = { if (offset == 0) this else { - val newStartOffsets = startOffsets.map(_ + offset).toArray - val newEndOffsets = endOffsets.map(_ + offset).toArray + val newStartOffsets = startOffsets.map(_ + offset) + val newEndOffsets = endOffsets.map(_ + offset) copy(startOffsets = newStartOffsets, endOffsets = newEndOffsets) } diff --git a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala index df968610b..c3e617e51 100644 --- a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala @@ -190,7 +190,7 @@ class BalaurProcessor protected ( entities = Some(newLabels(index)), norms = Some(newNorms(index)) ) - }.toArray + } partlyAnnotatedDocument.copy(sentences = fullyAnnotatedSentences) } @@ -199,10 +199,10 @@ class BalaurProcessor protected ( fullyAnnotatedDocument } - private def mkPosTags(words: Seq[String], labels: Seq[Array[(String, Float)]]): Seq[String] = { + private def mkPosTags(words: Seq[String], labels: Array[Array[(String, Float)]]): Seq[String] = { assert(labels.length == words.length) - val tags = labels.map(_.head._1).toArray + val tags = WrappedArraySeq(labels.map(_.head._1)).toImmutableSeq PostProcessor.postprocessPartOfSpeechTags(words, tags) tags @@ -237,7 +237,8 @@ class BalaurProcessor protected ( private def mkNamedEntityLabels(words: Seq[String], labels: Array[Array[(String, Float)]], optionalNERLabels: Option[Seq[String]]): Seq[String] = { assert(labels.length == words.length) - val genericLabels = NamedEntity.patch(labels.map(_.head._1)) + val labelsSeq = WrappedArraySeq(labels.map(_.head._1)).toImmutableSeq + val genericLabels = NamedEntity.patch(labelsSeq) if (optionalNERLabels.isEmpty) { genericLabels @@ -256,11 +257,12 @@ class BalaurProcessor protected ( require(generic.length == custom.length) val customNamedEntities = NamedEntity.collect(custom) - val result = generic.toArray // A copy of the generic labels is created here. if (customNamedEntities.isEmpty) - result + generic else { + // TODO: kwa work on combine + val result = generic.toArray // A copy of the generic labels is created here. 
val genericNamedEntities = NamedEntity.collect(generic) //println(s"Generic NamedEntity: ${genericNamedEntities.mkString(", ")}") @@ -268,13 +270,14 @@ class BalaurProcessor protected ( // The custom labels override the generic ones! NamedEntity.combine(result, genericNamedEntities, customNamedEntities) + WrappedArraySeq(result).toImmutableSeq } } private def mkChunkLabels(words: Seq[String], labels: Array[Array[(String, Float)]]): Seq[String] = { assert(labels.length == words.length) - labels.map(_.head._1) + WrappedArraySeq(labels.map(_.head._1)).toImmutableSeq } // The head has one score, the label has another. Here the two scores are interpolated diff --git a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala index e37f32109..524f73758 100644 --- a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala +++ b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala @@ -73,7 +73,7 @@ object DocumentMaker { } val textOpt = Option.when(keepText)(text.toString) - val doc = Document(sents.toArray, textOpt) + val doc = Document(sents, textOpt) doc } diff --git a/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala index 9761eb6d2..837e5c49c 100644 --- a/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala @@ -13,7 +13,7 @@ object PostProcessor { val WET_OR_DRY_SEASON = Pattern.compile("""(?i)[0-9]+(ds|ws)""") /** POS tag corrections, in place */ - def postprocessPartOfSpeechTags2(words: Seq[String], tags: mutable.Seq[String]): Seq[String] = { + def postprocessPartOfSpeechTags2(words: Seq[String], tags: mutable.Seq[String]): mutable.Seq[String] = { // unigram patterns words.indices.foreach { index => diff --git a/library/src/main/scala/org/clulab/processors/clu/Veil.scala b/library/src/main/scala/org/clulab/processors/clu/Veil.scala index aca16ee42..aac0bc99f 100644 --- a/library/src/main/scala/org/clulab/processors/clu/Veil.scala +++ b/library/src/main/scala/org/clulab/processors/clu/Veil.scala @@ -3,6 +3,7 @@ package org.clulab.processors.clu import org.clulab.processors.{Document, Processor, Sentence} import org.clulab.struct.{DirectedGraph, Edge, GraphMap, RelationTriple, Tree} import org.clulab.struct.GraphMap.GraphMapType +import org.clulab.utils.WrappedArraySeq import scala.collection.mutable.{Set => MutableSet} @@ -109,7 +110,7 @@ class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) } protected lazy val veiledDocument = { val veiledSentences = originalDocument.sentences.zipWithIndex.map { case (originalSentence, sentenceIndex) => - val wordIndexes = originalSentence.words.indices.filterNot(veilSets(sentenceIndex)).toArray + val wordIndexes = originalSentence.words.indices.filterNot(veilSets(sentenceIndex)) val veiledRaw = wordIndexes.map(originalSentence.raw) val veiledStartOffsets = wordIndexes.map(originalSentence.startOffsets) val veiledEndOffsets = wordIndexes.map(originalSentence.endOffsets) @@ -132,7 +133,7 @@ class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) veiledArray.zipWithIndex.foreach { case (veiledString, veiledIndex) => unveiledArray(unveilArray(veiledIndex)) = veiledString } - unveiledArray + WrappedArraySeq(unveiledArray).toImmutableSeq } } diff --git a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala 
b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala index 231f57096..8104ea3f4 100644 --- a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala +++ b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala @@ -63,9 +63,11 @@ object NamedEntity { // converting them into a BEGIN. def toBegin(bioLabel: String): String = BEGIN + bioLabel.drop(INSIDE.length) - def isValid(bioLabels: Seq[String]): Boolean = bioLabels.indices.forall(isValid(bioLabels, _)) + def isValid(bioLabels: Seq[String]): Boolean = bioLabels.indices.forall { index => + isValid1(bioLabels(index), bioLabels.lift(index - 1)) + } - def isValid(bioLabels: Seq[String], index: Int): Boolean = { + def isValid2(bioLabels: mutable.Seq[String], index: Int): Boolean = { val currBioLabel = bioLabels(index) !currBioLabel.startsWith(INSIDE) || { 0 < index && { @@ -78,7 +80,7 @@ object NamedEntity { } } - def isValid2(currBioLabel: String, prevBioLabelOpt: Option[String]): Boolean = { + def isValid1(currBioLabel: String, prevBioLabelOpt: Option[String]): Boolean = { !currBioLabel.startsWith(INSIDE) || prevBioLabelOpt.forall { prevBioLabel => prevBioLabel == currBioLabel || prevBioLabel == toBegin(currBioLabel) } @@ -86,9 +88,9 @@ object NamedEntity { // Note that this patches the array in place! - def patch2(bioLabels: mutable.Seq[String]): Seq[String] = { + def patch2(bioLabels: mutable.Seq[String]): mutable.Seq[String] = { bioLabels.indices.foreach { index => - if (!isValid(bioLabels, index)) + if (!isValid2(bioLabels, index)) bioLabels(index) = toBegin(bioLabels(index)) } bioLabels @@ -99,7 +101,7 @@ object NamedEntity { val newBioLabels = bioLabels.indices.map { index => val oldBioLabel = bioLabels(index) val newBioLabel = - if (!isValid2(oldBioLabel, prevBioLabelOpt)) toBegin(oldBioLabel) + if (!isValid1(oldBioLabel, prevBioLabelOpt)) toBegin(oldBioLabel) else oldBioLabel prevBioLabelOpt = Some(newBioLabel) From 38369e3f2df3062f4b6c7c34d5bc55bff3961808 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Sat, 24 May 2025 21:02:30 -0700 Subject: [PATCH 15/42] Compile other projects for other Scalas --- .../apps/ProcessorsJavaExample.java | 37 ++++++++++--------- .../processors/apps/ColumnsToDocument.scala | 9 +++-- .../processors/apps/ProcessCoNLL03.scala | 2 +- .../apps/ProcessorsScalaExample.scala | 2 +- .../sentence/HtmlSentenceVisualizer.scala | 4 +- .../TestNumericEntityRecognition.scala | 6 +-- .../clulab/numeric/TestSeasonNormalizer.scala | 2 +- .../org/clulab/processors/TestTokenizer.scala | 2 +- 8 files changed, 33 insertions(+), 31 deletions(-) diff --git a/apps/src/main/java/org/clulab/processors/apps/ProcessorsJavaExample.java b/apps/src/main/java/org/clulab/processors/apps/ProcessorsJavaExample.java index ecd6005a4..9334be6c5 100644 --- a/apps/src/main/java/org/clulab/processors/apps/ProcessorsJavaExample.java +++ b/apps/src/main/java/org/clulab/processors/apps/ProcessorsJavaExample.java @@ -8,6 +8,7 @@ import org.clulab.utils.JavaUtils; import java.util.Iterator; +import scala.collection.Seq; public class ProcessorsJavaExample { public static void main(String [] args) throws Exception { @@ -20,25 +21,25 @@ public static void main(String [] args) throws Exception { // You are basically done. The rest of this code simply prints out the annotations. // Let's print the sentence-level annotations. 
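// Editor's note (hedged): because Document.sentences() now returns a Scala Seq rather than
// a Java array, the Java caller below switches from .length and [i] to the Seq methods
// .length() and .apply(i), e.g.:
//
//   scala.collection.Seq<Sentence> sentences = doc.sentences();
//   for (int i = 0; i < sentences.length(); i++)
//       System.out.println(sentences.apply(i).words());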
- for (int sentenceIndex = 0; sentenceIndex < doc.sentences().length; sentenceIndex++) { - Sentence sentence = doc.sentences()[sentenceIndex]; + for (int sentenceIndex = 0; sentenceIndex < doc.sentences().length(); sentenceIndex++) { + Sentence sentence = doc.sentences().apply(sentenceIndex); System.out.println("Sentence #" + sentenceIndex + ":"); - System.out.println("Tokens: " + mkString(sentence.words())); - System.out.println("Start character offsets: " + mkString(sentence.startOffsets())); - System.out.println("End character offsets: " + mkString(sentence.endOffsets())); + System.out.println("Tokens: " + mkStringStr(sentence.words())); + System.out.println("Start character offsets: " + mkStringInt(sentence.startOffsets())); + System.out.println("End character offsets: " + mkStringInt(sentence.endOffsets())); // These annotations are optional, so they are stored using Option objects, // hence the isDefined() and get() calls. if (sentence.lemmas().isDefined()) - System.out.println("Lemmas: " + mkString(sentence.lemmas().get())); + System.out.println("Lemmas: " + mkStringStr(sentence.lemmas().get())); if (sentence.tags().isDefined()) - System.out.println("POS tags: " + mkString(sentence.tags().get())); + System.out.println("POS tags: " + mkStringStr(sentence.tags().get())); if (sentence.chunks().isDefined()) - System.out.println("Chunks: " + mkString(sentence.chunks().get())); + System.out.println("Chunks: " + mkStringStr(sentence.chunks().get())); if (sentence.entities().isDefined()) - System.out.println("Named entities: " + mkString(sentence.entities().get())); + System.out.println("Named entities: " + mkStringStr(sentence.entities().get())); if (sentence.norms().isDefined()) - System.out.println("Normalized entities: " + mkString(sentence.norms().get())); + System.out.println("Normalized entities: " + mkStringStr(sentence.norms().get())); if (sentence.dependencies().isDefined()) { System.out.println("Syntactic dependencies:"); Iterator> iterator = @@ -53,27 +54,27 @@ public static void main(String [] args) throws Exception { } } - public static String mkString(String[] strings, String sep) { + public static String mkStringStr(Seq strings, String sep) { StringBuilder stringBuilder = new StringBuilder(); - for (int i = 0; i < strings.length; i ++) { + for (int i = 0; i < strings.length(); i ++) { if (i > 0) stringBuilder.append(sep); - stringBuilder.append(strings[i]); + stringBuilder.append(strings.apply(i)); } return stringBuilder.toString(); } - public static String mkString(String[] strings) { return mkString(strings, " "); } + public static String mkStringStr(Seq strings) { return mkStringStr(strings, " "); } - public static String mkString(int[] ints, String sep) { + public static String mkStringInt(Seq ints, String sep) { StringBuilder stringBuilder = new StringBuilder(); - for (int i = 0; i < ints.length; i ++) { + for (int i = 0; i < ints.length(); i ++) { if (i > 0) stringBuilder.append(sep); - stringBuilder.append(ints[i]); + stringBuilder.append(ints.apply(i)); } return stringBuilder.toString(); } - public static String mkString(int[] ints) { return mkString(ints, " "); } + public static String mkStringInt(Seq ints) { return mkStringInt(ints, " "); } public static Iterable iteratorToIterable(Iterator iterator) { return () -> iterator; } } diff --git a/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala b/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala index 8822ba993..e38b14615 100644 --- 
a/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala @@ -2,6 +2,7 @@ package org.clulab.processors.apps import org.clulab.processors.{Document, Processor, Sentence} import org.clulab.processors.clu.BalaurProcessor +import org.clulab.scala.WrappedArrayBuffer._ import org.slf4j.{Logger, LoggerFactory} import java.io.InputStream @@ -102,7 +103,7 @@ object ColumnsToDocument { if (l.isEmpty) { // end of sentence if (words.nonEmpty) { - val s = new Sentence(words.toArray, startOffsets.toArray, endOffsets.toArray, words.toArray) + val s = new Sentence(words, startOffsets, endOffsets, words) setLabels(s, labels.toArray) sentences += s words = new ArrayBuffer[String]() @@ -139,14 +140,14 @@ object ColumnsToDocument { } if(words.nonEmpty) { val s = new Sentence( - words.toArray, startOffsets.toArray, endOffsets.toArray, words.toArray, - tags = Some(labels.toArray) + words, startOffsets, endOffsets, words, + tags = Some(labels) ) sentences += s } logger.debug(s"Loaded ${sentences.size} sentences.") - val d = new Document(sentences.toArray) + val d = new Document(sentences) annotate(d) d diff --git a/apps/src/main/scala/org/clulab/processors/apps/ProcessCoNLL03.scala b/apps/src/main/scala/org/clulab/processors/apps/ProcessCoNLL03.scala index 97a990764..b92b75129 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/ProcessCoNLL03.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ProcessCoNLL03.scala @@ -30,7 +30,7 @@ object ProcessCoNLL03 extends App { } } - def saveSent(pw: PrintWriter, sent: Array[Row], tags: Option[Array[String]] = None, chunks: Option[Array[String]] = None): Unit = { + def saveSent(pw: PrintWriter, sent: Array[Row], tags: Option[Seq[String]] = None, chunks: Option[Seq[String]] = None): Unit = { if (tags.isDefined) { assert(sent.length == tags.get.length) //println("Using generated POS tags") diff --git a/apps/src/main/scala/org/clulab/processors/apps/ProcessorsScalaExample.scala b/apps/src/main/scala/org/clulab/processors/apps/ProcessorsScalaExample.scala index 8f8dc65e1..fb203652f 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/ProcessorsScalaExample.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ProcessorsScalaExample.scala @@ -34,5 +34,5 @@ object ProcessorsScalaExample extends App { println() } - def mkString[T](elems: Array[T]): String = elems.mkString(" ") + def mkString[T](elems: Seq[T]): String = elems.mkString(" ") } diff --git a/debugger/src/main/scala/org/clulab/odin/debugger/visualizer/sentence/HtmlSentenceVisualizer.scala b/debugger/src/main/scala/org/clulab/odin/debugger/visualizer/sentence/HtmlSentenceVisualizer.scala index 4a8866a2e..ff7a632aa 100644 --- a/debugger/src/main/scala/org/clulab/odin/debugger/visualizer/sentence/HtmlSentenceVisualizer.scala +++ b/debugger/src/main/scala/org/clulab/odin/debugger/visualizer/sentence/HtmlSentenceVisualizer.scala @@ -18,8 +18,8 @@ class HtmlSentenceVisualizer extends SentenceVisualizer with HtmlVisualizing { string } - def getOrEmpty(arrayOpt: Option[Array[String]], index: Int): String = - arrayOpt.map(_(index)).getOrElse("") + def getOrEmpty(seqOpt: Option[Seq[String]], index: Int): String = + seqOpt.map(_(index)).getOrElse("") val rows = sentence.words.indices.map { i => tr( diff --git a/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala b/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala index 4bd2bd2ea..e476f1d43 100644 --- 
a/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala +++ b/library/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala @@ -15,7 +15,7 @@ class TestNumericEntityRecognition extends Test { class HabitusTokenizer(tokenizer: Tokenizer) extends Tokenizer(tokenizer.lexer, tokenizer.steps, tokenizer.sentenceSplitter) { // TODO: Make sure en dash is preserved in raw somehow! - override def tokenize(text: String, sentenceSplit: Boolean = true): Array[Sentence] = { + override def tokenize(text: String, sentenceSplit: Boolean = true, characterOffset: Int): Seq[Sentence] = { // Cheat and swap out some en dashes if necessary. val habitusText = if (text.contains(HabitusTokenizer.endash)) @@ -23,7 +23,7 @@ class TestNumericEntityRecognition extends Test { else text - tokenizer.tokenize(habitusText, sentenceSplit) + tokenizer.tokenize(habitusText, sentenceSplit, characterOffset) } } @@ -653,7 +653,7 @@ class TestNumericEntityRecognition extends Test { } /** Runs the actual numeric entity recognizer */ - def numericParse(sentence: String): (Array[String], Array[String], Array[String]) = { + def numericParse(sentence: String): (Seq[String], Seq[String], Seq[String]) = { val doc = proc.annotate(sentence) val mentions = ner.extractFrom(doc) NumericUtils.mkLabelsAndNorms(doc, mentions) diff --git a/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala b/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala index 8f8fa38ff..93db9fa4d 100644 --- a/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala +++ b/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala @@ -13,7 +13,7 @@ class TestSeasonNormalizer extends Test { val fallDateRange = "2017-09-22 -- 2017-12-21" val seasonDateRange = "2017-06-XX -- 2017-10-XX" - def mkEntitiesAndNorms(processor: BalaurProcessor, text: String): (Array[String], Array[String]) = { + def mkEntitiesAndNorms(processor: BalaurProcessor, text: String): (Seq[String], Seq[String]) = { val document = processor.annotate(text) val mentions = processor.numericEntityRecognizerOpt.get.extractFrom(document) diff --git a/library/src/test/scala/org/clulab/processors/TestTokenizer.scala b/library/src/test/scala/org/clulab/processors/TestTokenizer.scala index c4d67af56..afd2b594d 100644 --- a/library/src/test/scala/org/clulab/processors/TestTokenizer.scala +++ b/library/src/test/scala/org/clulab/processors/TestTokenizer.scala @@ -223,7 +223,7 @@ class TestTokenizer extends Test { } } - def tok(s:String):Array[Sentence] = { + def tok(s: String): Seq[Sentence] = { println(s"Tokenizing text: $s") val t = new OpenDomainEnglishTokenizer(None) val sents = t.tokenize(s) From deb244b170e3eb3bbe9c9d17bacab8d96dcb1f06 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Sat, 24 May 2025 23:28:36 -0700 Subject: [PATCH 16/42] Compile tests --- .../org/clulab/odin/TestNumericPatterns.scala | 10 +++++----- .../scala/org/clulab/odin/TestTokenPattern.scala | 16 ++++++++-------- .../clulab/struct/TestDocumentAttachment.scala | 8 ++++---- .../scala/org/clulab/utils/TestFindHeads.scala | 8 ++++---- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/library/src/test/scala/org/clulab/odin/TestNumericPatterns.scala b/library/src/test/scala/org/clulab/odin/TestNumericPatterns.scala index 297346984..b3b477e18 100644 --- a/library/src/test/scala/org/clulab/odin/TestNumericPatterns.scala +++ b/library/src/test/scala/org/clulab/odin/TestNumericPatterns.scala @@ -9,12 +9,12 @@ class TestNumericPatterns extends 
Test { val text = "blah" val doc = Document( - Array( + Seq( Sentence( - Array("blah"), - Array(0), - Array(4), - Array("blah") + Seq("blah"), + Seq(0), + Seq(4), + Seq("blah") ) ) ) diff --git a/library/src/test/scala/org/clulab/odin/TestTokenPattern.scala b/library/src/test/scala/org/clulab/odin/TestTokenPattern.scala index 10738826d..3300b791e 100644 --- a/library/src/test/scala/org/clulab/odin/TestTokenPattern.scala +++ b/library/src/test/scala/org/clulab/odin/TestTokenPattern.scala @@ -61,13 +61,13 @@ class TestTokenPattern extends Test { } val text4 = "a b c d e f g h i c" - val tokens = text4.split(" ") + val tokens = text4.split(" ").toSeq val doc = Document( - Array( + Seq( Sentence( tokens, - Array(0, 2, 4, 6, 8, 10, 12, 14, 16, 18), - Array(1, 3, 5, 7, 9, 11, 13, 15, 17, 19), + Seq(0, 2, 4, 6, 8, 10, 12, 14, 16, 18), + Seq(1, 3, 5, 7, 9, 11, 13, 15, 17, 19), tokens ) ) @@ -614,11 +614,11 @@ class TestTokenPattern extends Test { val text8 = "x a a b a b a b a b c d" val doc8 = Document( - Array( + Seq( Sentence( - text8.split(" "), - Array(0, 2, 4, 6, 8, 10, 12, 14, 16, 18), - Array(1, 3, 5, 7, 9, 11, 13, 15, 17, 19) + text8.split(" ").toSeq, + Seq(0, 2, 4, 6, 8, 10, 12, 14, 16, 18), + Seq(1, 3, 5, 7, 9, 11, 13, 15, 17, 19) ) ) ) diff --git a/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala b/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala index b34393b2b..8bf2c792c 100644 --- a/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala +++ b/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala @@ -130,7 +130,7 @@ class TestDocumentAttachment extends Test { (LAST_KEY, new TextNameDocumentAttachment(LAST_NAME)), (ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME)) ) - val oldDocument = new Document(sentences = Array.empty[Sentence], attachments = Some(oldAttachments)) + val oldDocument = new Document(sentences = Seq.empty[Sentence], attachments = Some(oldAttachments)) val documentSerializer = new DocumentSerializer() val documentString = documentSerializer.save(oldDocument) @@ -153,7 +153,7 @@ class TestDocumentAttachment extends Test { (LAST_KEY, new ObjectNameDocumentAttachment(LAST_NAME)), (ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME)) ) - val oldDocument = new Document(sentences = Array.empty[Sentence], attachments = Some(oldAttachments)) + val oldDocument = new Document(sentences = Seq.empty[Sentence], attachments = Some(oldAttachments)) val documentSerializer = new DocumentSerializer() // This should be a messy string. @@ -177,7 +177,7 @@ class TestDocumentAttachment extends Test { (LAST_KEY, new TextNameDocumentAttachment(LAST_NAME)), (ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME)) ) - val oldDocument = new Document(sentences = Array.empty[Sentence], attachments = Some(oldAttachments)) + val oldDocument = new Document(sentences = Seq.empty[Sentence], attachments = Some(oldAttachments)) // This shouldn't compile. /*oldDocument.addAttachment("wrong", new NameMethodAttachment("name"))*/ @@ -203,7 +203,7 @@ class TestDocumentAttachment extends Test { (LAST_KEY, new ObjectNameDocumentAttachment(LAST_NAME)), (ALIAS_KEY, new NameDocumentAttachment(ALIAS_NAME)) ) - val oldDocument = new Document(Array.empty[Sentence], attachments = Some(oldAttachments)) + val oldDocument = new Document(Seq.empty[Sentence], attachments = Some(oldAttachments)) // This should be a messy string. 
val documentString = prettyJson(renderJValue(oldDocument.jsonAST)) diff --git a/library/src/test/scala/org/clulab/utils/TestFindHeads.scala b/library/src/test/scala/org/clulab/utils/TestFindHeads.scala index 13e36fb85..4fd3fdfe4 100644 --- a/library/src/test/scala/org/clulab/utils/TestFindHeads.scala +++ b/library/src/test/scala/org/clulab/utils/TestFindHeads.scala @@ -6,9 +6,9 @@ import org.clulab.struct.{DirectedGraph, Edge, Interval} class TestFindHeads extends Test { - def newSentence(words: Array[String], directedGraph: DirectedGraph[String]): Sentence = { - val startOffsets = Array(0) // unused - val endOffsets = Array(0) // unused + def newSentence(words: Seq[String], directedGraph: DirectedGraph[String]): Sentence = { + val startOffsets = Seq(0) // unused + val endOffsets = Seq(0) // unused val sentence = new Sentence( words, startOffsets, endOffsets, words, tags = Some(words) @@ -117,7 +117,7 @@ class TestFindHeads extends Test { val len: Int = 78 val directedGraph = DirectedGraph(edges) val tokenInterval = Interval(0, len) - val words = 1.to(len).map { index => s"word$index" }.toArray + val words = 1.to(len).map { index => s"word$index" } val sentence = newSentence(words, directedGraph) val heads = DependencyUtils.findHeadsStrict(tokenInterval, sentence) From e9876cf55a987c17d27e5106d859bdcfb78f8ab8 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Sun, 25 May 2025 00:42:24 -0700 Subject: [PATCH 17/42] Pass tests --- .../org/clulab/processors/clu/BalaurProcessor.scala | 4 ++-- .../org/clulab/processors/clu/DocumentMaker.scala | 10 ++++++---- .../main/scala/org/clulab/sequences/NamedEntity.scala | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala index c3e617e51..39c39fbae 100644 --- a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala @@ -203,9 +203,9 @@ class BalaurProcessor protected ( assert(labels.length == words.length) val tags = WrappedArraySeq(labels.map(_.head._1)).toImmutableSeq + val result = PostProcessor.postprocessPartOfSpeechTags(words, tags) - PostProcessor.postprocessPartOfSpeechTags(words, tags) - tags + result } private def mkOptionalNerLabels( diff --git a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala index 524f73758..2e228c6f3 100644 --- a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala +++ b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala @@ -4,6 +4,7 @@ import org.clulab.processors.Document import org.clulab.processors.Sentence import org.clulab.processors.clu.tokenizer.Tokenizer import org.clulab.scala.WrappedArrayBuffer._ +import org.clulab.utils.WrappedArraySeq import org.slf4j.Logger import org.slf4j.LoggerFactory @@ -34,13 +35,14 @@ object DocumentMaker { charactersBetweenSentences: Int ): Document = { var characterOffset = 0 - val sentences = texts.map { text => + val sentencesArray = texts.map { text => val sentence = tokenizer.tokenize(text, sentenceSplit = false, characterOffset).head // We produce a single sentence here! characterOffset = sentence.endOffsets.last + charactersBetweenSentences sentence - }.toVector // TODO: What is the best concrete collection to use? 
- val textOpt = Option.when(keepText)(sentences.mkString(mkSep(charactersBetweenSentences))) + }.toArray + val sentences = WrappedArraySeq(sentencesArray).toImmutableSeq + val textOpt = Option.when(keepText)(texts.mkString(mkSep(charactersBetweenSentences))) val document = Document(sentences, textOpt) document @@ -52,7 +54,7 @@ object DocumentMaker { charactersBetweenSentences:Int, charactersBetweenTokens:Int): Document = { var charOffset = 0 - var sents = new ArrayBuffer[Sentence]() + val sents = new ArrayBuffer[Sentence]() val text = new StringBuilder for(sentence <- sentences) { val startOffsets = new ArrayBuffer[Int]() diff --git a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala index 8104ea3f4..f20f6f91e 100644 --- a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala +++ b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala @@ -81,7 +81,7 @@ object NamedEntity { } def isValid1(currBioLabel: String, prevBioLabelOpt: Option[String]): Boolean = { - !currBioLabel.startsWith(INSIDE) || prevBioLabelOpt.forall { prevBioLabel => + !currBioLabel.startsWith(INSIDE) || prevBioLabelOpt.exists { prevBioLabel => prevBioLabel == currBioLabel || prevBioLabel == toBegin(currBioLabel) } } From bec8f1862d532f6b278c5af004c34fdc56a6f99f Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Mon, 26 May 2025 09:05:08 -0700 Subject: [PATCH 18/42] Clean, get webapp to work --- build.sbt | 12 ++--- .../processors/clu/BalaurProcessor.scala | 47 +++++++++---------- .../webapp/serialization/ParseObj.scala | 6 +-- 3 files changed, 31 insertions(+), 34 deletions(-) diff --git a/build.sbt b/build.sbt index 0d5ffcc14..7a7df39bf 100644 --- a/build.sbt +++ b/build.sbt @@ -34,17 +34,17 @@ lazy val library = project lazy val apps = project .dependsOn(library % "compile -> compile; test -> test") -// lazy val webapp = project - // .enablePlugins(PlayScala) - // .dependsOn(library % "compile -> compile; test -> test") - // .settings( + lazy val webapp = project + .enablePlugins(PlayScala) + .dependsOn(library % "compile -> compile; test -> test") + .settings( // scala3 doesn't have play (for 2.8.19 as specified by the project) and is ruled out completely. // scala213 has version problems for com.fasterxml.jackson.databind.JsonMappingException. // scala212 works! // scala211 isn't compiling and complains on twirlCompileTemplates. // This isn't a library. Only one version needs to work. We shouldn't use play for this anyway. - // crossScalaVersions := Seq(scala212) - // ) + crossScalaVersions := Seq(scala212) + ) lazy val debugger = project .dependsOn(library % "compile -> compile; test -> test") diff --git a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala index 39c39fbae..34b616395 100644 --- a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala @@ -90,11 +90,10 @@ class BalaurProcessor protected ( throw new RuntimeException("ERROR: cannot call this method on its own in this processor!") } - /** Lemmatization; modifies the document in place */ override def lemmatize(words: Seq[String]): Seq[String] = { val lemmas = words.zipWithIndex.map { case (word, index) => val lemma = wordLemmatizer.lemmatizeWord(word) - // a lemma may be empty in some weird Unicode situations + // A lemma may be empty in some weird Unicode situations. 
val nonEmptyLemma = if (lemma.isEmpty) { logger.debug(s"""WARNING: Found empty lemma for word #$index "$word" in sentence: ${words.mkString(" ")}""") word.toLowerCase() } else lemma @@ -163,11 +162,11 @@ class BalaurProcessor protected ( allLabelsAndScores(TASK_TO_INDEX(HEXA_NONTERM_TASK)) ) // Entities and norms need to still be patched and filled in, so this is only a partly annotated sentence. - val partlyAnnotatedDocument = sentence.copy( + val partlyAnnotatedSentence = sentence.copy( tags = Some(tags), lemmas = Some(lemmas), entities = Some(entities), chunks = Some(chunks), graphs = graphs ) - partlyAnnotatedDocument + partlyAnnotatedSentence } catch { // No values, not even lemmas, will be included in the annotation if there was an exception. @@ -181,20 +180,18 @@ } } val partlyAnnotatedDocument = document.copy(sentences = partlyAnnotatedSentences) - val fullyAnnotatedDocument = - if (numericEntityRecognizerOpt.nonEmpty) { - val numericMentions = numericEntityRecognizerOpt.get.extractFrom(partlyAnnotatedDocument) - val (newLabels, newNorms) = NumericUtils.mkLabelsAndNorms(partlyAnnotatedDocument, numericMentions) - val fullyAnnotatedSentences = partlyAnnotatedDocument.sentences.indices.map { index => - partlyAnnotatedDocument.sentences(index).copy( - entities = Some(newLabels(index)), - norms = Some(newNorms(index)) - ) - } + val fullyAnnotatedDocument = numericEntityRecognizerOpt.map { numericEntityRecognizer => + val numericMentions = numericEntityRecognizer.extractFrom(partlyAnnotatedDocument) + val (newLabels, newNorms) = NumericUtils.mkLabelsAndNorms(partlyAnnotatedDocument, numericMentions) + val fullyAnnotatedSentences = partlyAnnotatedDocument.sentences.indices.map { index => + partlyAnnotatedDocument.sentences(index).copy( + entities = Some(newLabels(index)), + norms = Some(newNorms(index)) + ) + } - partlyAnnotatedDocument.copy(sentences = fullyAnnotatedSentences) - } - else partlyAnnotatedDocument + partlyAnnotatedDocument.copy(sentences = fullyAnnotatedSentences) + }.getOrElse(partlyAnnotatedDocument) fullyAnnotatedDocument } @@ -239,18 +236,18 @@ val labelsSeq = WrappedArraySeq(labels.map(_.head._1)).toImmutableSeq val genericLabels = NamedEntity.patch(labelsSeq) - - if (optionalNERLabels.isEmpty) { - genericLabels - } - else { + val specificLabels = optionalNERLabels.map { nerLabels => //println(s"MERGING NE labels for sentence: ${sent.words.mkString(" ")}") //println(s"Generic labels: ${NamedEntity.patch(labels).mkString(", ")}") //println(s"Optional labels: ${optionalNERLabels.get.mkString(", ")}") - val mergedLabels = NamedEntity.patch(mergeNerLabels(genericLabels, optionalNERLabels.get)) + val mergedLabels = mergeNerLabels(genericLabels, nerLabels) + val patchedLabels = NamedEntity.patch(mergedLabels) //println(s"Merged labels: ${mergedLabels.mkString(", ")}") - mergedLabels - } + + patchedLabels + }.getOrElse(genericLabels) + + specificLabels } private def mergeNerLabels(generic: Seq[String], custom: Seq[String]): Seq[String] = { diff --git a/webapp/app/org/clulab/processors/webapp/serialization/ParseObj.scala b/webapp/app/org/clulab/processors/webapp/serialization/ParseObj.scala index 0c9bff455..617a4303d 100644 --- a/webapp/app/org/clulab/processors/webapp/serialization/ParseObj.scala +++ b/webapp/app/org/clulab/processors/webapp/serialization/ParseObj.scala @@ -14,7 +14,7 @@ class ParseObj(doc: Document) { head + xml.Utility.escape(text) + tail } - def getTdAtOptString(option: Option[Array[String]], n: Int): String = { + def
getTdAtOptString(option: Option[Seq[String]], n: Int): String = { val text = if (option.isEmpty) "" else option.get(n) @@ -22,9 +22,9 @@ class ParseObj(doc: Document) { getTd(text) } - def getTdAtString(values: Array[String], n: Int): String = getTd(values(n)) + def getTdAtString(values: Seq[String], n: Int): String = getTd(values(n)) - def getTdAtInt(values: Array[Int], n: Int): String = getTd(values(n).toString, true) + def getTdAtInt(values: Seq[Int], n: Int): String = getTd(values(n).toString, true) def edgesToString(to: Int): String = { val edges = sentence.dependencies.map(_.incomingEdges(to)).getOrElse(Array.empty) From 737e538b40b6f49829bdcc32f0668fded58ccf1f Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Mon, 26 May 2025 09:11:22 -0700 Subject: [PATCH 19/42] Remove dead code --- .../clulab/processors/clu/PostProcessor.scala | 43 +------------------ .../org/clulab/sequences/NamedEntity.scala | 42 +++--------------- 2 files changed, 6 insertions(+), 79 deletions(-) diff --git a/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala index 837e5c49c..2226e4642 100644 --- a/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/PostProcessor.scala @@ -12,39 +12,8 @@ object PostProcessor { // Matches agricultural season short hands such as "2021DS" or "2021WS" val WET_OR_DRY_SEASON = Pattern.compile("""(?i)[0-9]+(ds|ws)""") - /** POS tag corrections, in place */ - def postprocessPartOfSpeechTags2(words: Seq[String], tags: mutable.Seq[String]): mutable.Seq[String] = { - - // unigram patterns - words.indices.foreach { index => - if (tags(index) != "CC" && VERSUS_PATTERN.matcher(words(index)).matches) { - tags(index) = "CC" // "versus" seems like a CC to me. but maybe not... 
- } - - if(WET_OR_DRY_SEASON.matcher(words(index)).matches) { - tags(index) = "CD" // such years should be CDs because our grammars expect it - } - } - - // bigram patterns - words.indices.dropRight(1).foreach { curr => - val next = curr + 1 - // "due" in "due to" must be a preposition - if (words(curr).equalsIgnoreCase("due") && words(next).equalsIgnoreCase("to")) { - tags(curr) = "IN" - } - - // "fall" followed by a CD must be NN - else if(words(curr).equalsIgnoreCase("fall") && tags(next).equals("CD")) { - tags(curr) = "NN" - } - } - - tags - } - /** POS tag corrections */ - def postprocessPartOfSpeechTags1(words: Seq[String], tags: Seq[String]): Seq[String] = { + def postprocessPartOfSpeechTags(words: Seq[String], tags: Seq[String]): Seq[String] = { val newTags = words.indices.map { index => val word = words(index) val oldTag = tags(index) @@ -71,14 +40,4 @@ object PostProcessor { newTags } - - def postprocessPartOfSpeechTags(words: Seq[String], tags: Seq[String]): Seq[String] = { - val result1 = postprocessPartOfSpeechTags1(words, tags) - val result2 = postprocessPartOfSpeechTags2(words, mutable.Seq(tags: _*)) - - if (result1 != result2) - println("It went awry!") - - result1 - } } diff --git a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala index f20f6f91e..36aae40ce 100644 --- a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala +++ b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala @@ -59,49 +59,26 @@ object NamedEntity { bioLabels } - // Only INSIDEs can be invalid and they are made valid by + // Only INSIDEs can be invalid, and they are made valid by // converting them into a BEGIN. def toBegin(bioLabel: String): String = BEGIN + bioLabel.drop(INSIDE.length) def isValid(bioLabels: Seq[String]): Boolean = bioLabels.indices.forall { index => - isValid1(bioLabels(index), bioLabels.lift(index - 1)) + isValid(bioLabels(index), bioLabels.lift(index - 1)) } - def isValid2(bioLabels: mutable.Seq[String], index: Int): Boolean = { - val currBioLabel = bioLabels(index) - !currBioLabel.startsWith(INSIDE) || { - 0 < index && { - val prevBioLabel = bioLabels(index - 1) - - prevBioLabel == currBioLabel || { - prevBioLabel == toBegin(currBioLabel) - } - } - } - } - - def isValid1(currBioLabel: String, prevBioLabelOpt: Option[String]): Boolean = { + def isValid(currBioLabel: String, prevBioLabelOpt: Option[String]): Boolean = { !currBioLabel.startsWith(INSIDE) || prevBioLabelOpt.exists { prevBioLabel => prevBioLabel == currBioLabel || prevBioLabel == toBegin(currBioLabel) } } - - // Note that this patches the array in place! 
- def patch2(bioLabels: mutable.Seq[String]): mutable.Seq[String] = { - bioLabels.indices.foreach { index => - if (!isValid2(bioLabels, index)) - bioLabels(index) = toBegin(bioLabels(index)) - } - bioLabels - } - - def patch1(bioLabels: Seq[String]): Seq[String] = { + def patch(bioLabels: Seq[String]): Seq[String] = { var prevBioLabelOpt = bioLabels.lift(-1) val newBioLabels = bioLabels.indices.map { index => val oldBioLabel = bioLabels(index) val newBioLabel = - if (!isValid1(oldBioLabel, prevBioLabelOpt)) toBegin(oldBioLabel) + if (!isValid(oldBioLabel, prevBioLabelOpt)) toBegin(oldBioLabel) else oldBioLabel prevBioLabelOpt = Some(newBioLabel) @@ -110,13 +87,4 @@ object NamedEntity { newBioLabels } - - def patch(bioLabels: Seq[String]): Seq[String] = { - val result1 = patch1(bioLabels) - val result2 = patch2(mutable.Seq(bioLabels: _*)) - - if (result1 != result2) - println("This went awry!") - result1 - } } From 4cfd51822164f1d051a2798c88c7ad940cfd7959 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Mon, 26 May 2025 12:04:29 -0700 Subject: [PATCH 20/42] Maintenance Compile with no warnings, some renamed processor variables, view changes --- .../apps/NumericEntityRecognizerShell.scala | 3 +- .../clulab/processors/apps/OdinStarter.scala | 2 +- .../apps/DebuggingOdinStarterApp.scala | 2 +- .../graph/DebugRelationGraphExtractor.scala | 2 +- .../DebugTriggerMentionGraphExtractor.scala | 2 +- .../DebugTriggerPatternGraphExtractor.scala | 2 +- .../org/clulab/struct/GraphMap.scala | 2 + .../org/clulab/struct/GraphMap.scala | 2 + .../org/clulab/odinstarter/OdinStarter3.scala | 2 +- .../scala-3/org/clulab/struct/GraphMap.scala | 2 + .../org/clulab/processors/Sentence.scala | 12 +- .../processors/clu/BalaurProcessor.scala | 199 ++++++++---------- .../sequences/BiMEMMSequenceTagger.scala | 2 +- .../clulab/sequences/CombinedLexiconNER.scala | 1 + .../clulab/sequences/CompactLexiconNER.scala | 10 +- .../org/clulab/sequences/LexiconNER.scala | 7 +- .../clulab/sequences/MEMMSequenceTagger.scala | 4 +- .../org/clulab/sequences/NamedEntity.scala | 11 +- .../org/clulab/sequences/SequenceTagger.scala | 2 +- .../sequences/SequenceTaggerShell.scala | 1 + .../org/clulab/struct/BooleanHashTrie.scala | 4 +- .../scala/org/clulab/struct/HashTrie.scala | 4 +- .../scala/org/clulab/utils/ArrayView.scala | 37 ---- .../clulab/utils/ToEnhancedDependencies.scala | 2 +- .../org/clulab/utils/TestHash.scala | 2 +- .../org/clulab/utils/TestHash.scala | 2 +- .../scala-3/org/clulab/utils/TestHash.scala | 2 +- .../scala/org/clulab/processors/CluTest.scala | 2 +- .../clulab/sequences/TestNamedEntity.scala | 3 +- .../org/clulab/utils/TestArrayView.scala | 54 ----- .../webapp/controllers/HomeController.scala | 2 +- 31 files changed, 141 insertions(+), 243 deletions(-) delete mode 100644 library/src/main/scala/org/clulab/utils/ArrayView.scala delete mode 100644 library/src/test/scala/org/clulab/utils/TestArrayView.scala diff --git a/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala b/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala index 3a93ff4bd..c77688a54 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala @@ -23,9 +23,8 @@ class ReloadableNumericProcessor(ruleDirOpt: Option[String]) extends ReloadableP val numericEntityRecognizerOpt = balaurProcessor .numericEntityRecognizerOpt .map(_.reloaded(new File(ruleDirOpt.get))) - val 
numericEntityRecognizerOptOpt = numericEntityRecognizerOpt.map(Option(_)) - processorOpt = Some(balaurProcessor.copy(numericEntityRecognizerOptOpt = numericEntityRecognizerOptOpt)) + processorOpt = Some(balaurProcessor.copy(numericEntityRecognizerOpt = numericEntityRecognizerOpt)) } } diff --git a/apps/src/main/scala/org/clulab/processors/apps/OdinStarter.scala b/apps/src/main/scala/org/clulab/processors/apps/OdinStarter.scala index 09440b813..54abb3b5e 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/OdinStarter.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/OdinStarter.scala @@ -24,7 +24,7 @@ object OdinStarter extends App { LexiconNER(kbs, caseInsensitiveMatchings, baseDirOpt) } - val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) + val processor = new BalaurProcessor(lexiconNerOpt = Some(customLexiconNer)) val extractorEngine = { val masterResource = "/org/clulab/odinstarter/main.yml" // We usually want to reload rules during development, diff --git a/debugger/src/main/scala/org/clulab/odin/debugger/apps/DebuggingOdinStarterApp.scala b/debugger/src/main/scala/org/clulab/odin/debugger/apps/DebuggingOdinStarterApp.scala index 6924eef03..f55ff1b9b 100644 --- a/debugger/src/main/scala/org/clulab/odin/debugger/apps/DebuggingOdinStarterApp.scala +++ b/debugger/src/main/scala/org/clulab/odin/debugger/apps/DebuggingOdinStarterApp.scala @@ -31,7 +31,7 @@ object DebuggingOdinStarterApp extends App { LexiconNER(kbs, caseInsensitiveMatchings, baseDirOpt) } - val processor = new CluProcessor(optionalNER = Some(customLexiconNer)) + val processor = new CluProcessor(lexiconNerOpt = Some(customLexiconNer)) val exampleGlobalAction = (inMentions: Seq[Mention], state: State) => { val outMentions = inMentions.map { mention => if (mention.words.length % 2 == 0) diff --git a/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugRelationGraphExtractor.scala b/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugRelationGraphExtractor.scala index 7534c6415..b88f678b1 100644 --- a/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugRelationGraphExtractor.scala +++ b/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugRelationGraphExtractor.scala @@ -20,7 +20,7 @@ class DebugRelationGraphExtractor extends DebugTest { val resourceDir: File = new File(resourceDirName) val customLexiconNer = LexiconNER(Seq(s"$baseResourceName/FOOD.tsv"), Seq(true), Some(resourceDir)) - val processor = new CluProcessor(optionalNER = Some(customLexiconNer)) + val processor = new CluProcessor(lexiconNerOpt = Some(customLexiconNer)) val document = processor.annotate("John eats cake.", keepText = true) val sentence = document.sentences.head val ruleName = "people-eat-food" diff --git a/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerMentionGraphExtractor.scala b/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerMentionGraphExtractor.scala index 860646c06..97f8c4631 100644 --- a/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerMentionGraphExtractor.scala +++ b/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerMentionGraphExtractor.scala @@ -20,7 +20,7 @@ class DebugTriggerMentionGraphExtractor extends DebugTest { val resourceDir: File = new File(resourceDirName) val customLexiconNer = LexiconNER(Seq(s"$baseResourceName/FOOD.tsv"), Seq(true), Some(resourceDir)) - val processor = new CluProcessor(optionalNER = 
Some(customLexiconNer)) + val processor = new CluProcessor(lexiconNerOpt = Some(customLexiconNer)) val document = processor.annotate("John eats cake.", keepText = true) val sentence = document.sentences.head val ruleName = "people-eat-food" diff --git a/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerPatternGraphExtractor.scala b/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerPatternGraphExtractor.scala index 7aa28848b..31447ac66 100644 --- a/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerPatternGraphExtractor.scala +++ b/debugger/src/test/scala/org/clulab/odin/debugger/extractor/graph/DebugTriggerPatternGraphExtractor.scala @@ -20,7 +20,7 @@ class DebugTriggerPatternGraphExtractor extends DebugTest { val resourceDir: File = new File(resourceDirName) val customLexiconNer = LexiconNER(Seq(s"$baseResourceName/FOOD.tsv"), Seq(true), Some(resourceDir)) - val processor = new CluProcessor(optionalNER = Some(customLexiconNer)) + val processor = new CluProcessor(lexiconNerOpt = Some(customLexiconNer)) val document = processor.annotate("John eats cake.", keepText = true) val sentence = document.sentences.head val ruleName = "people-eat-food" diff --git a/library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala b/library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala index de5a5472f..8de1af507 100644 --- a/library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala +++ b/library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala @@ -9,6 +9,8 @@ class GraphMap protected extends mutable.HashMap[String, DirectedGraph[String]] object GraphMap extends GraphMapNames { type GraphMapType = GraphMap + val EMPTY_GRAPH = GraphMap() + def apply(): GraphMapType = new GraphMap() def apply(existing: Map[String, DirectedGraph[String]]): GraphMapType = { diff --git a/library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala b/library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala index 805226874..4cb404f24 100644 --- a/library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala +++ b/library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala @@ -8,6 +8,8 @@ object GraphMap extends GraphMapNames { // [warn] ...: inheritance from class HashMap in package mutable is deprecated (since 2.13.0): HashMap will be made final; use .withDefault for the common use case of computing a default value type GraphMapType = mutable.HashMap[String, DirectedGraph[String]] + val EMPTY_GRAPH = GraphMap() + def apply(): GraphMapType = { // we have very few dependency types, so let's create a small hash to save memory. 
new GraphMapType(2, mutable.HashMap.defaultLoadFactor) diff --git a/library/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala b/library/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala index fa9dfa73d..a1332bf6d 100644 --- a/library/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala +++ b/library/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala @@ -27,7 +27,7 @@ object OdinStarter3: val baseDirOpt = if isLocal then Some(resourceDir) else None LexiconNER(kbs, caseInsensitiveMatchings, baseDirOpt) - val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) + val processor = new BalaurProcessor(lexiconNerOpt = Some(customLexiconNer)) val extractorEngine = val masterResource = "/org/clulab/odinstarter/main.yml" // We usually want to reload rules during development, diff --git a/library/src/main/scala-3/org/clulab/struct/GraphMap.scala b/library/src/main/scala-3/org/clulab/struct/GraphMap.scala index 805226874..4cb404f24 100644 --- a/library/src/main/scala-3/org/clulab/struct/GraphMap.scala +++ b/library/src/main/scala-3/org/clulab/struct/GraphMap.scala @@ -8,6 +8,8 @@ object GraphMap extends GraphMapNames { // [warn] ...: inheritance from class HashMap in package mutable is deprecated (since 2.13.0): HashMap will be made final; use .withDefault for the common use case of computing a default value type GraphMapType = mutable.HashMap[String, DirectedGraph[String]] + val EMPTY_GRAPH = GraphMap() + def apply(): GraphMapType = { // we have very few dependency types, so let's create a small hash to save memory. new GraphMapType(2, mutable.HashMap.defaultLoadFactor) diff --git a/library/src/main/scala/org/clulab/processors/Sentence.scala b/library/src/main/scala/org/clulab/processors/Sentence.scala index c7b5a6f20..276e2dc2a 100644 --- a/library/src/main/scala/org/clulab/processors/Sentence.scala +++ b/library/src/main/scala/org/clulab/processors/Sentence.scala @@ -222,12 +222,12 @@ object Sentence { words: Seq[String], tags: Option[Seq[String]], lemmas: Option[Seq[String]], - entities: Option[Seq[String]], - norms: Option[Seq[String]], - chunks: Option[Seq[String]], - tree: Option[Tree], - deps: GraphMapType, - relations: Option[Seq[RelationTriple]] + entities: Option[Seq[String]] = None, + norms: Option[Seq[String]] = None, + chunks: Option[Seq[String]] = None, + tree: Option[Tree] = None, + deps: GraphMapType = GraphMap.EMPTY_GRAPH, + relations: Option[Seq[RelationTriple]] = None ): Sentence = { new Sentence( raw, startOffsets, endOffsets, words, diff --git a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala index 34b616395..4716e472e 100644 --- a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala @@ -5,23 +5,25 @@ import com.typesafe.config.ConfigFactory import org.clulab.numeric.NumericEntityRecognizer import org.clulab.numeric.NumericUtils import org.clulab.processors.{Document, Processor, Sentence} -import org.clulab.processors.clu.tokenizer.{EnglishLemmatizer, Lemmatizer, OpenDomainEnglishTokenizer, OpenDomainPortugueseTokenizer, OpenDomainSpanishTokenizer, PortugueseLemmatizer, SpanishLemmatizer, Tokenizer} +import org.clulab.processors.clu.tokenizer.Lemmatizer +import org.clulab.processors.clu.tokenizer.{EnglishLemmatizer, PortugueseLemmatizer, SpanishLemmatizer} +import org.clulab.processors.clu.tokenizer.Tokenizer +import 
org.clulab.processors.clu.tokenizer.{OpenDomainEnglishTokenizer, OpenDomainPortugueseTokenizer, OpenDomainSpanishTokenizer} import org.clulab.processors.hexatagging.HexaDecoder -import org.clulab.utils.WrappedArraySeq import org.clulab.scala_transformers.encoder.EncoderMaxTokensRuntimeException import org.clulab.scala_transformers.encoder.TokenClassifier import org.clulab.sequences.{LexiconNER, NamedEntity} import org.clulab.struct.DirectedGraph import org.clulab.struct.GraphMap -import org.clulab.struct.GraphMap.GraphMapType import org.clulab.utils.{Configured, MathUtils, ToEnhancedDependencies} +import org.clulab.utils.WrappedArraySeq import org.slf4j.{Logger, LoggerFactory} import BalaurProcessor._ class BalaurProcessor protected ( val config: Config, - val optionalNER: Option[LexiconNER], + val lexiconNerOpt: Option[LexiconNER], val numericEntityRecognizerOpt: Option[NumericEntityRecognizer], wordTokenizer: Tokenizer, wordLemmatizer: Lemmatizer, @@ -33,11 +35,11 @@ class BalaurProcessor protected ( // standard, abbreviated constructor def this( config: Config = ConfigFactory.load("balaurprocessor"), - optionalNER: Option[LexiconNER] = None, + lexiconNerOpt: Option[LexiconNER] = None, seasonPathOpt: Option[String] = Some("/org/clulab/numeric/SEASON.tsv") ) = this( config, - optionalNER, + lexiconNerOpt, newNumericEntityRecognizerOpt(seasonPathOpt), mkTokenizer(getConfigArgString(config, s"$prefix.language", Some("EN"))), mkLemmatizer(getConfigArgString(config, s"$prefix.language", Some("EN"))), @@ -46,43 +48,49 @@ class BalaurProcessor protected ( ) def copy( - configOpt: Option[Config] = None, - optionalNEROpt: Option[Option[LexiconNER]] = None, - numericEntityRecognizerOptOpt: Option[Option[NumericEntityRecognizer]] = None, - wordTokenizerOpt: Option[Tokenizer] = None, - wordLemmatizerOpt: Option[Lemmatizer] = None, - tokenClassifierOpt: Option[TokenClassifier] = None + config: Config = config, + lexiconNerOpt: Option[LexiconNER] = lexiconNerOpt, + numericEntityRecognizerOpt: Option[NumericEntityRecognizer] = numericEntityRecognizerOpt, + wordTokenizer: Tokenizer = wordTokenizer, + wordLemmatizer: Lemmatizer = wordLemmatizer, + tokenClassifier: TokenClassifier = tokenClassifier ): BalaurProcessor = { new BalaurProcessor( - configOpt.getOrElse(this.config), - optionalNEROpt.getOrElse(this.optionalNER), - numericEntityRecognizerOptOpt.getOrElse(this.numericEntityRecognizerOpt), - wordTokenizerOpt.getOrElse(this.wordTokenizer), - wordLemmatizerOpt.getOrElse(this.wordLemmatizer), - tokenClassifierOpt.getOrElse(this.tokenClassifier) + config, + lexiconNerOpt, + numericEntityRecognizerOpt, + wordTokenizer, + wordLemmatizer, + tokenClassifier ) } + // TODO: Try not to make a new decoder for each processor? val hexaDecoder = new HexaDecoder() override def getConf: Config = config + // TODO: Why not make the wordTokenizer a val then? 
+  def tokenizer: Tokenizer = wordTokenizer
+
   override def mkDocument(text: String, keepText: Boolean): Document = {
     DocumentMaker.mkDocument(tokenizer, text, keepText)
   }
 
-  def tokenizer: Tokenizer = wordTokenizer
-
-  override def mkDocumentFromSentences(sentences: Iterable[String],
+  override def mkDocumentFromSentences(
+    sentences: Iterable[String],
     keepText: Boolean,
-    charactersBetweenSentences: Int): Document = {
+    charactersBetweenSentences: Int
+  ): Document = {
     DocumentMaker.mkDocumentFromSentences(tokenizer, sentences, keepText, charactersBetweenSentences)
   }
 
-  override def mkDocumentFromTokens(sentences: Iterable[Iterable[String]],
+  override def mkDocumentFromTokens(
+    sentences: Iterable[Iterable[String]],
     keepText: Boolean,
     charactersBetweenSentences: Int,
-    charactersBetweenTokens: Int): Document = {
+    charactersBetweenTokens: Int
+  ): Document = {
     DocumentMaker.mkDocumentFromTokens(sentences, keepText, charactersBetweenSentences, charactersBetweenSentences)
   }
 
@@ -108,37 +116,28 @@ class BalaurProcessor protected (
   }
 
   /** Generates cheap lemmas with the word in lower case, for languages where a lemmatizer is not available */
-  def cheapLemmatize(sentence: Sentence): Seq[String] = {
-    sentence.words.map(_.toLowerCase())
-  }
+  def cheapLemmatize(sentence: Sentence): Seq[String] =
+    sentence.words.map(_.toLowerCase())
 
-  override def recognizeNamedEntities(doc: Document): Unit = {
-    throw new RuntimeException("ERROR: cannot call this method on its own in this procecessor!")
-  }
+  def throwCannotCallException(methodName: String): Unit =
+    throw new RuntimeException(s"ERROR: cannot call $methodName on its own in this processor!")
 
-  override def parse(doc: Document): Unit = {
-    throw new RuntimeException("ERROR: cannot call this method on its own in this procecessor!")
-  }
+  override def recognizeNamedEntities(doc: Document): Unit = throwCannotCallException("recognizeNamedEntities")
 
-  override def srl(doc: Document): Unit = {
-    throw new RuntimeException("ERROR: functionality not supported in this procecessor!")
-  }
+  override def parse(doc: Document): Unit = throwCannotCallException("parse")
 
-  override def chunking(doc: Document): Unit = {
-    throw new RuntimeException("ERROR: cannot call this method on its own in this procecessor!")
-  }
+  override def chunking(doc: Document): Unit = throwCannotCallException("chunking")
 
-  override def resolveCoreference(doc: Document): Unit = {
-    throw new RuntimeException("ERROR: functionality not supported in this procecessor!")
-  }
+  def throwNotSupportedException(methodName: String): Unit =
+    throw new RuntimeException(s"ERROR: $methodName functionality not supported in this processor!")
 
-  override def discourse(doc: Document): Unit = {
-    throw new RuntimeException("ERROR: functionality not supported in this procecessor!")
-  }
+  override def srl(doc: Document): Unit = throwNotSupportedException("srl")
 
-  override def relationExtraction(doc: Document): Unit = {
-    throw new RuntimeException("ERROR: functionality not supported in this procecessor!")
-  }
+  override def resolveCoreference(doc: Document): Unit = throwNotSupportedException("resolveCoreference")
+
+  override def discourse(doc: Document): Unit = throwNotSupportedException("discourse")
+
+  override def relationExtraction(doc: Document): Unit = throwNotSupportedException("relationExtraction")
 
   override def annotate(document: Document): Document = {
     // Process one sentence at a time through the MTL framework.
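The hunk above collapses eight copy-pasted throw blocks into two shared helpers. For readers skimming the patch, here is a minimal, self-contained sketch of the same pattern; the trait and method names below are invented for illustration and are not part of the patch.

    // Sketch only. One helper per failure mode keeps the message format in a
    // single place, and the overriding call sites shrink to one-liners.
    trait UnsupportedOps {
      def throwNotSupported(methodName: String): Unit =
        throw new RuntimeException(s"ERROR: $methodName functionality not supported in this processor!")

      def srl(): Unit = throwNotSupported("srl")
      def discourse(): Unit = throwNotSupported("discourse")
    }

Declaring the helper's result type as Nothing instead of Unit would additionally tell the compiler that such a call never returns; the patch keeps Unit, which the Unit-returning overrides accept just as well.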
@@ -151,7 +150,7 @@ class BalaurProcessor protected ( val allLabelsAndScores = tokenClassifier.predictWithScores(words) val tags = mkPosTags(words, allLabelsAndScores(TASK_TO_INDEX(POS_TASK))) val entities = { - val optionalEntities = mkOptionalNerLabels(words, sentence.startOffsets, sentence.endOffsets, tags, lemmas) + val optionalEntities = mkNerLabelsOpt(words, sentence.startOffsets, sentence.endOffsets, tags, lemmas) mkNamedEntityLabels(words, allLabelsAndScores(TASK_TO_INDEX(NER_TASK)), optionalEntities) } @@ -199,44 +198,37 @@ class BalaurProcessor protected ( private def mkPosTags(words: Seq[String], labels: Array[Array[(String, Float)]]): Seq[String] = { assert(labels.length == words.length) - val tags = WrappedArraySeq(labels.map(_.head._1)).toImmutableSeq - val result = PostProcessor.postprocessPartOfSpeechTags(words, tags) + val rawTags = WrappedArraySeq(labels.map(_.head._1)).toImmutableSeq + val cookedTags = PostProcessor.postprocessPartOfSpeechTags(words, rawTags) - result + cookedTags } - private def mkOptionalNerLabels( + private def mkNerLabelsOpt( words: Seq[String], startOffsets: Seq[Int], endOffsets: Seq[Int], tags: Seq[String], lemmas: Seq[String] ): Option[Seq[String]] = { - // NER labels from the custom NER - optionalNER.map { ner => + lexiconNerOpt.map { lexiconNer => val sentence = Sentence( - words, // Why isn't this raw? + words, // TODO: Why isn't this raw? startOffsets, endOffsets, words, Some(tags), - Some(lemmas), - entities = None, - norms = None, - chunks = None, - tree = None, - deps = EMPTY_GRAPH, - relations = None + Some(lemmas) ) - ner.find(sentence) + lexiconNer.find(sentence) } } /** Must be called after assignPosTags and lemmatize because it requires Sentence.tags and Sentence.lemmas */ - private def mkNamedEntityLabels(words: Seq[String], labels: Array[Array[(String, Float)]], optionalNERLabels: Option[Seq[String]]): Seq[String] = { + private def mkNamedEntityLabels(words: Seq[String], labels: Array[Array[(String, Float)]], nerLabelsOpt: Option[Seq[String]]): Seq[String] = { assert(labels.length == words.length) val labelsSeq = WrappedArraySeq(labels.map(_.head._1)).toImmutableSeq val genericLabels = NamedEntity.patch(labelsSeq) - val specificLabels = optionalNERLabels.map { nerLabels => + val specificLabels = nerLabelsOpt.map { nerLabels => //println(s"MERGING NE labels for sentence: ${sent.words.mkString(" ")}") //println(s"Generic labels: ${NamedEntity.patch(labels).mkString(", ")}") //println(s"Optional labels: ${optionalNERLabels.get.mkString(", ")}") @@ -258,9 +250,8 @@ class BalaurProcessor protected ( if (customNamedEntities.isEmpty) generic else { - // TODO: kwa work on combine - val result = generic.toArray // A copy of the generic labels is created here. val genericNamedEntities = NamedEntity.collect(generic) + val result = generic.toArray // A copy of the generic labels is created here. //println(s"Generic NamedEntity: ${genericNamedEntities.mkString(", ")}") //println(s"Custom NamedEntity: ${customNamedEntities.mkString(", ")}") @@ -277,13 +268,15 @@ class BalaurProcessor protected ( WrappedArraySeq(labels.map(_.head._1)).toImmutableSeq } + // TODO: This appears to be unused. // The head has one score, the label has another. Here the two scores are interpolated // and the head and label are stored together in a single object with the score if the // object, the Dependency, has a valid absolute head. 
private def interpolateHeadsAndLabels( - sentHeadPredictionScores: Array[Array[PredictionScore]], - sentLabelPredictionScores: Array[Array[PredictionScore]], - lambda: Float): Array[Array[Dependency]] = { + sentHeadPredictionScores: Array[Array[PredictionScore]], + sentLabelPredictionScores: Array[Array[PredictionScore]], + lambda: Float + ): Array[Array[Dependency]] = { assert(sentHeadPredictionScores.length == sentLabelPredictionScores.length) val sentDependencies = sentHeadPredictionScores.zip(sentLabelPredictionScores).zipWithIndex.map { case ((wordHeadPredictionScores, wordLabelPredictionScores), wordIndex) => @@ -316,21 +309,22 @@ class BalaurProcessor protected ( words: Seq[String], lemmas: Seq[String], tags: Seq[String], termTags: Array[Array[PredictionScore]], nonTermTags: Array[Array[PredictionScore]] - ): GraphMapType = { + ): GraphMap.GraphMapType = { val verbose = false val graphs = GraphMap() val size = words.length - // bht is used just for debugging purposes here val (bht, deps, roots) = hexaDecoder.decode(termTags, nonTermTags, topK = 25, verbose) - if(verbose && bht.nonEmpty) { + + if (verbose && bht.nonEmpty) { println(bht) println(s"Dependencies (${deps.get.size}):") println(deps.mkString("\n")) println("Roots: " + roots.get.mkString(", ")) } - if (deps.nonEmpty && roots.nonEmpty) { + // TODO: This can be made in one fell swoop. + // basic dependencies that replicate treebank annotations val depGraph = new DirectedGraph[String](deps.get, Some(size), roots) graphs += GraphMap.UNIVERSAL_BASIC -> depGraph @@ -351,51 +345,38 @@ object BalaurProcessor { val logger:Logger = LoggerFactory.getLogger(classOf[BalaurProcessor]) val prefix:String = "BalaurProcessor" - val OUTSIDE = "O" - val EMPTY_GRAPH = GraphMap() - val NER_TASK = "NER" val POS_TASK = "POS" val CHUNKING_TASK = "Chunking" - val DEPS_HEAD_TASK = "Deps Head" - val DEPS_LABEL_TASK = "Deps Label" val HEXA_TERM_TASK = "Hexa Term" val HEXA_NONTERM_TASK = "Hexa NonTerm" - val PARSING_INTERPOLATION_LAMBDA = 0.6f - val PARSING_TOPK = 5 - // maps a task name to a head index in the encoder - val TASK_TO_INDEX = Map( - NER_TASK -> 0, - POS_TASK -> 1, - CHUNKING_TASK -> 2, - HEXA_TERM_TASK -> 3, - HEXA_NONTERM_TASK -> 4 - ) - - def mkTokenizer(lang: String): Tokenizer = { - lang match { - case "PT" => new OpenDomainPortugueseTokenizer - case "ES" => new OpenDomainSpanishTokenizer - case _ => new OpenDomainEnglishTokenizer - } + val TASK_TO_INDEX: Map[String, Int] = Seq( + NER_TASK, + POS_TASK, + CHUNKING_TASK, + HEXA_TERM_TASK, + HEXA_NONTERM_TASK + ).zipWithIndex.toMap + + def mkTokenizer(lang: String): Tokenizer = lang match { + case "PT" => new OpenDomainPortugueseTokenizer + case "ES" => new OpenDomainSpanishTokenizer + case "EN" | _ => new OpenDomainEnglishTokenizer } - def mkLemmatizer(lang: String): Lemmatizer = { - lang match { - case "PT" => new PortugueseLemmatizer - case "ES" => new SpanishLemmatizer - case _ => new EnglishLemmatizer - } + def mkLemmatizer(lang: String): Lemmatizer = lang match { + case "PT" => new PortugueseLemmatizer + case "ES" => new SpanishLemmatizer + case "EN" | _ => new EnglishLemmatizer } def getConfigArgString (config: Config, argPath: String, defaultValue: Option[String]): String = - if (config.hasPath(argPath)) config.getString(argPath) - else if(defaultValue.nonEmpty) defaultValue.get - else throw new RuntimeException(s"ERROR: parameter $argPath must be defined!") + if (config.hasPath(argPath)) config.getString(argPath) + else if (defaultValue.nonEmpty) defaultValue.get + else throw new 
RuntimeException(s"ERROR: parameter $argPath must be defined!") - def newNumericEntityRecognizerOpt(seasonPathOpt: Option[String]): Option[NumericEntityRecognizer] = { - seasonPathOpt.map(NumericEntityRecognizer(_)) - } + def newNumericEntityRecognizerOpt(seasonPathOpt: Option[String]): Option[NumericEntityRecognizer] = + seasonPathOpt.map(NumericEntityRecognizer(_)) } diff --git a/library/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala b/library/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala index 3278df5f2..dd7118ac5 100644 --- a/library/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala +++ b/library/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala @@ -233,7 +233,7 @@ abstract class BiMEMMSequenceTagger[L: ClassTag, F: ClassTag]( if(leftToRight) history.toArray else SeqUtils.revert(history).toArray } - override def classesOf(sentence: Sentence):Array[L] = { + override def classesOf(sentence: Sentence):Seq[L] = { var firstPassLabels:Option[Array[L]] = None if(firstPassModel.nonEmpty) firstPassLabels = Some(classesOf(firstPassModel.get, sentence, None, ! leftToRight)) diff --git a/library/src/main/scala/org/clulab/sequences/CombinedLexiconNER.scala b/library/src/main/scala/org/clulab/sequences/CombinedLexiconNER.scala index 9c12ab411..9ab41afda 100644 --- a/library/src/main/scala/org/clulab/sequences/CombinedLexiconNER.scala +++ b/library/src/main/scala/org/clulab/sequences/CombinedLexiconNER.scala @@ -2,6 +2,7 @@ package org.clulab.sequences import org.clulab.processors.Sentence import org.clulab.sequences.LexiconNER._ +import org.clulab.scala.WrappedArray._ import org.clulab.struct.EntityValidator import org.clulab.struct.IntHashTrie diff --git a/library/src/main/scala/org/clulab/sequences/CompactLexiconNER.scala b/library/src/main/scala/org/clulab/sequences/CompactLexiconNER.scala index 08bee6769..2d16ddbf2 100644 --- a/library/src/main/scala/org/clulab/sequences/CompactLexiconNER.scala +++ b/library/src/main/scala/org/clulab/sequences/CompactLexiconNER.scala @@ -1,15 +1,15 @@ package org.clulab.sequences -import java.io.ObjectInputStream -import java.io.ObjectOutputStream -import java.util.Arrays - import org.clulab.processors.Sentence import org.clulab.sequences.LexiconNER.OUTSIDE_LABEL +import org.clulab.scala.WrappedArray._ import org.clulab.struct.EntityValidator import org.clulab.struct.IntHashTrie import org.clulab.struct.IntTrieNode +import java.io.ObjectInputStream +import java.io.ObjectOutputStream +import java.util.Arrays import scala.collection.mutable /** Lexicon-based NER similar to [[org.clulab.sequences.CombinedLexiconNER CombinedLexiconNER]] but which @@ -376,7 +376,7 @@ object CompactLexiconNER { // Assume that trieNodes are already sorted as much as necessary and all the tokens have stringIds. // Returns the number of parentsAdded and childrenAdded - def add(trieNodes: Array[IntTrieNode], parentOffset: Int, childOffset: Int): (Int, Int) = { + def add(trieNodes: Seq[IntTrieNode], parentOffset: Int, childOffset: Int): (Int, Int) = { // Area between parentOffset and parentOffset + parentRserve is for this recursive pass and // likewise for between childOffset and childOffset + childReserve. 
val parentReserve = trieNodes.length diff --git a/library/src/main/scala/org/clulab/sequences/LexiconNER.scala b/library/src/main/scala/org/clulab/sequences/LexiconNER.scala index cc8bebf16..24d0b143f 100644 --- a/library/src/main/scala/org/clulab/sequences/LexiconNER.scala +++ b/library/src/main/scala/org/clulab/sequences/LexiconNER.scala @@ -4,7 +4,6 @@ import org.clulab.processors.Sentence import org.clulab.scala.SeqView import org.clulab.scala.WrappedArray._ import org.clulab.struct.{EntityValidator, TrueEntityValidator} -import org.clulab.utils.ArrayView import java.io.File import scala.collection.mutable @@ -103,7 +102,7 @@ abstract class LexiconNER(val knownCaseInsensitives: Set[String], val useLemmas: ) protected def contentfulSpan(sentence: Sentence, start: Int, length: Int): Boolean = { - val wordsView = sentence.words.view(start, start + length) + val wordsView = sentence.words.view.slice(start, start + length) // A valid view/span must have a letter and at least one of the other qualifiers. val contentful = hasLetter(wordsView) && contentQualifiers.exists(_(wordsView)) @@ -314,7 +313,7 @@ object LexiconNER { var upperCaseLetters = 0 val spaces = math.max(0, end - start - 1) // Spaces are between words, not after them. - ArrayView(words, start, end).foreach { word => + words.view.slice(start, end).foreach { word => characters += word.length word.foreach { c => if (Character.isLetter(c)) letters += 1 @@ -347,7 +346,7 @@ object LexiconNER { while (offset < length) { val notOutsideCount = countWhile(src, offset, isNotOutside) // Check that there is not anything in dst that should not be overwritten. - if (!ArrayView(dst, offset, offset + notOutsideCount).exists(isNotOutside(_))) + if (!dst.view.slice(offset, offset + notOutsideCount).exists(isNotOutside(_))) Array.copy(src, offset, dst, offset, notOutsideCount) offset += notOutsideCount diff --git a/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala b/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala index aa6ac8b47..a78fc7795 100644 --- a/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala +++ b/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala @@ -67,7 +67,7 @@ abstract class MEMMSequenceTagger[L: ClassTag, F: ClassTag](var order:Int = 1, v logger.debug("Finished training.") } - override def classesOf(origSentence: Sentence):Array[L] = { + override def classesOf(origSentence: Sentence):Seq[L] = { val sentence = if(leftToRight) origSentence else origSentence.revert() val history = new ArrayBuffer[L]() @@ -80,7 +80,7 @@ abstract class MEMMSequenceTagger[L: ClassTag, F: ClassTag](var order:Int = 1, v history += label } - if(leftToRight) history.toArray else SeqUtils.revert(history).toArray + if(leftToRight) history else SeqUtils.revert(history) } override def save(file: File): Unit = { diff --git a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala index 36aae40ce..2b74c5b6d 100644 --- a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala +++ b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala @@ -43,7 +43,7 @@ object NamedEntity { namedEntities } - def combine(bioLabels: Array[String], genericNamedEntities: Seq[NamedEntity], customNamedEntities: Seq[NamedEntity]): Array[String] = { + def combine(bioLabels: Array[String], genericNamedEntities: Seq[NamedEntity], customNamedEntities: Seq[NamedEntity]): Unit = { // Neither named entities sequence can contain 
overlapping elements within the sequence. // At most, there is overlap between sequences. Use is made of that fact. // The NamedEntities never have empty Ranges, so end - 1 is always at least start. @@ -51,12 +51,13 @@ object NamedEntity { val validStarts = (genericNamedEntities.map(_.range.start) ++ outsides).toSet // The -1 is used to coordinate ends (exclusive) with the OUTSIDE positions (inclusive). val validEnds = (genericNamedEntities.map(_.range.end - 1) ++ outsides).toSet + val validCustomNamedEntities = customNamedEntities.filter { customNamedEntity => + validStarts(customNamedEntity.range.start) && validEnds(customNamedEntity.range.end - 1) + } - customNamedEntities.foreach { customNamedEntity => - if (validStarts(customNamedEntity.range.start) && validEnds(customNamedEntity.range.end - 1)) - customNamedEntity.fill(bioLabels) + validCustomNamedEntities.foreach { customNamedEntity => + customNamedEntity.fill(bioLabels) } - bioLabels } // Only INSIDEs can be invalid, and they are made valid by diff --git a/library/src/main/scala/org/clulab/sequences/SequenceTagger.scala b/library/src/main/scala/org/clulab/sequences/SequenceTagger.scala index 93fd32c5b..76081875a 100644 --- a/library/src/main/scala/org/clulab/sequences/SequenceTagger.scala +++ b/library/src/main/scala/org/clulab/sequences/SequenceTagger.scala @@ -15,7 +15,7 @@ import scala.util.Using trait SequenceTagger[L, F] extends Tagger[L] { def train(docs:Iterator[Document]): Unit - def classesOf(sentence: Sentence):Array[L] + def classesOf(sentence: Sentence):Seq[L] /** Abstract method that generates the features for the word at the position offset in the given sentence */ def featureExtractor(features:Counter[F], sentence: Sentence, offset:Int): Unit diff --git a/library/src/main/scala/org/clulab/sequences/SequenceTaggerShell.scala b/library/src/main/scala/org/clulab/sequences/SequenceTaggerShell.scala index 9bcb7368f..1b4566e68 100644 --- a/library/src/main/scala/org/clulab/sequences/SequenceTaggerShell.scala +++ b/library/src/main/scala/org/clulab/sequences/SequenceTaggerShell.scala @@ -5,6 +5,7 @@ import java.io.File import jline.console.ConsoleReader import jline.console.history.FileHistory import org.clulab.processors.Sentence +import org.clulab.scala.WrappedArray._ /** * Simple shell for sequence taggers diff --git a/library/src/main/scala/org/clulab/struct/BooleanHashTrie.scala b/library/src/main/scala/org/clulab/struct/BooleanHashTrie.scala index dc4bed380..5ab19e3ad 100644 --- a/library/src/main/scala/org/clulab/struct/BooleanHashTrie.scala +++ b/library/src/main/scala/org/clulab/struct/BooleanHashTrie.scala @@ -261,13 +261,13 @@ class DebugBooleanHashTrie(label: String, caseInsensitive: Boolean = true) exten * Generates BIO labels for this sequence when complete trie paths match * When multiple paths match, the longest one is kept */ - def find(sequence: Array[String], outsideLabel: String): Array[String] = { + def find(sequence: Seq[String], outsideLabel: String): Array[String] = { val casedSequence = if (caseInsensitive) sequence.map(_.toLowerCase) else sequence findNormalized(casedSequence, outsideLabel) } - private def findNormalized(sequence: Array[String], outsideLabel: String): Array[String] = { + private def findNormalized(sequence: Seq[String], outsideLabel: String): Array[String] = { val labels = new Array[String](sequence.length) var offset = 0 diff --git a/library/src/main/scala/org/clulab/struct/HashTrie.scala b/library/src/main/scala/org/clulab/struct/HashTrie.scala index 331858735..1bcd8c0af 100644 
--- a/library/src/main/scala/org/clulab/struct/HashTrie.scala +++ b/library/src/main/scala/org/clulab/struct/HashTrie.scala @@ -4,11 +4,11 @@ package org.clulab.struct class HashTrie(caseInsensitive: Boolean = true) extends BooleanHashTrie("", caseInsensitive) { - def find(sequence:Array[String], label: String, outsideLabel: String): Array[String] = + def find(sequence:Seq[String], label: String, outsideLabel: String): Array[String] = if (caseInsensitive) findNormalized(sequence.map(_.toLowerCase), label, outsideLabel) else findNormalized(sequence, label, outsideLabel) - protected def findNormalized(tokens: Array[String], label: String, outsideLabel: String): Array[String] = { + protected def findNormalized(tokens: Seq[String], label: String, outsideLabel: String): Array[String] = { val labels = new Array[String](tokens.length) lazy val bLabel = "B-" + label // lazy thinking that most calls will not use it lazy val iLabel = "I-" + label diff --git a/library/src/main/scala/org/clulab/utils/ArrayView.scala b/library/src/main/scala/org/clulab/utils/ArrayView.scala deleted file mode 100644 index afbd6d42a..000000000 --- a/library/src/main/scala/org/clulab/utils/ArrayView.scala +++ /dev/null @@ -1,37 +0,0 @@ -package org.clulab.utils - -import scala.collection.mutable - -// Array.view(from, until) is no longer available in Scala 2.13+. -class ArrayView[T](array: Array[T], from: Int, until: Int) extends IndexedSeq[T] { - val length = until - from - - override def apply(index: Int): T = array(from + index) -} - -object ArrayView { - - def apply[T](array: Array[T]): ArrayView[T] = apply(array, 0) - - def apply[T](array: Array[T], from: Int): ArrayView[T] = apply(array, from, array.length) - - def apply[T](array: Array[T], from: Int, until: Int): ArrayView[T] = new ArrayView(array, from, until) -} - -// Array.view(from, until) is no longer available in Scala 2.13+. 
-class MutableArrayView[T](array: Array[T], from: Int, until: Int) extends mutable.IndexedSeq[T] { - val length = until - from - - override def apply(index: Int): T = array(from + index) - - override def update(index: Int, elem: T): Unit = array(from + index) = elem -} - -object MutableArrayView { - - def apply[T](array: Array[T]): MutableArrayView[T] = apply(array, 0) - - def apply[T](array: Array[T], from: Int): MutableArrayView[T] = apply(array, from, array.length) - - def apply[T](array: Array[T], from: Int, until: Int): MutableArrayView[T] = new MutableArrayView(array, from, until) -} diff --git a/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala b/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala index e6da8e3b7..1eb8314d5 100644 --- a/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala +++ b/library/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala @@ -23,7 +23,7 @@ import scala.collection.mutable.{ArrayBuffer, ListBuffer} object ToEnhancedDependencies { type EdgeSpec = (Int, Int, String) - def generateStanfordEnhancedDependencies(words: Array[String], tags: Array[String], dg:DirectedGraph[String]): DirectedGraph[String] = { + def generateStanfordEnhancedDependencies(words: Array[String], tags: Seq[String], dg:DirectedGraph[String]): DirectedGraph[String] = { val dgi = dg.toDirectedGraphIndex() collapsePrepositionsStanford(words, dgi) raiseSubjects(dgi) diff --git a/library/src/test/scala-2.11_2.12/org/clulab/utils/TestHash.scala b/library/src/test/scala-2.11_2.12/org/clulab/utils/TestHash.scala index 54ce33916..85125c04b 100644 --- a/library/src/test/scala-2.11_2.12/org/clulab/utils/TestHash.scala +++ b/library/src/test/scala-2.11_2.12/org/clulab/utils/TestHash.scala @@ -16,7 +16,7 @@ class TestHash extends Test { LexiconNER(kbs, caseInsensitiveMatchings, None) } - val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) + val processor = new BalaurProcessor(lexiconNerOpt = Some(customLexiconNer)) val extractorEngine = { val rules = FileUtils.getTextFromResource("/org/clulab/odinstarter/main.yml") diff --git a/library/src/test/scala-2.13/org/clulab/utils/TestHash.scala b/library/src/test/scala-2.13/org/clulab/utils/TestHash.scala index 31e03d8ec..88e2b0726 100644 --- a/library/src/test/scala-2.13/org/clulab/utils/TestHash.scala +++ b/library/src/test/scala-2.13/org/clulab/utils/TestHash.scala @@ -16,7 +16,7 @@ class TestHash extends Test { LexiconNER(kbs, caseInsensitiveMatchings, None) } - val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) + val processor = new BalaurProcessor(lexiconNerOpt = Some(customLexiconNer)) val extractorEngine = { val rules = FileUtils.getTextFromResource("/org/clulab/odinstarter/main.yml") diff --git a/library/src/test/scala-3/org/clulab/utils/TestHash.scala b/library/src/test/scala-3/org/clulab/utils/TestHash.scala index 9a08e0ca5..9186e9ae6 100644 --- a/library/src/test/scala-3/org/clulab/utils/TestHash.scala +++ b/library/src/test/scala-3/org/clulab/utils/TestHash.scala @@ -17,7 +17,7 @@ class TestHash extends Test { LexiconNER(kbs, caseInsensitiveMatchings, None) } - val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) + val processor = new BalaurProcessor(lexiconNerOpt = Some(customLexiconNer)) val extractorEngine = { val rules = FileUtils.getTextFromResource("/org/clulab/odinstarter/main.yml") diff --git a/library/src/test/scala/org/clulab/processors/CluTest.scala b/library/src/test/scala/org/clulab/processors/CluTest.scala 
index 7b7d323e5..025e71413 100644 --- a/library/src/test/scala/org/clulab/processors/CluTest.scala +++ b/library/src/test/scala/org/clulab/processors/CluTest.scala @@ -29,7 +29,7 @@ class CluTest extends Test with BeforeAndAfterAll { ) val lexiconNer = LexiconNER(kbs, Seq(false), useLemmasForMatching = false) // case sensitive match on this KB - new BalaurProcessor(optionalNER = Some(lexiconNer)) + new BalaurProcessor(lexiconNerOpt = Some(lexiconNer)) } def stop(): Unit = { diff --git a/library/src/test/scala/org/clulab/sequences/TestNamedEntity.scala b/library/src/test/scala/org/clulab/sequences/TestNamedEntity.scala index 08a774400..cd731635f 100644 --- a/library/src/test/scala/org/clulab/sequences/TestNamedEntity.scala +++ b/library/src/test/scala/org/clulab/sequences/TestNamedEntity.scala @@ -45,7 +45,8 @@ class TestNamedEntity extends Test { val customBioLabels = customBioLabelString.split(" +") val genericNamedEntities = NamedEntity.collect(genericBioLabels) val customNamedEntities = NamedEntity.collect(customBioLabels) - val actualCombinedBioLabels = NamedEntity.combine(genericBioLabels, genericNamedEntities, customNamedEntities) + NamedEntity.combine(genericBioLabels, genericNamedEntities, customNamedEntities) + val actualCombinedBioLabels = genericBioLabels val actualCombinedBioLabelString = actualCombinedBioLabels.mkString(" ") val formattedExpectedCombinedBioLabelString = expectedCombinedBioLabelString.split(" +").mkString(" ") diff --git a/library/src/test/scala/org/clulab/utils/TestArrayView.scala b/library/src/test/scala/org/clulab/utils/TestArrayView.scala deleted file mode 100644 index 2bfbd08ff..000000000 --- a/library/src/test/scala/org/clulab/utils/TestArrayView.scala +++ /dev/null @@ -1,54 +0,0 @@ -package org.clulab.utils - -class TestArrayView extends Test { - - behavior of "ArrayView" - - it should "work with no offset" in { - val array = Array(1, 2, 3) - val arrayView = MutableArrayView(array) - - array.length should be (arrayView.length) - - arrayView.zip(array).foreach { case (arrayViewItem, arrayItem) => - arrayViewItem should be (arrayItem) - } - - arrayView(0) = 4 - arrayView(0) should be (4) - array(0) should be (4) - } - - it should "work with an offset" in { - val offset = 1 - val array = Array(1, 2, 3) - val arrayView = MutableArrayView(array, offset) - - array.length should be (arrayView.length + offset) - - arrayView.zip(array).foreach { case (arrayViewItem, arrayItem) => - arrayViewItem should be (arrayItem + offset) - } - - arrayView(0) = 4 - arrayView(0) should be (4) - array(1) should be (4) - } - - it should "work when clipped" in { - val offset = 1 - val clip = 1 - val array = Array(1, 2, 3) - val arrayView = MutableArrayView(array, offset, array.length - clip) - - array.length should be (arrayView.length + offset + clip) - - arrayView.zip(array).foreach { case (arrayViewItem, arrayItem) => - arrayViewItem should be (arrayItem + offset) - } - - arrayView(0) = 4 - arrayView(0) should be (4) - array(1) should be (4) - } -} diff --git a/webapp/app/org/clulab/processors/webapp/controllers/HomeController.scala b/webapp/app/org/clulab/processors/webapp/controllers/HomeController.scala index 9f4691529..14fc5ebb8 100644 --- a/webapp/app/org/clulab/processors/webapp/controllers/HomeController.scala +++ b/webapp/app/org/clulab/processors/webapp/controllers/HomeController.scala @@ -33,7 +33,7 @@ class HomeController @Inject()(cc: ControllerComponents) extends AbstractControl val kbs = customLexiconNerConfigs.map(_.kb) val caseInsensitiveMatchings = 
customLexiconNerConfigs.map(_.caseInsensitiveMatching) val customLexiconNer = LexiconNER(kbs, caseInsensitiveMatchings, None) - val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) + val processor = new BalaurProcessor(lexiconNerOpt = Some(customLexiconNer)) processor } From dbfe52b5f8fefa2bb85d986aebf890dcf8a2d08a Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Mon, 26 May 2025 12:30:00 -0700 Subject: [PATCH 21/42] Document, Sentence --- .../org/clulab/processors/Document.scala | 24 ++----- .../org/clulab/processors/Processor.scala | 64 ++++++++++--------- .../org/clulab/processors/Sentence.scala | 33 +++++----- .../sequences/BiMEMMSequenceTagger.scala | 4 +- .../clulab/sequences/MEMMSequenceTagger.scala | 4 +- 5 files changed, 61 insertions(+), 68 deletions(-) diff --git a/library/src/main/scala/org/clulab/processors/Document.scala b/library/src/main/scala/org/clulab/processors/Document.scala index 1cae6a826..34db68688 100644 --- a/library/src/main/scala/org/clulab/processors/Document.scala +++ b/library/src/main/scala/org/clulab/processors/Document.scala @@ -1,7 +1,5 @@ package org.clulab.processors -import java.io.PrintWriter - import org.clulab.struct.{CorefChains, DirectedGraphEdgeIterator} import org.clulab.utils.Hash import org.clulab.utils.Serializer @@ -9,6 +7,7 @@ import org.json4s.JString import org.json4s.JValue import org.json4s.jackson.prettyJson +import java.io.PrintWriter import scala.collection.mutable /** @@ -26,7 +25,8 @@ class Document( val text: Option[String] = None, /** Map of any arbitrary document attachments such as document creation time */ protected val attachments: Option[mutable.HashMap[String, DocumentAttachment]] = None, - protected val documentCreationTime:Option[String] = None + /** DCT is Document Creation Time */ + protected val dct: Option[String] = None ) extends Serializable { def copy( @@ -35,8 +35,8 @@ class Document( coreferenceChains: Option[CorefChains] = coreferenceChains, text: Option[String] = text, attachments: Option[mutable.HashMap[String, DocumentAttachment]] = None, - documentCreationTime: Option[String] = documentCreationTime - ): Document = new Document(sentences, id, coreferenceChains, text, attachments, documentCreationTime) + dct: Option[String] = dct + ): Document = new Document(sentences, id, coreferenceChains, text, attachments, dct) /** Clears any internal state potentially constructed by the annotators */ // def clear(): Unit = { } @@ -72,18 +72,9 @@ class Document( Hash.ordered(sentences.map(_.ambivalenceHash)) ) - /** Adds an attachment to the document's attachment map */ -// def addAttachment(name: String, attachment: DocumentAttachment): Unit = { -// if (attachments.isEmpty) -// attachments = Some(new mutable.HashMap[String, DocumentAttachment]()) -// attachments.get += name -> attachment -// } - /** Retrieves the attachment with the given name */ def getAttachment(name: String): Option[DocumentAttachment] = attachments.flatMap(_.get(name)) - def removeAttachment(name: String): Unit = attachments.foreach(_ -= name) - /** Retrieves keys to all attachments so that the entire collection can be read * for purposes including but not limited to serialization. If there are no * attachments, that is attachments == None, an empty set is returned. 
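The rename from documentCreationTime to dct above rides on the copy-with-defaults idiom that Document already uses: every copy parameter defaults to the current field, so callers override only what changes and get a fresh instance back. A toy stand-in of that idiom (Doc below is invented for illustration; the real Document carries many more fields):

    // Sketch only: the copy-with-defaults idiom from the hunks above.
    class Doc(val text: Option[String] = None, protected val dct: Option[String] = None) {
      def copy(text: Option[String] = this.text, dct: Option[String] = this.dct): Doc =
        new Doc(text, dct)
      def getDCT: Option[String] = dct
    }

    object DocDemo extends App {
      val doc = new Doc(text = Some("Rain is expected."))
      val dated = doc.copy(dct = Some("2025-05-26"))
      println(dated.getDCT) // prints Some(2025-05-26)
    }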
@@ -102,9 +93,8 @@ class Document(
    * The DCT will impacts how Sentence.norms are generated for DATE expressions
    * @param dct Document creation time
    */
-//  def setDCT(dct:String): Unit = documentCreationTime = Some(dct)
 
-  def getDCT: Option[String] = documentCreationTime
+  def getDCT: Option[String] = dct
 
   def prettyPrint(pw: PrintWriter): Unit = {
     // let's print the sentence-level annotations
@@ -216,7 +206,7 @@ class Document(
       coreferenceChains = doc.coreferenceChains,
       text = doc.text,
       attachments = doc.attachments,
-      documentCreationTime = doc.documentCreationTime
+      dct = doc.dct
     )
 
     newDocument
diff --git a/library/src/main/scala/org/clulab/processors/Processor.scala b/library/src/main/scala/org/clulab/processors/Processor.scala
index b7cab3423..9d84e1527 100644
--- a/library/src/main/scala/org/clulab/processors/Processor.scala
+++ b/library/src/main/scala/org/clulab/processors/Processor.scala
@@ -12,7 +12,7 @@ import scala.collection.mutable
 trait Processor {
 
   /** Constructs a document of tokens from free text; includes sentence splitting and tokenization. */
-  def mkDocument (text:String, keepText:Boolean = false): Document
+  def mkDocument(text:String, keepText:Boolean = false): Document
 
   // The documents here were created with Processor.mkDocument, which could have created a subclassed
   // Document or documents with certain fields already filled in. This implementation only handles
@@ -51,7 +51,7 @@ trait Processor {
       coreferenceChains = None,
       text = combinedTextOpt,
       attachments = Some(attachments),
-      documentCreationTime = headDctOpt
+      dct = headDctOpt
     )
 
     combinedDocument
@@ -84,16 +84,22 @@ trait Processor {
   }
 
   /** Constructs a document of tokens from an array of untokenized sentences. */
-  def mkDocumentFromSentences (sentences:Iterable[String],
-    keepText:Boolean = false,
-    charactersBetweenSentences:Int = 1): Document
+  def mkDocumentFromSentences(
+    sentences: Iterable[String],
+    keepText: Boolean = false,
+    charactersBetweenSentences: Int = 1
+  ): Document
 
   /** Constructs a document of tokens from an array of tokenized sentences. */
-  def mkDocumentFromTokens (sentences:Iterable[Iterable[String]],
-    keepText:Boolean = false,
-    charactersBetweenSentences:Int = 1,
-    charactersBetweenTokens:Int = 1): Document
+  def mkDocumentFromTokens(
+    sentences: Iterable[Iterable[String]],
+    keepText: Boolean = false,
+    charactersBetweenSentences: Int = 1,
+    charactersBetweenTokens: Int = 1
+  ): Document
 
+  /** Lemmatization; returns the lemmas for the given words. */
+  def lemmatize(words: Seq[String]): Seq[String]
 
   // Side-effecting annotations. These modify the document in place, which is not too elegant.
   // There are two reasons for this:
@@ -104,52 +110,52 @@ trait Processor {
   /** Part of speech tagging; modifies the document in place. */
   def tagPartsOfSpeech(doc: Document): Unit
 
-  /** Lemmatization; modifies the document in place. */
-  def lemmatize(words: Seq[String]): Seq[String]
-
   /** Named Entity Recognition; modifies the document in place. */
-  def recognizeNamedEntities (doc:Document): Unit
+  def recognizeNamedEntities(doc: Document): Unit
 
   /** Syntactic parsing; modifies the document in place. */
-  def parse (doc:Document): Unit
+  def parse(doc:Document): Unit
 
   /** Semantic role labeling */
-  def srl (doc: Document): Unit
+  def srl(doc: Document): Unit
 
   /** Shallow parsing; modifies the document in place. */
-  def chunking (doc:Document): Unit
+  def chunking(doc:Document): Unit
 
   /** Coreference resolution; modifies the document in place.
*/ - def resolveCoreference (doc:Document): Unit + def resolveCoreference(doc:Document): Unit /** Discourse parsing; modifies the document in place. */ - def discourse (doc:Document): Unit + def discourse(doc:Document): Unit /** Relation extraction; modifies the document in place. */ def relationExtraction(doc:Document): Unit /** Annotate the given text string, specify whether to retain the text in the resultant Document. */ - def annotate (text:String, keepText:Boolean = false): Document = { - val doc = mkDocument(text, keepText) - if (doc.sentences.nonEmpty) - annotate(doc) - else - doc + def annotate(text: String, keepText: Boolean = false): Document = { + val tokenizedDoc = mkDocument(text, keepText) + val annotatedDoc = // For now, these two documents have the same type. + if (tokenizedDoc.sentences.nonEmpty) annotate(tokenizedDoc) + else tokenizedDoc + + annotatedDoc } /** Annotate the given sentences, specify whether to retain the text in the resultant Document. */ - def annotateFromSentences ( - sentences:Iterable[String], - keepText:Boolean = false): Document = { + def annotateFromSentences( + sentences: Iterable[String], + keepText: Boolean = false + ): Document = { val doc = mkDocumentFromSentences(sentences, keepText) annotate(doc) } /** Annotate the given tokens, specify whether to retain the text in the resultant Document. */ - def annotateFromTokens ( + def annotateFromTokens( sentences:Iterable[Iterable[String]], - keepText:Boolean = false): Document = { + keepText:Boolean = false + ): Document = { val doc = mkDocumentFromTokens(sentences, keepText) annotate(doc) } diff --git a/library/src/main/scala/org/clulab/processors/Sentence.scala b/library/src/main/scala/org/clulab/processors/Sentence.scala index 276e2dc2a..97a2350a8 100644 --- a/library/src/main/scala/org/clulab/processors/Sentence.scala +++ b/library/src/main/scala/org/clulab/processors/Sentence.scala @@ -96,39 +96,37 @@ class Sentence( * * @return A directed graph of dependencies if any exist, otherwise None */ - def dependencies:Option[DirectedGraph[String]] = graphs match { + def dependencies: Option[DirectedGraph[String]] = graphs match { case collapsed if collapsed.contains(UNIVERSAL_ENHANCED) => collapsed.get(UNIVERSAL_ENHANCED) case basic if basic.contains(UNIVERSAL_BASIC) => basic.get(UNIVERSAL_BASIC) case _ => None } /** Fetches the universal basic dependencies */ - def universalBasicDependencies:Option[DirectedGraph[String]] = graphs.get(UNIVERSAL_BASIC) + def universalBasicDependencies: Option[DirectedGraph[String]] = graphs.get(UNIVERSAL_BASIC) /** Fetches the universal enhanced dependencies */ - def universalEnhancedDependencies:Option[DirectedGraph[String]] = graphs.get(UNIVERSAL_ENHANCED) + def universalEnhancedDependencies: Option[DirectedGraph[String]] = graphs.get(UNIVERSAL_ENHANCED) /** Fetches the Stanford basic dependencies */ - def stanfordBasicDependencies:Option[DirectedGraph[String]] = graphs.get(STANFORD_BASIC) + def stanfordBasicDependencies: Option[DirectedGraph[String]] = graphs.get(STANFORD_BASIC) /** Fetches the Stanford collapsed dependencies */ - def stanfordCollapsedDependencies:Option[DirectedGraph[String]] = graphs.get(STANFORD_COLLAPSED) + def stanfordCollapsedDependencies: Option[DirectedGraph[String]] = graphs.get(STANFORD_COLLAPSED) - def semanticRoles:Option[DirectedGraph[String]] = graphs.get(SEMANTIC_ROLES) - def enhancedSemanticRoles:Option[DirectedGraph[String]] = graphs.get(ENHANCED_SEMANTIC_ROLES) + def semanticRoles: Option[DirectedGraph[String]] = 
graphs.get(SEMANTIC_ROLES) + def enhancedSemanticRoles: Option[DirectedGraph[String]] = graphs.get(ENHANCED_SEMANTIC_ROLES) - def hybridDependencies:Option[DirectedGraph[String]] = graphs.get(HYBRID_DEPENDENCIES) - - def setDependencies(depType: String, deps: DirectedGraph[String]): Unit = graphs += (depType -> deps) + def hybridDependencies: Option[DirectedGraph[String]] = graphs.get(HYBRID_DEPENDENCIES) /** * Recreates the text of the sentence, preserving the original number of white spaces between tokens * * @return the text of the sentence */ - def getSentenceText:String = getSentenceFragmentText(0, words.length) + def getSentenceText: String = getSentenceFragmentText(0, words.length) - def getSentenceFragmentText(start:Int, end:Int):String = { + def getSentenceFragmentText(start: Int, end: Int):String = { // optimize the single token case if (end - start == 1) raw(start) else { @@ -147,8 +145,8 @@ class Sentence( } } - /** Reverts the current sentence */ - def revert(): Sentence = { + /** Reverses the current sentence */ + def reverse(): Sentence = { val reversedSentence = Sentence( raw.reverse, startOffsets.reverse, @@ -168,7 +166,6 @@ class Sentence( reversedSentence } - // TODO def copy( raw: Seq[String] = raw, startOffsets: Seq[Int] = startOffsets, @@ -203,13 +200,13 @@ class Sentence( object Sentence { def apply( - raw:Seq[String], + raw: Seq[String], startOffsets: Seq[Int], endOffsets: Seq[Int]): Sentence = new Sentence(raw, startOffsets, endOffsets, raw) // words are identical to raw tokens (a common situation) def apply( - raw:Seq[String], + raw: Seq[String], startOffsets: Seq[Int], endOffsets: Seq[Int], words: Seq[String]): Sentence = @@ -234,4 +231,4 @@ object Sentence { tags, lemmas, entities, norms, chunks, tree, deps, relations ) } -} \ No newline at end of file +} diff --git a/library/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala b/library/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala index dd7118ac5..d9fb83262 100644 --- a/library/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala +++ b/library/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala @@ -168,7 +168,7 @@ abstract class BiMEMMSequenceTagger[L: ClassTag, F: ClassTag]( // original sentence val origSentence = sentences(sentOffset) // actual sentence to be used - val sentence = if (leftToRight) origSentence else origSentence.revert() + val sentence = if (leftToRight) origSentence else origSentence.reverse() // labels to be learned val labels = if (leftToRight) labelExtractor(origSentence) @@ -211,7 +211,7 @@ abstract class BiMEMMSequenceTagger[L: ClassTag, F: ClassTag]( origSentence: Sentence, firstPassLabels:Option[Array[L]], leftToRight:Boolean): Array[L] = { - val sentence = if(leftToRight) origSentence else origSentence.revert() + val sentence = if(leftToRight) origSentence else origSentence.reverse() val firstPass = if(firstPassLabels.nonEmpty) { diff --git a/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala b/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala index a78fc7795..7cba53724 100644 --- a/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala +++ b/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala @@ -32,7 +32,7 @@ abstract class MEMMSequenceTagger[L: ClassTag, F: ClassTag](var order:Int = 1, v var sentCount = 0 for(doc <- docs; origSentence <- doc.sentences) { // labels and features for one sentence - val sentence = if(leftToRight) origSentence else origSentence.revert() + 
val sentence = if(leftToRight) origSentence else origSentence.reverse() val labels = if(leftToRight) labelExtractor(origSentence) else SeqUtils.revert(labelExtractor(origSentence)).toArray @@ -68,7 +68,7 @@ abstract class MEMMSequenceTagger[L: ClassTag, F: ClassTag](var order:Int = 1, v } override def classesOf(origSentence: Sentence):Seq[L] = { - val sentence = if(leftToRight) origSentence else origSentence.revert() + val sentence = if(leftToRight) origSentence else origSentence.reverse() val history = new ArrayBuffer[L]() for(i <- 0 until sentence.size) { From 2c19b03c10564ff245979fe8d1d43bd411b6bece Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Mon, 26 May 2025 12:30:27 -0700 Subject: [PATCH 22/42] Balaur --- .../clulab/processors/clu/BalaurProcessor.scala | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala index 4716e472e..67ef882d0 100644 --- a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala @@ -94,10 +94,6 @@ class BalaurProcessor protected ( DocumentMaker.mkDocumentFromTokens(sentences, keepText, charactersBetweenSentences, charactersBetweenSentences) } - override def tagPartsOfSpeech(doc: Document): Unit = { - throw new RuntimeException("ERROR: cannot call this method on its own in this processor!") - } - override def lemmatize(words: Seq[String]): Seq[String] = { val lemmas = words.zipWithIndex.map { case (word, index) => val lemma = wordLemmatizer.lemmatizeWord(word) @@ -119,9 +115,12 @@ class BalaurProcessor protected ( def cheapLemmatize(sentence: Sentence): Seq[String] = sentence.words.map(_.toLowerCase()) + // TODO: Just don't include anything that calls this. def throwCannotCallException(methodName: String): Unit = throw new RuntimeException(s"ERROR: cannot call $methodName on its own in this processor!") + override def tagPartsOfSpeech(doc: Document): Unit = throwCannotCallException("tagPartsOfSpeech") + override def recognizeNamedEntities(doc: Document): Unit = throwCannotCallException("recognizeNamedEntities") override def parse(doc: Document): Unit = throwCannotCallException("parse") @@ -139,9 +138,9 @@ class BalaurProcessor protected ( override def relationExtraction(doc: Document): Unit = throwNotSupportedException("relationExtraction") - override def annotate(document: Document): Document = { + override def annotate(doc: Document): Document = { // Process one sentence at a time through the MTL framework. - val partlyAnnotatedSentences = document.sentences.map { sentence => + val partlyAnnotatedSentences = doc.sentences.map { sentence => val words = sentence.words // Lemmas are created deterministically, not through the MTL framework. val lemmas = lemmatize(words) @@ -167,6 +166,7 @@ class BalaurProcessor protected ( partlyAnnotatedSentence } + // TODO: Improve error handling. catch { // No values, not even lemmas, will be included in the annotation is there was an exception. 
case e: EncoderMaxTokensRuntimeException => @@ -178,7 +178,7 @@ class BalaurProcessor protected ( sentence } } - val partlyAnnotatedDocument = document.copy(sentences = partlyAnnotatedSentences) + val partlyAnnotatedDocument = doc.copy(sentences = partlyAnnotatedSentences) val fullyAnnotatedDocument = numericEntityRecognizerOpt.map { numericEntityRecognizer => val numericMentions = numericEntityRecognizer.extractFrom(partlyAnnotatedDocument) val (newLabels, newNorms) = NumericUtils.mkLabelsAndNorms(partlyAnnotatedDocument, numericMentions) From 55eb202c8069a81771304e8078e44c104368a365 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Mon, 26 May 2025 15:53:14 -0700 Subject: [PATCH 23/42] Remove Scala-specific GraphMap --- .../org/clulab/struct/DependencyMap.scala | 12 --------- .../org/clulab/struct/GraphMap.scala | 20 -------------- .../org/clulab/struct/DependencyMap.scala | 14 ---------- .../org/clulab/struct/GraphMap.scala | 22 ---------------- .../org/clulab/struct/DependencyMap.scala | 14 ---------- .../scala-3/org/clulab/struct/GraphMap.scala | 22 ---------------- .../org/clulab/processors/Sentence.scala | 26 +++++++++---------- .../processors/clu/BalaurProcessor.scala | 25 +++++++++--------- .../org/clulab/processors/clu/Veil.scala | 9 +++---- .../serialization/DocumentSerializer.scala | 4 +-- .../serialization/json/JSONSerializer.scala | 2 +- .../clulab/serialization/json/package.scala | 4 +-- .../scala/org/clulab/struct/Annotation.scala | 4 +-- .../clulab/struct/DependencyMapNames.scala | 7 ----- .../{GraphMapNames.scala => GraphMap.scala} | 10 ++++++- .../org/clulab/utils/TestFindHeads.scala | 4 +-- 16 files changed, 45 insertions(+), 154 deletions(-) delete mode 100644 library/src/main/scala-2.11_2.12/org/clulab/struct/DependencyMap.scala delete mode 100644 library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala delete mode 100644 library/src/main/scala-2.13/org/clulab/struct/DependencyMap.scala delete mode 100644 library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala delete mode 100644 library/src/main/scala-3/org/clulab/struct/DependencyMap.scala delete mode 100644 library/src/main/scala-3/org/clulab/struct/GraphMap.scala delete mode 100644 library/src/main/scala/org/clulab/struct/DependencyMapNames.scala rename library/src/main/scala/org/clulab/struct/{GraphMapNames.scala => GraphMap.scala} (68%) diff --git a/library/src/main/scala-2.11_2.12/org/clulab/struct/DependencyMap.scala b/library/src/main/scala-2.11_2.12/org/clulab/struct/DependencyMap.scala deleted file mode 100644 index d9b2cbfc5..000000000 --- a/library/src/main/scala-2.11_2.12/org/clulab/struct/DependencyMap.scala +++ /dev/null @@ -1,12 +0,0 @@ -package org.clulab.struct - -import scala.collection.mutable - -class DependencyMap protected extends mutable.HashMap[Int, DirectedGraph[String]] { - override def initialSize: Int = 2 // we have very few dependency types, so let's create a small hash to save memory -} - -object DependencyMap extends DependencyMapNames { - - def apply(): DependencyMap = new DependencyMap() -} diff --git a/library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala b/library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala deleted file mode 100644 index 8de1af507..000000000 --- a/library/src/main/scala-2.11_2.12/org/clulab/struct/GraphMap.scala +++ /dev/null @@ -1,20 +0,0 @@ -package org.clulab.struct - -import scala.collection.mutable - -class GraphMap protected extends mutable.HashMap[String, DirectedGraph[String]] { - override def initialSize: Int = 2 // 
we have very few dependency types, so let's create a small hash to save memory -} - -object GraphMap extends GraphMapNames { - type GraphMapType = GraphMap - - val EMPTY_GRAPH = GraphMap() - - def apply(): GraphMapType = new GraphMap() - - def apply(existing: Map[String, DirectedGraph[String]]): GraphMapType = { - val gm = GraphMap() - gm ++= existing - } -} diff --git a/library/src/main/scala-2.13/org/clulab/struct/DependencyMap.scala b/library/src/main/scala-2.13/org/clulab/struct/DependencyMap.scala deleted file mode 100644 index c4ed49b82..000000000 --- a/library/src/main/scala-2.13/org/clulab/struct/DependencyMap.scala +++ /dev/null @@ -1,14 +0,0 @@ -package org.clulab.struct - -import scala.collection.mutable - -object DependencyMap extends DependencyMapNames { - // This was previously a class inheriting from HashMap. However, - // [warn] ...: inheritance from class HashMap in package mutable is deprecated (since 2.13.0): HashMap will be made final; use .withDefault for the common use case of computing a default value. - type DependencyMap = mutable.HashMap[String, DirectedGraph[String]] - - def apply(): DependencyMap = { - // we have very few dependency types, so let's create a small hash to save memory. - new DependencyMap(2, mutable.HashMap.defaultLoadFactor) - } -} diff --git a/library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala b/library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala deleted file mode 100644 index 4cb404f24..000000000 --- a/library/src/main/scala-2.13/org/clulab/struct/GraphMap.scala +++ /dev/null @@ -1,22 +0,0 @@ -package org.clulab.struct - -import scala.collection.mutable - -object GraphMap extends GraphMapNames { - - // This was previously a class inheriting from HashMap. However, - // [warn] ...: inheritance from class HashMap in package mutable is deprecated (since 2.13.0): HashMap will be made final; use .withDefault for the common use case of computing a default value - type GraphMapType = mutable.HashMap[String, DirectedGraph[String]] - - val EMPTY_GRAPH = GraphMap() - - def apply(): GraphMapType = { - // we have very few dependency types, so let's create a small hash to save memory. - new GraphMapType(2, mutable.HashMap.defaultLoadFactor) - } - - def apply(existing: scala.collection.Map[String, DirectedGraph[String]]): GraphMapType = { - val gm = GraphMap() - gm ++= existing - } -} diff --git a/library/src/main/scala-3/org/clulab/struct/DependencyMap.scala b/library/src/main/scala-3/org/clulab/struct/DependencyMap.scala deleted file mode 100644 index c4ed49b82..000000000 --- a/library/src/main/scala-3/org/clulab/struct/DependencyMap.scala +++ /dev/null @@ -1,14 +0,0 @@ -package org.clulab.struct - -import scala.collection.mutable - -object DependencyMap extends DependencyMapNames { - // This was previously a class inheriting from HashMap. However, - // [warn] ...: inheritance from class HashMap in package mutable is deprecated (since 2.13.0): HashMap will be made final; use .withDefault for the common use case of computing a default value. - type DependencyMap = mutable.HashMap[String, DirectedGraph[String]] - - def apply(): DependencyMap = { - // we have very few dependency types, so let's create a small hash to save memory. 
- new DependencyMap(2, mutable.HashMap.defaultLoadFactor) - } -} diff --git a/library/src/main/scala-3/org/clulab/struct/GraphMap.scala b/library/src/main/scala-3/org/clulab/struct/GraphMap.scala deleted file mode 100644 index 4cb404f24..000000000 --- a/library/src/main/scala-3/org/clulab/struct/GraphMap.scala +++ /dev/null @@ -1,22 +0,0 @@ -package org.clulab.struct - -import scala.collection.mutable - -object GraphMap extends GraphMapNames { - - // This was previously a class inheriting from HashMap. However, - // [warn] ...: inheritance from class HashMap in package mutable is deprecated (since 2.13.0): HashMap will be made final; use .withDefault for the common use case of computing a default value - type GraphMapType = mutable.HashMap[String, DirectedGraph[String]] - - val EMPTY_GRAPH = GraphMap() - - def apply(): GraphMapType = { - // we have very few dependency types, so let's create a small hash to save memory. - new GraphMapType(2, mutable.HashMap.defaultLoadFactor) - } - - def apply(existing: scala.collection.Map[String, DirectedGraph[String]]): GraphMapType = { - val gm = GraphMap() - gm ++= existing - } -} diff --git a/library/src/main/scala/org/clulab/processors/Sentence.scala b/library/src/main/scala/org/clulab/processors/Sentence.scala index 97a2350a8..d7589cc19 100644 --- a/library/src/main/scala/org/clulab/processors/Sentence.scala +++ b/library/src/main/scala/org/clulab/processors/Sentence.scala @@ -1,7 +1,6 @@ package org.clulab.processors import org.clulab.struct.{DirectedGraph, GraphMap, RelationTriple, Tree} -import org.clulab.struct.GraphMap._ import org.clulab.utils.Hash import scala.collection.mutable @@ -37,7 +36,7 @@ class Sentence( /** Constituent tree of this sentence; includes head words */ val syntacticTree: Option[Tree] = None, /** DAG of syntactic and semantic dependencies; word offsets start at 0 */ - val graphs: GraphMapType = GraphMap(), + val graphs: GraphMap.ImmutableType = GraphMap.immutableEmpty, /** Relation triples from OpenIE */ val relations:Option[Seq[RelationTriple]] = None ) extends Serializable { @@ -97,27 +96,28 @@ class Sentence( * @return A directed graph of dependencies if any exist, otherwise None */ def dependencies: Option[DirectedGraph[String]] = graphs match { - case collapsed if collapsed.contains(UNIVERSAL_ENHANCED) => collapsed.get(UNIVERSAL_ENHANCED) - case basic if basic.contains(UNIVERSAL_BASIC) => basic.get(UNIVERSAL_BASIC) + case collapsed if collapsed.contains(GraphMap.UNIVERSAL_ENHANCED) => collapsed.get(GraphMap.UNIVERSAL_ENHANCED) + case basic if basic.contains(GraphMap.UNIVERSAL_BASIC) => basic.get(GraphMap.UNIVERSAL_BASIC) case _ => None } /** Fetches the universal basic dependencies */ - def universalBasicDependencies: Option[DirectedGraph[String]] = graphs.get(UNIVERSAL_BASIC) + def universalBasicDependencies: Option[DirectedGraph[String]] = graphs.get(GraphMap.UNIVERSAL_BASIC) /** Fetches the universal enhanced dependencies */ - def universalEnhancedDependencies: Option[DirectedGraph[String]] = graphs.get(UNIVERSAL_ENHANCED) + def universalEnhancedDependencies: Option[DirectedGraph[String]] = graphs.get(GraphMap.UNIVERSAL_ENHANCED) /** Fetches the Stanford basic dependencies */ - def stanfordBasicDependencies: Option[DirectedGraph[String]] = graphs.get(STANFORD_BASIC) + def stanfordBasicDependencies: Option[DirectedGraph[String]] = graphs.get(GraphMap.STANFORD_BASIC) /** Fetches the Stanford collapsed dependencies */ - def stanfordCollapsedDependencies: Option[DirectedGraph[String]] = graphs.get(STANFORD_COLLAPSED) + 
def stanfordCollapsedDependencies: Option[DirectedGraph[String]] = graphs.get(GraphMap.STANFORD_COLLAPSED) - def semanticRoles: Option[DirectedGraph[String]] = graphs.get(SEMANTIC_ROLES) - def enhancedSemanticRoles: Option[DirectedGraph[String]] = graphs.get(ENHANCED_SEMANTIC_ROLES) + def semanticRoles: Option[DirectedGraph[String]] = graphs.get(GraphMap.SEMANTIC_ROLES) - def hybridDependencies: Option[DirectedGraph[String]] = graphs.get(HYBRID_DEPENDENCIES) + def enhancedSemanticRoles: Option[DirectedGraph[String]] = graphs.get(GraphMap.ENHANCED_SEMANTIC_ROLES) + + def hybridDependencies: Option[DirectedGraph[String]] = graphs.get(GraphMap.HYBRID_DEPENDENCIES) /** * Recreates the text of the sentence, preserving the original number of white spaces between tokens @@ -178,7 +178,7 @@ class Sentence( norms: Option[Seq[String]] = norms, chunks: Option[Seq[String]] = chunks, syntacticTree: Option[Tree] = syntacticTree, - graphs: GraphMapType = graphs, + graphs: GraphMap.ImmutableType = graphs, relations: Option[Seq[RelationTriple]] = relations ): Sentence = new Sentence( @@ -223,7 +223,7 @@ object Sentence { norms: Option[Seq[String]] = None, chunks: Option[Seq[String]] = None, tree: Option[Tree] = None, - deps: GraphMapType = GraphMap.EMPTY_GRAPH, + deps: GraphMap.ImmutableType = GraphMap.immutableEmpty, relations: Option[Seq[RelationTriple]] = None ): Sentence = { new Sentence( diff --git a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala index 67ef882d0..ce20b2f62 100644 --- a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala @@ -309,9 +309,8 @@ class BalaurProcessor protected ( words: Seq[String], lemmas: Seq[String], tags: Seq[String], termTags: Array[Array[PredictionScore]], nonTermTags: Array[Array[PredictionScore]] - ): GraphMap.GraphMapType = { + ): GraphMap.ImmutableType = { val verbose = false - val graphs = GraphMap() val size = words.length // bht is used just for debugging purposes here val (bht, deps, roots) = hexaDecoder.decode(termTags, nonTermTags, topK = 25, verbose) @@ -323,27 +322,27 @@ class BalaurProcessor protected ( println("Roots: " + roots.get.mkString(", ")) } if (deps.nonEmpty && roots.nonEmpty) { - // TODO: This can be made in one fell swoop. 
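The deleted TODO above is what the rest of this hunk resolves: the three dependency graphs are no longer accumulated into a mutable GraphMap with +=, but are produced as one immutable Map expression. Condensed to a sketch of the new shape:

    if (deps.nonEmpty && roots.nonEmpty)
      Map(
        GraphMap.UNIVERSAL_BASIC -> depGraph,
        GraphMap.UNIVERSAL_ENHANCED -> enhancedDepGraph,
        GraphMap.HYBRID_DEPENDENCIES -> enhancedDepGraph
      )
    else
      GraphMap.immutableEmpty

With no intermediate mutable state, the method can return its result directly.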
- // basic dependencies that replicate treebank annotations val depGraph = new DirectedGraph[String](deps.get, Some(size), roots) - graphs += GraphMap.UNIVERSAL_BASIC -> depGraph - // enhanced dependencies as defined by Manning val enhancedDepGraph = ToEnhancedDependencies.generateUniversalEnhancedDependencies(words, lemmas, tags, depGraph) - graphs += GraphMap.UNIVERSAL_ENHANCED -> enhancedDepGraph - // ideally, hybrid dependencies should contain both syntactic dependencies and semantic roles - // however, this processor produces only syntactic dependencies - graphs += GraphMap.HYBRID_DEPENDENCIES -> enhancedDepGraph + Map( + GraphMap.UNIVERSAL_BASIC -> depGraph, + GraphMap.UNIVERSAL_ENHANCED -> enhancedDepGraph, + // ideally, hybrid dependencies should contain both syntactic dependencies and semantic roles + // however, this processor produces only syntactic dependencies + GraphMap.HYBRID_DEPENDENCIES -> enhancedDepGraph + ) } - graphs + else + GraphMap.immutableEmpty } } object BalaurProcessor { - val logger:Logger = LoggerFactory.getLogger(classOf[BalaurProcessor]) - val prefix:String = "BalaurProcessor" + val logger: Logger = LoggerFactory.getLogger(classOf[BalaurProcessor]) + val prefix: String = "BalaurProcessor" val NER_TASK = "NER" val POS_TASK = "POS" diff --git a/library/src/main/scala/org/clulab/processors/clu/Veil.scala b/library/src/main/scala/org/clulab/processors/clu/Veil.scala index aac0bc99f..430762e83 100644 --- a/library/src/main/scala/org/clulab/processors/clu/Veil.scala +++ b/library/src/main/scala/org/clulab/processors/clu/Veil.scala @@ -2,7 +2,6 @@ package org.clulab.processors.clu import org.clulab.processors.{Document, Processor, Sentence} import org.clulab.struct.{DirectedGraph, Edge, GraphMap, RelationTriple, Tree} -import org.clulab.struct.GraphMap.GraphMapType import org.clulab.utils.WrappedArraySeq import scala.collection.mutable.{Set => MutableSet} @@ -137,18 +136,16 @@ class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) } } - def unveilGraphs(veiledGraphs: GraphMapType, sentenceIndex: Int): GraphMapType = { + def unveilGraphs(veiledGraphs: GraphMap.ImmutableType, sentenceIndex: Int): GraphMap.ImmutableType = { val unveilArray = unveilArrays(sentenceIndex) - val unveiledGraphs = GraphMap() val originalLength = originalDocument.sentences(sentenceIndex).words.length - - veiledGraphs.foreach { case (name, veiledDirectedGraph) => + val unveiledGraphs = veiledGraphs.map { case (name, veiledDirectedGraph) => val unveiledEdges = veiledDirectedGraph.allEdges.map { case (veiledSource, veiledDestination, relation) => Edge(unveilArray(veiledSource), unveilArray(veiledDestination), relation) } val unveiledRoots = veiledDirectedGraph.roots.map(unveilArray) - unveiledGraphs(name) = new DirectedGraph(unveiledEdges, Some(originalLength), Some(unveiledRoots)) + name -> new DirectedGraph(unveiledEdges, Some(originalLength), Some(unveiledRoots)) } unveiledGraphs } diff --git a/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala b/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala index c2f3f885c..4e9db1baf 100644 --- a/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala +++ b/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala @@ -235,7 +235,7 @@ class DocumentSerializer extends Logging { assert(normBuffer.isEmpty || normBuffer.size == tokenCount) assert(chunkBuffer.isEmpty || chunkBuffer.size == tokenCount) - var deps = GraphMap() + val deps = 
GraphMap.mutableEmpty var tree:Option[Tree] = None var relations:Option[Seq[RelationTriple]] = None while ({ @@ -266,7 +266,7 @@ class DocumentSerializer extends Logging { bufferOption(entityBuffer, nilEntities), bufferOption(normBuffer, nilNorms), bufferOption(chunkBuffer, nilChunks), - tree, deps, relations + tree, deps.toMap, relations ) } diff --git a/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala b/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala index 26853d11b..2e66d1f76 100644 --- a/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala +++ b/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala @@ -102,7 +102,7 @@ object JSONSerializer { key -> toDirectedGraph(json, Some(preferredSize)) }.toMap - GraphMap(graphs) + graphs } val relations = None // TODO: Are these not serialized? val parsedSentence = Sentence( diff --git a/library/src/main/scala/org/clulab/serialization/json/package.scala b/library/src/main/scala/org/clulab/serialization/json/package.scala index a27c14174..3d93d9cf4 100644 --- a/library/src/main/scala/org/clulab/serialization/json/package.scala +++ b/library/src/main/scala/org/clulab/serialization/json/package.scala @@ -52,8 +52,8 @@ package object json { } } - implicit class GraphMapOps(gm: GraphMapType) extends JSONSerialization { - def jsonAST: JValue = Extraction.decompose(gm.toMap.map { case (k, v) => k -> v.jsonAST }) // instead of mapValues + implicit class GraphMapOps(gm: GraphMap.ImmutableType) extends JSONSerialization { + def jsonAST: JValue = Extraction.decompose(gm.map { case (k, v) => k -> v.jsonAST }) // instead of mapValues } /** For Document */ diff --git a/library/src/main/scala/org/clulab/struct/Annotation.scala b/library/src/main/scala/org/clulab/struct/Annotation.scala index 4323cecf3..5d98de5ed 100644 --- a/library/src/main/scala/org/clulab/struct/Annotation.scala +++ b/library/src/main/scala/org/clulab/struct/Annotation.scala @@ -1,7 +1,5 @@ package org.clulab.struct -import org.clulab.struct.GraphMap.GraphMapType - // These are by the word ones and then there are relationships between words. // So parse, might not be a thing that is per word. 
//case class WordParse(tag: String, lemma: String, entity: String, norm: String, chunk: String) @@ -21,7 +19,7 @@ case class Annotation( /** Constituent tree of this sentence; includes head words */ syntacticTree: Option[Tree] = None, /** DAG of syntactic and semantic dependencies; word offsets start at 0 */ - graphs: GraphMapType = GraphMap(), + graphs: GraphMap.ImmutableType = GraphMap.immutableEmpty, /** Relation triples from OpenIE */ relations:Option[Array[RelationTriple]] = None ) { diff --git a/library/src/main/scala/org/clulab/struct/DependencyMapNames.scala b/library/src/main/scala/org/clulab/struct/DependencyMapNames.scala deleted file mode 100644 index 82a8b39ab..000000000 --- a/library/src/main/scala/org/clulab/struct/DependencyMapNames.scala +++ /dev/null @@ -1,7 +0,0 @@ -package org.clulab.struct - -trait DependencyMapNames { - val STANFORD_BASIC = 0 // basic Stanford dependencies - val STANFORD_COLLAPSED = 1 // collapsed Stanford dependencies - val SEMANTIC_ROLES = 2 // semantic roles from CoNLL 2008-09, which includes PropBank and NomBank -} diff --git a/library/src/main/scala/org/clulab/struct/GraphMapNames.scala b/library/src/main/scala/org/clulab/struct/GraphMap.scala similarity index 68% rename from library/src/main/scala/org/clulab/struct/GraphMapNames.scala rename to library/src/main/scala/org/clulab/struct/GraphMap.scala index 012f0f52a..f9111af49 100644 --- a/library/src/main/scala/org/clulab/struct/GraphMapNames.scala +++ b/library/src/main/scala/org/clulab/struct/GraphMap.scala @@ -1,6 +1,14 @@ package org.clulab.struct -trait GraphMapNames { +import scala.collection.mutable + +object GraphMap { + type ImmutableType = Map[String, DirectedGraph[String]] + type MutableType = mutable.Map[String, DirectedGraph[String]] + + val immutableEmpty: ImmutableType = Map.empty + val mutableEmpty: MutableType = mutable.Map.empty[String, DirectedGraph[String]] + val UNIVERSAL_BASIC = "universal-basic" // basic Universal dependencies val UNIVERSAL_ENHANCED = "universal-enhanced" // collapsed (or enhanced) Universal dependencies val STANFORD_BASIC = "stanford-basic" // basic Stanford dependencies diff --git a/library/src/test/scala/org/clulab/utils/TestFindHeads.scala b/library/src/test/scala/org/clulab/utils/TestFindHeads.scala index 4fd3fdfe4..bb9ba3823 100644 --- a/library/src/test/scala/org/clulab/utils/TestFindHeads.scala +++ b/library/src/test/scala/org/clulab/utils/TestFindHeads.scala @@ -11,10 +11,10 @@ class TestFindHeads extends Test { val endOffsets = Seq(0) // unused val sentence = new Sentence( words, startOffsets, endOffsets, words, - tags = Some(words) + tags = Some(words), + graphs = Map(UNIVERSAL_BASIC -> directedGraph) ) - sentence.graphs(UNIVERSAL_BASIC) = directedGraph sentence } From 3c3f3db3f1f0b0192e6e57ad81f649cc1c3817dd Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Mon, 26 May 2025 16:04:10 -0700 Subject: [PATCH 24/42] More GraphMap --- .../src/main/scala/org/clulab/processors/Sentence.scala | 6 +++--- .../scala/org/clulab/processors/clu/BalaurProcessor.scala | 4 ++-- .../src/main/scala/org/clulab/processors/clu/Veil.scala | 2 +- .../org/clulab/serialization/DocumentSerializer.scala | 4 ++-- .../scala/org/clulab/serialization/json/package.scala | 2 +- library/src/main/scala/org/clulab/struct/Annotation.scala | 2 +- library/src/main/scala/org/clulab/struct/GraphMap.scala | 8 ++------ 7 files changed, 12 insertions(+), 16 deletions(-) diff --git a/library/src/main/scala/org/clulab/processors/Sentence.scala 
b/library/src/main/scala/org/clulab/processors/Sentence.scala index d7589cc19..acb14b56b 100644 --- a/library/src/main/scala/org/clulab/processors/Sentence.scala +++ b/library/src/main/scala/org/clulab/processors/Sentence.scala @@ -36,7 +36,7 @@ class Sentence( /** Constituent tree of this sentence; includes head words */ val syntacticTree: Option[Tree] = None, /** DAG of syntactic and semantic dependencies; word offsets start at 0 */ - val graphs: GraphMap.ImmutableType = GraphMap.immutableEmpty, + val graphs: GraphMap.Type = GraphMap.empty, /** Relation triples from OpenIE */ val relations:Option[Seq[RelationTriple]] = None ) extends Serializable { @@ -178,7 +178,7 @@ class Sentence( norms: Option[Seq[String]] = norms, chunks: Option[Seq[String]] = chunks, syntacticTree: Option[Tree] = syntacticTree, - graphs: GraphMap.ImmutableType = graphs, + graphs: GraphMap.Type = graphs, relations: Option[Seq[RelationTriple]] = relations ): Sentence = new Sentence( @@ -223,7 +223,7 @@ object Sentence { norms: Option[Seq[String]] = None, chunks: Option[Seq[String]] = None, tree: Option[Tree] = None, - deps: GraphMap.ImmutableType = GraphMap.immutableEmpty, + deps: GraphMap.Type = GraphMap.empty, relations: Option[Seq[RelationTriple]] = None ): Sentence = { new Sentence( diff --git a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala index ce20b2f62..8404ed13f 100644 --- a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala @@ -309,7 +309,7 @@ class BalaurProcessor protected ( words: Seq[String], lemmas: Seq[String], tags: Seq[String], termTags: Array[Array[PredictionScore]], nonTermTags: Array[Array[PredictionScore]] - ): GraphMap.ImmutableType = { + ): GraphMap.Type = { val verbose = false val size = words.length // bht is used just for debugging purposes here @@ -336,7 +336,7 @@ class BalaurProcessor protected ( ) } else - GraphMap.immutableEmpty + GraphMap.empty } } diff --git a/library/src/main/scala/org/clulab/processors/clu/Veil.scala b/library/src/main/scala/org/clulab/processors/clu/Veil.scala index 430762e83..31d25ed9c 100644 --- a/library/src/main/scala/org/clulab/processors/clu/Veil.scala +++ b/library/src/main/scala/org/clulab/processors/clu/Veil.scala @@ -136,7 +136,7 @@ class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) } } - def unveilGraphs(veiledGraphs: GraphMap.ImmutableType, sentenceIndex: Int): GraphMap.ImmutableType = { + def unveilGraphs(veiledGraphs: GraphMap.Type, sentenceIndex: Int): GraphMap.Type = { val unveilArray = unveilArrays(sentenceIndex) val originalLength = originalDocument.sentences(sentenceIndex).words.length val unveiledGraphs = veiledGraphs.map { case (name, veiledDirectedGraph) => diff --git a/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala b/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala index 4e9db1baf..cfae7e40b 100644 --- a/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala +++ b/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala @@ -235,7 +235,7 @@ class DocumentSerializer extends Logging { assert(normBuffer.isEmpty || normBuffer.size == tokenCount) assert(chunkBuffer.isEmpty || chunkBuffer.size == tokenCount) - val deps = GraphMap.mutableEmpty + var deps = GraphMap.empty var tree:Option[Tree] = None var relations:Option[Seq[RelationTriple]] = None 
while ({ @@ -266,7 +266,7 @@ class DocumentSerializer extends Logging { bufferOption(entityBuffer, nilEntities), bufferOption(normBuffer, nilNorms), bufferOption(chunkBuffer, nilChunks), - tree, deps.toMap, relations + tree, deps, relations ) } diff --git a/library/src/main/scala/org/clulab/serialization/json/package.scala b/library/src/main/scala/org/clulab/serialization/json/package.scala index 3d93d9cf4..88276826b 100644 --- a/library/src/main/scala/org/clulab/serialization/json/package.scala +++ b/library/src/main/scala/org/clulab/serialization/json/package.scala @@ -52,7 +52,7 @@ package object json { } } - implicit class GraphMapOps(gm: GraphMap.ImmutableType) extends JSONSerialization { + implicit class GraphMapOps(gm: GraphMap.Type) extends JSONSerialization { def jsonAST: JValue = Extraction.decompose(gm.map { case (k, v) => k -> v.jsonAST }) // instead of mapValues } diff --git a/library/src/main/scala/org/clulab/struct/Annotation.scala b/library/src/main/scala/org/clulab/struct/Annotation.scala index 5d98de5ed..d9f390a86 100644 --- a/library/src/main/scala/org/clulab/struct/Annotation.scala +++ b/library/src/main/scala/org/clulab/struct/Annotation.scala @@ -19,7 +19,7 @@ case class Annotation( /** Constituent tree of this sentence; includes head words */ syntacticTree: Option[Tree] = None, /** DAG of syntactic and semantic dependencies; word offsets start at 0 */ - graphs: GraphMap.ImmutableType = GraphMap.immutableEmpty, + graphs: GraphMap.Type = GraphMap.empty, /** Relation triples from OpenIE */ relations:Option[Array[RelationTriple]] = None ) { diff --git a/library/src/main/scala/org/clulab/struct/GraphMap.scala b/library/src/main/scala/org/clulab/struct/GraphMap.scala index f9111af49..6857916e3 100644 --- a/library/src/main/scala/org/clulab/struct/GraphMap.scala +++ b/library/src/main/scala/org/clulab/struct/GraphMap.scala @@ -1,13 +1,9 @@ package org.clulab.struct -import scala.collection.mutable - object GraphMap { - type ImmutableType = Map[String, DirectedGraph[String]] - type MutableType = mutable.Map[String, DirectedGraph[String]] + type Type = Map[String, DirectedGraph[String]] - val immutableEmpty: ImmutableType = Map.empty - val mutableEmpty: MutableType = mutable.Map.empty[String, DirectedGraph[String]] + val empty: Type = Map.empty val UNIVERSAL_BASIC = "universal-basic" // basic Universal dependencies val UNIVERSAL_ENHANCED = "universal-enhanced" // collapsed (or enhanced) Universal dependencies From 61d871dffc09ef2f748aed023c15e69d713e4729 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Mon, 26 May 2025 16:43:01 -0700 Subject: [PATCH 25/42] SeqView again --- .../org/clulab/scala/SeqView.scala | 2 +- .../org/clulab/scala/package.scala | 11 --- .../scala-2.13/org/clulab/scala/SeqView.scala | 2 +- .../scala-2.13/org/clulab/scala/package.scala | 11 --- .../org/clulab/odinstarter/OdinStarter3.scala | 67 ------------------- .../scala-3/org/clulab/scala/SeqView.scala | 2 +- .../scala-3/org/clulab/scala/package.scala | 11 --- .../scala/org/clulab/odin/impl/Taxonomy.scala | 2 +- .../org/clulab/sequences/LexiconNER.scala | 19 +++--- 9 files changed, 13 insertions(+), 114 deletions(-) delete mode 100644 library/src/main/scala-2.11_2.12/org/clulab/scala/package.scala delete mode 100644 library/src/main/scala-2.13/org/clulab/scala/package.scala delete mode 100644 library/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala delete mode 100644 library/src/main/scala-3/org/clulab/scala/package.scala diff --git 
a/library/src/main/scala-2.11_2.12/org/clulab/scala/SeqView.scala b/library/src/main/scala-2.11_2.12/org/clulab/scala/SeqView.scala index c49d930cb..649887166 100644 --- a/library/src/main/scala-2.11_2.12/org/clulab/scala/SeqView.scala +++ b/library/src/main/scala-2.11_2.12/org/clulab/scala/SeqView.scala @@ -1,5 +1,5 @@ package org.clulab.scala object SeqView { - type Immutable[T] = scala.collection.SeqView[T, Seq[T]] + type Type[T] = scala.collection.SeqView[T, Seq[T]] } diff --git a/library/src/main/scala-2.11_2.12/org/clulab/scala/package.scala b/library/src/main/scala-2.11_2.12/org/clulab/scala/package.scala deleted file mode 100644 index a6a43654c..000000000 --- a/library/src/main/scala-2.11_2.12/org/clulab/scala/package.scala +++ /dev/null @@ -1,11 +0,0 @@ -package org.clulab - -import _root_.scala.{BufferedIterator => GenericBufferedIterator} -import _root_.scala.collection.immutable.{Stream => ImmutableStream} - -package object scala { - type BufferedIterator[T] = GenericBufferedIterator[T] - - type LazyList[T] = ImmutableStream[T] - val LazyList = ImmutableStream -} diff --git a/library/src/main/scala-2.13/org/clulab/scala/SeqView.scala b/library/src/main/scala-2.13/org/clulab/scala/SeqView.scala index d55c09e97..e227e7cbb 100644 --- a/library/src/main/scala-2.13/org/clulab/scala/SeqView.scala +++ b/library/src/main/scala-2.13/org/clulab/scala/SeqView.scala @@ -1,5 +1,5 @@ package org.clulab.scala object SeqView { - type Immutable[T] = scala.collection.View[T] + type Type[T] = scala.collection.View[T] } diff --git a/library/src/main/scala-2.13/org/clulab/scala/package.scala b/library/src/main/scala-2.13/org/clulab/scala/package.scala deleted file mode 100644 index 8df18bbdf..000000000 --- a/library/src/main/scala-2.13/org/clulab/scala/package.scala +++ /dev/null @@ -1,11 +0,0 @@ -package org.clulab - -import _root_.scala.collection.{BufferedIterator => GenericBufferedIterator} -import _root_.scala.collection.immutable.{LazyList => ImmutableLazyList} - -package object scala { - type BufferedIterator[T] = GenericBufferedIterator[T] - - type LazyList[T] = ImmutableLazyList[T] - val LazyList = ImmutableLazyList -} diff --git a/library/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala b/library/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala deleted file mode 100644 index a1332bf6d..000000000 --- a/library/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala +++ /dev/null @@ -1,67 +0,0 @@ -package org.clulab.odinstarter - -import org.clulab.odin.ExtractorEngine -import org.clulab.odin.Mention -import org.clulab.processors.clu.BalaurProcessor -import org.clulab.sequences.LexiconNER -import org.clulab.utils.FileUtils - -import java.io.File - -object OdinStarter3: - - // From sbt use "runMain org.clulab.odinstarter.main". - @main def main() = - // When using an IDE rather than sbt, make sure the working directory for the run - // configuration is the subproject directory so that this resourceDir is accessible. - val resourceDir: File = new File("./src/main/resources") - val customLexiconNer = // i.e., Named Entity Recognizer - val kbsAndCaseInsensitiveMatchings: Seq[(String, Boolean)] = Seq( - // You can add additional kbs (knowledge bases) and caseInsensitiveMatchings here. 
- ("org/clulab/odinstarter/FOOD.tsv", true) // , - // ("org/clulab/odinstarter/RESTAURANTS.tsv", false) - ) - val kbs = kbsAndCaseInsensitiveMatchings.map(_._1) - val caseInsensitiveMatchings = kbsAndCaseInsensitiveMatchings.map(_._2) - val isLocal = kbs.forall(new File(resourceDir, _).exists) - val baseDirOpt = if isLocal then Some(resourceDir) else None - - LexiconNER(kbs, caseInsensitiveMatchings, baseDirOpt) - val processor = new BalaurProcessor(lexiconNerOpt = Some(customLexiconNer)) - val extractorEngine = - val masterResource = "/org/clulab/odinstarter/main.yml" - // We usually want to reload rules during development, - // so we try to load them from the filesystem first, then jar. - // The resource must start with /, but the file probably shouldn't. - val masterFile = new File(resourceDir, masterResource.drop(1)) - - if masterFile.exists then - // Read rules from file in filesystem. - val rules = FileUtils.getTextFromFile(masterFile) - ExtractorEngine(rules, ruleDir = Some(resourceDir)) - else - // Read rules from resource in jar. - val rules = FileUtils.getTextFromResource(masterResource) - ExtractorEngine(rules, ruleDir = None) - val document = processor.annotate("John eats cake.") - val mentions = extractorEngine.extractFrom(document).sortBy(_.arguments.size) - - for mention <- mentions - do printMention(mention) - - def printMention(mention: Mention, nameOpt: Option[String] = None, depth: Int = 0): Unit = - val indent = " " * depth - val name = nameOpt.getOrElse("") - val labels = mention.labels - val words = mention.sentenceObj.words - val tokens = mention.tokenInterval.map(mention.sentenceObj.words) - - println(indent + " Name: " + name) - println(indent + " Labels: " + labels.mkString(" ")) - println(indent + " Sentence: " + words.mkString(" ")) - println(indent + " Tokens: " + tokens.mkString(" ")) - if mention.arguments.nonEmpty then - println(indent + "Arguments:") - for (name, mentions) <- mention.arguments; mention <- mentions - do printMention(mention, Some(name), depth + 1) - println() diff --git a/library/src/main/scala-3/org/clulab/scala/SeqView.scala b/library/src/main/scala-3/org/clulab/scala/SeqView.scala index d55c09e97..e227e7cbb 100644 --- a/library/src/main/scala-3/org/clulab/scala/SeqView.scala +++ b/library/src/main/scala-3/org/clulab/scala/SeqView.scala @@ -1,5 +1,5 @@ package org.clulab.scala object SeqView { - type Immutable[T] = scala.collection.View[T] + type Type[T] = scala.collection.View[T] } diff --git a/library/src/main/scala-3/org/clulab/scala/package.scala b/library/src/main/scala-3/org/clulab/scala/package.scala deleted file mode 100644 index 8df18bbdf..000000000 --- a/library/src/main/scala-3/org/clulab/scala/package.scala +++ /dev/null @@ -1,11 +0,0 @@ -package org.clulab - -import _root_.scala.collection.{BufferedIterator => GenericBufferedIterator} -import _root_.scala.collection.immutable.{LazyList => ImmutableLazyList} - -package object scala { - type BufferedIterator[T] = GenericBufferedIterator[T] - - type LazyList[T] = ImmutableLazyList[T] - val LazyList = ImmutableLazyList -} diff --git a/library/src/main/scala/org/clulab/odin/impl/Taxonomy.scala b/library/src/main/scala/org/clulab/odin/impl/Taxonomy.scala index 3afe9794a..96c3d2e57 100644 --- a/library/src/main/scala/org/clulab/odin/impl/Taxonomy.scala +++ b/library/src/main/scala/org/clulab/odin/impl/Taxonomy.scala @@ -1,7 +1,7 @@ package org.clulab.odin.impl -import org.clulab.scala.LazyList import java.util.{ Collection, Map => JMap } +import 
scala.collection.compat.immutable.LazyList import scala.jdk.CollectionConverters._ class Taxonomy(parents: Map[String, String]) { diff --git a/library/src/main/scala/org/clulab/sequences/LexiconNER.scala b/library/src/main/scala/org/clulab/sequences/LexiconNER.scala index 24d0b143f..688f196b6 100644 --- a/library/src/main/scala/org/clulab/sequences/LexiconNER.scala +++ b/library/src/main/scala/org/clulab/sequences/LexiconNER.scala @@ -6,7 +6,6 @@ import org.clulab.scala.WrappedArray._ import org.clulab.struct.{EntityValidator, TrueEntityValidator} import java.io.File -import scala.collection.mutable /** * The abstract base class for several concrete child classes used for Named Entity @@ -74,31 +73,31 @@ abstract class LexiconNER(val knownCaseInsensitives: Set[String], val useLemmas: } } - def hasCondition(wordsView: SeqView.Immutable[String], condition: Char => Boolean): Boolean = + def hasCondition(wordsView: SeqView.Type[String], condition: Char => Boolean): Boolean = wordsView.exists(_.exists(condition)) - def hasLetter(wordsView: SeqView.Immutable[String]): Boolean = + def hasLetter(wordsView: SeqView.Type[String]): Boolean = hasCondition(wordsView, Character.isLetter) - def hasDigit(wordsView: SeqView.Immutable[String]): Boolean = + def hasDigit(wordsView: SeqView.Type[String]): Boolean = hasCondition(wordsView, Character.isDigit) - def hasUpperCaseLetters(wordsView: SeqView.Immutable[String]): Boolean = + def hasUpperCaseLetters(wordsView: SeqView.Type[String]): Boolean = hasCondition(wordsView, Character.isUpperCase) - def hasSpace(wordsView: SeqView.Immutable[String]): Boolean = wordsView.size > 1 + def hasSpace(wordsView: SeqView.Type[String]): Boolean = wordsView.size > 1 - def countCharacters(wordsView: SeqView.Immutable[String]): Int = + def countCharacters(wordsView: SeqView.Type[String]): Int = // Go ahead and calculate them all even though we only need to know if they exceed a value. wordsView.foldLeft(0) { (sum, word) => sum + word.length } - val contentQualifiers: Array[SeqView.Immutable[String] => Boolean] = Array( + val contentQualifiers: Array[SeqView.Type[String] => Boolean] = Array( // Start with the quick and easy ones. 
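The SeqView.Type alias used in these signatures is the cross-building trick the rename preserves: each version-specific source directory defines the same alias name over whatever view type that Scala version provides, so shared code such as LexiconNER compiles unchanged everywhere. In outline, from the three files in this patch:

    // scala-2.11_2.12
    object SeqView { type Type[T] = scala.collection.SeqView[T, Seq[T]] }
    // scala-2.13 and scala-3
    object SeqView { type Type[T] = scala.collection.View[T] }
    // shared code, version-agnostic
    def hasLetter(wordsView: SeqView.Type[String]): Boolean =
      wordsView.exists(_.exists(Character.isLetter))

The qualifier list continues below.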
hasSpace, - { (wordsView: SeqView.Immutable[String]) => countCharacters(wordsView) > LexiconNER.KNOWN_CASE_INSENSITIVE_LENGTH }, + { (wordsView: SeqView.Type[String]) => countCharacters(wordsView) > LexiconNER.KNOWN_CASE_INSENSITIVE_LENGTH }, hasDigit, hasUpperCaseLetters, - { (wordsView: SeqView.Immutable[String]) => knownCaseInsensitives.contains(wordsView.head) } + { (wordsView: SeqView.Type[String]) => knownCaseInsensitives.contains(wordsView.head) } ) protected def contentfulSpan(sentence: Sentence, start: Int, length: Int): Boolean = { From e9979eabcc854752c76ac8cc400fc4a67aa68a18 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Mon, 26 May 2025 16:51:47 -0700 Subject: [PATCH 26/42] Remove spaces --- build.sbt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/build.sbt b/build.sbt index 7a7df39bf..69a37d8d2 100644 --- a/build.sbt +++ b/build.sbt @@ -34,19 +34,19 @@ lazy val library = project lazy val apps = project .dependsOn(library % "compile -> compile; test -> test") - lazy val webapp = project - .enablePlugins(PlayScala) - .dependsOn(library % "compile -> compile; test -> test") - .settings( +lazy val webapp = project + .enablePlugins(PlayScala) + .dependsOn(library % "compile -> compile; test -> test") + .settings( // scala3 doesn't have play (for 2.8.19 as specified by the project) and is ruled out completely. // scala213 has version problems for com.fasterxml.jackson.databind.JsonMappingException. // scala212 works! // scala211 isn't compiling and complains on twirlCompileTemplates. // This isn't a library. Only one version needs to work. We shouldn't use play for this anyway. - crossScalaVersions := Seq(scala212) - ) + crossScalaVersions := Seq(scala212) + ) lazy val debugger = project - .dependsOn(library % "compile -> compile; test -> test") + .dependsOn(library % "compile -> compile; test -> test") addCommandAlias("dockerizeWebapp", ";webapp/docker:publishLocal") From 9c80f426278f8ff0f9407f9d87b25b95fb709dd8 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Mon, 26 May 2025 17:01:04 -0700 Subject: [PATCH 27/42] Update sbt again --- project/build.properties | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/project/build.properties b/project/build.properties index 4c19fc197..29f5dd953 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1,9 +1,9 @@ -# This was last checked on 2025-05-09. +# This was last checked on 2025-05-26. # Version 1.7.2+ will cause problems when combined with the play plug-in used for the webapp! # [error] * org.scala-lang.modules:scala-xml_2.12:2.1.0 (early-semver) is selected over {1.2.0, 1.1.1} # [error] +- org.scala-lang:scala-compiler:2.12.17 (depends on 2.1.0) # [error] +- com.typesafe.sbt:sbt-native-packager:1.5.2 (scalaVersion=2.12, sbtVersion=1.0) (depends on 1.1.1) # [error] +- com.typesafe.play:twirl-api_2.12:1.5.1 (depends on 1.2.0) # This error is solved by adding a VersionScheme.Always to plugins.sbt. 
-# up to 1.10.11
-sbt.version = 1.10.11
+# up to 1.11.0
+sbt.version = 1.11.0

From 70b031b54f9b2fa0ba065aa4569bb5f156d8bf31 Mon Sep 17 00:00:00 2001
From: Keith Alcock
Date: Mon, 26 May 2025 17:20:24 -0700
Subject: [PATCH 28/42] Fix a toSeq

---
 .../org/clulab/processors/apps/CommandLineInterface.scala | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/apps/src/main/scala/org/clulab/processors/apps/CommandLineInterface.scala b/apps/src/main/scala/org/clulab/processors/apps/CommandLineInterface.scala
index 0e84c662d..c0303f6ca 100644
--- a/apps/src/main/scala/org/clulab/processors/apps/CommandLineInterface.scala
+++ b/apps/src/main/scala/org/clulab/processors/apps/CommandLineInterface.scala
@@ -3,7 +3,7 @@ package org.clulab.processors.apps
 import org.clulab.processors.Document
 import org.clulab.processors.clu.BalaurProcessor
 import org.clulab.serialization.CoNLLUSerializer
-import org.clulab.utils.{FileUtils, StringUtils}
+import org.clulab.utils.{FileUtils, StringUtils, WrappedArraySeq}
 
 import java.io.PrintWriter
 import scala.util.Using
@@ -36,7 +36,11 @@ object CommandLineInterface extends App {
   }
   else if(props.containsKey(TOKENS)) { // one sentence per line; sentences are tokenized
     val sents = FileUtils.getLinesFromFile(props.getProperty(INPUT))
-    val tokenizedSents = sents.map(_.split("\\s+").toSeq)
+    val tokenizedSents = sents.map { sent =>
+      val tokens = sent.split("\\s+")
+
+      WrappedArraySeq(tokens).toImmutableSeq
+    }
     proc.annotateFromTokens(tokenizedSents)
   }
   else { // assume raw text

From de0041f73349eec040412e2df7d222b61a62db4c Mon Sep 17 00:00:00 2001
From: Keith Alcock
Date: Mon, 26 May 2025 18:03:16 -0700
Subject: [PATCH 29/42] Account for immutable doc in some tests

---
 .../apps/NumericEntityRecognizerShell.scala        |  4 +++-
 .../scala/org/clulab/numeric/EvalTimeNorm.scala    | 17 +++++++++++------
 .../clulab/numeric/TestSeasonNormalizer.scala      |  4 ++--
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala b/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala
index c77688a54..d4ddd3cb4 100644
--- a/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala
+++ b/apps/src/main/scala/org/clulab/processors/apps/NumericEntityRecognizerShell.scala
@@ -34,9 +34,11 @@ class NumericEntityRecognizerShell(ruleDirOpt: Option[String]) extends Reloadabl
   /** The actual work, including printing out the output */
   def work(text: String): Unit = {
     val doc = proc.get.annotate(text)
+    // This gets the same numericEntityRecognizer already used in the annotation
+    // so that the mentions, which were thrown away, can be recalculated.
     val mentions = proc.get.numericEntityRecognizerOpt.map(_.extractFrom(doc)).getOrElse(Seq.empty)
-    NumericUtils.mkLabelsAndNorms(doc, mentions)
+    // The doc should already have been annotated two lines above.
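The same re-derivation pattern recurs throughout patch 29: because annotate now returns an immutable document, numeric mentions are no longer stashed on the doc as a side effect, so callers recompute them and receive labels and norms as return values. Roughly, assuming a BalaurProcessor whose numericEntityRecognizerOpt is populated:

    val doc = processor.annotate(text)
    val mentions = processor.numericEntityRecognizerOpt
      .map(_.extractFrom(doc))
      .getOrElse(Seq.empty)
    // mkLabelsAndNorms now returns its results instead of mutating doc.
    val (entities, norms) = NumericUtils.mkLabelsAndNorms(doc, mentions)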
NumericUtils.displayMentions(mentions, doc) } diff --git a/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala b/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala index 08acac195..db074c570 100644 --- a/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala +++ b/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala @@ -2,6 +2,7 @@ package org.clulab.numeric import org.clulab.numeric.mentions.Norm import org.clulab.processors.Processor +import org.clulab.processors.clu.BalaurProcessor import java.nio.charset.StandardCharsets import scala.io.Source @@ -9,8 +10,11 @@ import scala.util.Using object EvalTimeNorm { - def runEval(proc: Processor, ner: NumericEntityRecognizer, - testFile: String): Double = { + def runEval( + proc: Processor, + ner: NumericEntityRecognizer, + testFile: String + ): Double = { val timeNormEvalDir = "/org/clulab/numeric/TimeNormEvalSet" val goldStream = getClass.getResourceAsStream(s"$timeNormEvalDir/$testFile") val goldLines = Source.fromInputStream(goldStream).getLines() @@ -34,8 +38,9 @@ object EvalTimeNorm { } val doc = proc.annotate(docText) val mentions = ner.extractFrom(doc) - NumericUtils.mkLabelsAndNorms(doc, mentions) - val prediction = mentions.collect{ + // The following line does not change the document. + // NumericUtils.mkLabelsAndNorms(doc, mentions) + val prediction = mentions.collect{ case m: Norm if m.neLabel.equals("DATE") || m.neLabel.equals("DATE-RANGE") => (m.startOffset.toString, m.endOffset.toString, m.neNorm) }.toSet @@ -53,8 +58,8 @@ object EvalTimeNorm { fscore } - def run(proc: Processor): Double = { - val ner = NumericEntityRecognizer() + def run(proc: BalaurProcessor): Double = { + val ner = proc.numericEntityRecognizerOpt.get test(proc, ner) } diff --git a/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala b/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala index 93db9fa4d..423bd3eb7 100644 --- a/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala +++ b/library/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala @@ -17,8 +17,8 @@ class TestSeasonNormalizer extends Test { val document = processor.annotate(text) val mentions = processor.numericEntityRecognizerOpt.get.extractFrom(document) - NumericUtils.mkLabelsAndNorms(document, mentions) - (document.sentences.head.entities.get, document.sentences.head.norms.get) + val (entities, norms) = NumericUtils.mkLabelsAndNorms(document, mentions) + (entities.head, norms.head) } behavior of "Default seasonal BalaurProcessor" From db9b5e5e2dca956931542963098051a777d0abb1 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Mon, 26 May 2025 18:28:24 -0700 Subject: [PATCH 30/42] Move evaluation resources to app --- .../CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16 | 0 .../EAST_AFRICA_Seasonal_Monitor_5-Jun-17 | 0 .../EA_Seasonal_Monitor_Aug-17 | 0 .../Enhancing_Food_Security_in_South_Sudan_Nov-15 | 0 .../Ethiopia_Food_Security_Outlook_1-Feb-17 | 0 .../FAO_GIEWS_South_Sudan_Country_Brief_Sep-17 | 0 .../FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17 | 0 .../FEWS_NET_South_Sudan_Outlook_Jan-18 | 0 .../FFP_Fact_Sheet_South_Sudan_Jan-18 | 0 ...lace_Hundreds_In_War-torn_In_South_Sudan_Sep-17 | 0 .../Food_Assistance_Outlook_Brief_1-Jan-18 | 0 .../Price_Watch_28-Feb-18/Price_Watch_28-Feb-18 | 0 .../South_Sudan_Humanitarian_Response_Plan_Jan-18 | 0 .../South_Sudanese_Risk_Facing_Famine_Jan-18 | 0 .../TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14 | 0 ...F_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17 | 0 
...pia_Drought_Emergency_Situation_Report_5_Jul-17 | 0 .../WorldModelersDatesRangesTimex.csv | 0 .../clulab/processors/apps/EvalTimeNormApp.scala | 4 +++- .../scala/org/clulab/numeric/EvalTimeNorm.scala | 14 +++++--------- 20 files changed, 8 insertions(+), 10 deletions(-) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EAST_AFRICA_Seasonal_Monitor_5-Jun-17/EAST_AFRICA_Seasonal_Monitor_5-Jun-17 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EA_Seasonal_Monitor_Aug-17/EA_Seasonal_Monitor_Aug-17 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Enhancing_Food_Security_in_South_Sudan_Nov-15/Enhancing_Food_Security_in_South_Sudan_Nov-15 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Ethiopia_Food_Security_Outlook_1-Feb-17/Ethiopia_Food_Security_Outlook_1-Feb-17 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Outlook_Jan-18/FEWS_NET_South_Sudan_Outlook_Jan-18 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FFP_Fact_Sheet_South_Sudan_Jan-18/FFP_Fact_Sheet_South_Sudan_Jan-18 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Food_Assistance_Outlook_Brief_1-Jan-18/Food_Assistance_Outlook_Brief_1-Jan-18 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Price_Watch_28-Feb-18/Price_Watch_28-Feb-18 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudan_Humanitarian_Response_Plan_Jan-18/South_Sudan_Humanitarian_Response_Plan_Jan-18 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudanese_Risk_Facing_Famine_Jan-18/South_Sudanese_Risk_Facing_Famine_Jan-18 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17 (100%) rename {library => apps}/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WorldModelersDatesRangesTimex.csv (100%) diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16 
b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16 rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EAST_AFRICA_Seasonal_Monitor_5-Jun-17/EAST_AFRICA_Seasonal_Monitor_5-Jun-17 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EAST_AFRICA_Seasonal_Monitor_5-Jun-17/EAST_AFRICA_Seasonal_Monitor_5-Jun-17 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EAST_AFRICA_Seasonal_Monitor_5-Jun-17/EAST_AFRICA_Seasonal_Monitor_5-Jun-17 rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EAST_AFRICA_Seasonal_Monitor_5-Jun-17/EAST_AFRICA_Seasonal_Monitor_5-Jun-17 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EA_Seasonal_Monitor_Aug-17/EA_Seasonal_Monitor_Aug-17 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EA_Seasonal_Monitor_Aug-17/EA_Seasonal_Monitor_Aug-17 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EA_Seasonal_Monitor_Aug-17/EA_Seasonal_Monitor_Aug-17 rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EA_Seasonal_Monitor_Aug-17/EA_Seasonal_Monitor_Aug-17 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Enhancing_Food_Security_in_South_Sudan_Nov-15/Enhancing_Food_Security_in_South_Sudan_Nov-15 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Enhancing_Food_Security_in_South_Sudan_Nov-15/Enhancing_Food_Security_in_South_Sudan_Nov-15 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Enhancing_Food_Security_in_South_Sudan_Nov-15/Enhancing_Food_Security_in_South_Sudan_Nov-15 rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Enhancing_Food_Security_in_South_Sudan_Nov-15/Enhancing_Food_Security_in_South_Sudan_Nov-15 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Ethiopia_Food_Security_Outlook_1-Feb-17/Ethiopia_Food_Security_Outlook_1-Feb-17 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Ethiopia_Food_Security_Outlook_1-Feb-17/Ethiopia_Food_Security_Outlook_1-Feb-17 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Ethiopia_Food_Security_Outlook_1-Feb-17/Ethiopia_Food_Security_Outlook_1-Feb-17 rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Ethiopia_Food_Security_Outlook_1-Feb-17/Ethiopia_Food_Security_Outlook_1-Feb-17 diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17 similarity index 100% rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17 rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17 
diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17
similarity index 100%
rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17
rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17
diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Outlook_Jan-18/FEWS_NET_South_Sudan_Outlook_Jan-18 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Outlook_Jan-18/FEWS_NET_South_Sudan_Outlook_Jan-18
similarity index 100%
rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Outlook_Jan-18/FEWS_NET_South_Sudan_Outlook_Jan-18
rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Outlook_Jan-18/FEWS_NET_South_Sudan_Outlook_Jan-18
diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FFP_Fact_Sheet_South_Sudan_Jan-18/FFP_Fact_Sheet_South_Sudan_Jan-18 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FFP_Fact_Sheet_South_Sudan_Jan-18/FFP_Fact_Sheet_South_Sudan_Jan-18
similarity index 100%
rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FFP_Fact_Sheet_South_Sudan_Jan-18/FFP_Fact_Sheet_South_Sudan_Jan-18
rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FFP_Fact_Sheet_South_Sudan_Jan-18/FFP_Fact_Sheet_South_Sudan_Jan-18
diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17
similarity index 100%
rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17
rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17
diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Food_Assistance_Outlook_Brief_1-Jan-18/Food_Assistance_Outlook_Brief_1-Jan-18 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Food_Assistance_Outlook_Brief_1-Jan-18/Food_Assistance_Outlook_Brief_1-Jan-18
similarity index 100%
rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Food_Assistance_Outlook_Brief_1-Jan-18/Food_Assistance_Outlook_Brief_1-Jan-18
rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Food_Assistance_Outlook_Brief_1-Jan-18/Food_Assistance_Outlook_Brief_1-Jan-18
diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Price_Watch_28-Feb-18/Price_Watch_28-Feb-18 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Price_Watch_28-Feb-18/Price_Watch_28-Feb-18
similarity index 100%
rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Price_Watch_28-Feb-18/Price_Watch_28-Feb-18
rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Price_Watch_28-Feb-18/Price_Watch_28-Feb-18
diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudan_Humanitarian_Response_Plan_Jan-18/South_Sudan_Humanitarian_Response_Plan_Jan-18 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudan_Humanitarian_Response_Plan_Jan-18/South_Sudan_Humanitarian_Response_Plan_Jan-18
similarity index 100%
rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudan_Humanitarian_Response_Plan_Jan-18/South_Sudan_Humanitarian_Response_Plan_Jan-18
rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudan_Humanitarian_Response_Plan_Jan-18/South_Sudan_Humanitarian_Response_Plan_Jan-18
diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudanese_Risk_Facing_Famine_Jan-18/South_Sudanese_Risk_Facing_Famine_Jan-18 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudanese_Risk_Facing_Famine_Jan-18/South_Sudanese_Risk_Facing_Famine_Jan-18
similarity index 100%
rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudanese_Risk_Facing_Famine_Jan-18/South_Sudanese_Risk_Facing_Famine_Jan-18
rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudanese_Risk_Facing_Famine_Jan-18/South_Sudanese_Risk_Facing_Famine_Jan-18
diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14
similarity index 100%
rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14
rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14
diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17
similarity index 100%
rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17
rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17
diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17 b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17
similarity index 100%
rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17
rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17
diff --git a/library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WorldModelersDatesRangesTimex.csv b/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WorldModelersDatesRangesTimex.csv
similarity index 100%
rename from library/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WorldModelersDatesRangesTimex.csv
rename to apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WorldModelersDatesRangesTimex.csv
diff --git a/apps/src/main/scala/org/clulab/processors/apps/EvalTimeNormApp.scala b/apps/src/main/scala/org/clulab/processors/apps/EvalTimeNormApp.scala
index fdba5a609..8be6b6deb 100644
--- a/apps/src/main/scala/org/clulab/processors/apps/EvalTimeNormApp.scala
+++ b/apps/src/main/scala/org/clulab/processors/apps/EvalTimeNormApp.scala
@@ -5,6 +5,8 @@ import org.clulab.processors.clu.BalaurProcessor
 
 object EvalTimeNormApp extends App {
   val proc = new BalaurProcessor()
+  val timeNormEvalDir = "/org/clulab/numeric/TimeNormEvalSet"
+  val testFile = "WorldModelersDatesRangesTimex.csv"
 
-  EvalTimeNorm.run(proc)
+  EvalTimeNorm.run(proc, timeNormEvalDir, testFile)
 }
diff --git a/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala b/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala
index db074c570..9804190fe 100644
--- a/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala
+++ b/library/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala
@@ -12,10 +12,10 @@ object EvalTimeNorm {
 
   def runEval(
       proc: Processor,
-      ner: NumericEntityRecognizer,
-      testFile: String
+      timeNormEvalDir: String,
+      testFile: String,
+      ner: NumericEntityRecognizer
   ): Double = {
-    val timeNormEvalDir = "/org/clulab/numeric/TimeNormEvalSet"
     val goldStream = getClass.getResourceAsStream(s"$timeNormEvalDir/$testFile")
     val goldLines = Source.fromInputStream(goldStream).getLines()
     // Build a Map with the gold time expressions.
@@ -58,13 +58,9 @@ object EvalTimeNorm {
     fscore
   }
 
-  def run(proc: BalaurProcessor): Double = {
+  def run(proc: BalaurProcessor, timeNormEvalDir: String, testFile: String): Double = {
     val ner = proc.numericEntityRecognizerOpt.get
-    test(proc, ner)
-  }
-
-  def test(proc: Processor, ner: NumericEntityRecognizer): Double = {
-    runEval(proc, ner, "WorldModelersDatesRangesTimex.csv")
+    runEval(proc, timeNormEvalDir, testFile, ner)
   }
 }

From 7239f2b1b0e9b4f1afa76298694f5a55b0e1e814 Mon Sep 17 00:00:00 2001
From: Keith Alcock
Date: Mon, 26 May 2025 19:23:45 -0700
Subject: [PATCH 31/42] Fix test

---
 .../org/clulab/processors/apps/EvalTimeNormApp.scala  | 12 ------------
 .../CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16      |  0
 .../EAST_AFRICA_Seasonal_Monitor_5-Jun-17             |  0
 .../EA_Seasonal_Monitor_Aug-17                        |  0
 .../Enhancing_Food_Security_in_South_Sudan_Nov-15     |  0
 .../Ethiopia_Food_Security_Outlook_1-Feb-17           |  0
 .../FAO_GIEWS_South_Sudan_Country_Brief_Sep-17        |  0
 .../FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17     |  0
 .../FEWS_NET_South_Sudan_Outlook_Jan-18               |  0
 .../FFP_Fact_Sheet_South_Sudan_Jan-18                 |  0
 ...splace_Hundreds_In_War-torn_In_South_Sudan_Sep-17  |  0
 .../Food_Assistance_Outlook_Brief_1-Jan-18            |  0
 .../Price_Watch_28-Feb-18/Price_Watch_28-Feb-18       |  0
 .../South_Sudan_Humanitarian_Response_Plan_Jan-18     |  0
 .../South_Sudanese_Risk_Facing_Famine_Jan-18          |  0
 .../TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14          |  0
 ...CEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17  |  0
 ...iopia_Drought_Emergency_Situation_Report_5_Jul-17  |  0
 .../WorldModelersDatesRangesTimex.csv                 |  0
 .../scala/org/clulab/numeric/TestEvalTimeNorm.scala   | 11 ++++++-----
 20 files changed, 6 insertions(+), 17 deletions(-)
 delete mode 100644 apps/src/main/scala/org/clulab/processors/apps/EvalTimeNormApp.scala
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/EAST_AFRICA_Seasonal_Monitor_5-Jun-17/EAST_AFRICA_Seasonal_Monitor_5-Jun-17 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/EA_Seasonal_Monitor_Aug-17/EA_Seasonal_Monitor_Aug-17 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/Enhancing_Food_Security_in_South_Sudan_Nov-15/Enhancing_Food_Security_in_South_Sudan_Nov-15 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/Ethiopia_Food_Security_Outlook_1-Feb-17/Ethiopia_Food_Security_Outlook_1-Feb-17 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Outlook_Jan-18/FEWS_NET_South_Sudan_Outlook_Jan-18 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/FFP_Fact_Sheet_South_Sudan_Jan-18/FFP_Fact_Sheet_South_Sudan_Jan-18 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/Food_Assistance_Outlook_Brief_1-Jan-18/Food_Assistance_Outlook_Brief_1-Jan-18 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/Price_Watch_28-Feb-18/Price_Watch_28-Feb-18 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudan_Humanitarian_Response_Plan_Jan-18/South_Sudan_Humanitarian_Response_Plan_Jan-18 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudanese_Risk_Facing_Famine_Jan-18/South_Sudanese_Risk_Facing_Famine_Jan-18 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17 (100%)
 rename {apps/src/main => library/src/test}/resources/org/clulab/numeric/TimeNormEvalSet/WorldModelersDatesRangesTimex.csv (100%)

diff --git a/apps/src/main/scala/org/clulab/processors/apps/EvalTimeNormApp.scala b/apps/src/main/scala/org/clulab/processors/apps/EvalTimeNormApp.scala
deleted file mode 100644
index 8be6b6deb..000000000
--- a/apps/src/main/scala/org/clulab/processors/apps/EvalTimeNormApp.scala
+++ /dev/null
@@ -1,12 +0,0 @@
-package org.clulab.processors.apps
-
-import org.clulab.numeric.EvalTimeNorm
-import org.clulab.processors.clu.BalaurProcessor
-
-object EvalTimeNormApp extends App {
-  val proc = new BalaurProcessor()
-  val timeNormEvalDir = "/org/clulab/numeric/TimeNormEvalSet"
-  val testFile = "WorldModelersDatesRangesTimex.csv"
-
-  EvalTimeNorm.run(proc, timeNormEvalDir, testFile)
-}
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16/CLiMIS_FAO_UNICEF_WFP_South_Sudan_IPC_Jun-16
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EAST_AFRICA_Seasonal_Monitor_5-Jun-17/EAST_AFRICA_Seasonal_Monitor_5-Jun-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/EAST_AFRICA_Seasonal_Monitor_5-Jun-17/EAST_AFRICA_Seasonal_Monitor_5-Jun-17
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EAST_AFRICA_Seasonal_Monitor_5-Jun-17/EAST_AFRICA_Seasonal_Monitor_5-Jun-17
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/EAST_AFRICA_Seasonal_Monitor_5-Jun-17/EAST_AFRICA_Seasonal_Monitor_5-Jun-17
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EA_Seasonal_Monitor_Aug-17/EA_Seasonal_Monitor_Aug-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/EA_Seasonal_Monitor_Aug-17/EA_Seasonal_Monitor_Aug-17
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/EA_Seasonal_Monitor_Aug-17/EA_Seasonal_Monitor_Aug-17
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/EA_Seasonal_Monitor_Aug-17/EA_Seasonal_Monitor_Aug-17
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Enhancing_Food_Security_in_South_Sudan_Nov-15/Enhancing_Food_Security_in_South_Sudan_Nov-15 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Enhancing_Food_Security_in_South_Sudan_Nov-15/Enhancing_Food_Security_in_South_Sudan_Nov-15
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Enhancing_Food_Security_in_South_Sudan_Nov-15/Enhancing_Food_Security_in_South_Sudan_Nov-15
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Enhancing_Food_Security_in_South_Sudan_Nov-15/Enhancing_Food_Security_in_South_Sudan_Nov-15
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Ethiopia_Food_Security_Outlook_1-Feb-17/Ethiopia_Food_Security_Outlook_1-Feb-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Ethiopia_Food_Security_Outlook_1-Feb-17/Ethiopia_Food_Security_Outlook_1-Feb-17
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Ethiopia_Food_Security_Outlook_1-Feb-17/Ethiopia_Food_Security_Outlook_1-Feb-17
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Ethiopia_Food_Security_Outlook_1-Feb-17/Ethiopia_Food_Security_Outlook_1-Feb-17
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17/FAO_GIEWS_South_Sudan_Country_Brief_Sep-17
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17/FEWS_NET_South_Sudan_Famine_Risk_Alert_Jan-17
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Outlook_Jan-18/FEWS_NET_South_Sudan_Outlook_Jan-18 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Outlook_Jan-18/FEWS_NET_South_Sudan_Outlook_Jan-18
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Outlook_Jan-18/FEWS_NET_South_Sudan_Outlook_Jan-18
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FEWS_NET_South_Sudan_Outlook_Jan-18/FEWS_NET_South_Sudan_Outlook_Jan-18
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FFP_Fact_Sheet_South_Sudan_Jan-18/FFP_Fact_Sheet_South_Sudan_Jan-18 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FFP_Fact_Sheet_South_Sudan_Jan-18/FFP_Fact_Sheet_South_Sudan_Jan-18
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/FFP_Fact_Sheet_South_Sudan_Jan-18/FFP_Fact_Sheet_South_Sudan_Jan-18
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/FFP_Fact_Sheet_South_Sudan_Jan-18/FFP_Fact_Sheet_South_Sudan_Jan-18
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17/Floods_Displace_Hundreds_In_War-torn_In_South_Sudan_Sep-17
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Food_Assistance_Outlook_Brief_1-Jan-18/Food_Assistance_Outlook_Brief_1-Jan-18 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Food_Assistance_Outlook_Brief_1-Jan-18/Food_Assistance_Outlook_Brief_1-Jan-18
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Food_Assistance_Outlook_Brief_1-Jan-18/Food_Assistance_Outlook_Brief_1-Jan-18
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Food_Assistance_Outlook_Brief_1-Jan-18/Food_Assistance_Outlook_Brief_1-Jan-18
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Price_Watch_28-Feb-18/Price_Watch_28-Feb-18 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Price_Watch_28-Feb-18/Price_Watch_28-Feb-18
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/Price_Watch_28-Feb-18/Price_Watch_28-Feb-18
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/Price_Watch_28-Feb-18/Price_Watch_28-Feb-18
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudan_Humanitarian_Response_Plan_Jan-18/South_Sudan_Humanitarian_Response_Plan_Jan-18 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudan_Humanitarian_Response_Plan_Jan-18/South_Sudan_Humanitarian_Response_Plan_Jan-18
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudan_Humanitarian_Response_Plan_Jan-18/South_Sudan_Humanitarian_Response_Plan_Jan-18
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudan_Humanitarian_Response_Plan_Jan-18/South_Sudan_Humanitarian_Response_Plan_Jan-18
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudanese_Risk_Facing_Famine_Jan-18/South_Sudanese_Risk_Facing_Famine_Jan-18 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudanese_Risk_Facing_Famine_Jan-18/South_Sudanese_Risk_Facing_Famine_Jan-18
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudanese_Risk_Facing_Famine_Jan-18/South_Sudanese_Risk_Facing_Famine_Jan-18
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/South_Sudanese_Risk_Facing_Famine_Jan-18/South_Sudanese_Risk_Facing_Famine_Jan-18
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14/TECHNICAL_BRIEF_(RE)ASSESSING_THE_Oct-14
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17/UNICEF_ETHIOPIA_HUMANITARIAN_SITUATION_REPORT_Apr-17
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17 b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17/WFP_Ethiopia_Drought_Emergency_Situation_Report_5_Jul-17
diff --git a/apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WorldModelersDatesRangesTimex.csv b/library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/WorldModelersDatesRangesTimex.csv
similarity index 100%
rename from apps/src/main/resources/org/clulab/numeric/TimeNormEvalSet/WorldModelersDatesRangesTimex.csv
rename to library/src/test/resources/org/clulab/numeric/TimeNormEvalSet/WorldModelersDatesRangesTimex.csv
diff --git a/library/src/test/scala/org/clulab/numeric/TestEvalTimeNorm.scala b/library/src/test/scala/org/clulab/numeric/TestEvalTimeNorm.scala
index bc22534cd..b5575b1b3 100644
--- a/library/src/test/scala/org/clulab/numeric/TestEvalTimeNorm.scala
+++ b/library/src/test/scala/org/clulab/numeric/TestEvalTimeNorm.scala
@@ -8,12 +8,13 @@ class TestEvalTimeNorm extends Test {
   behavior of "temporal parser"
 
   it should "not degrade in performance" in {
+    val timeNormEvalDir = "/org/clulab/numeric/TimeNormEvalSet"
+    val testFile = "WorldModelersDatesRangesTimex.csv"
+    val seasonPath = "/org/clulab/numeric/custom/SEASON.tsv"
     val expectedFscore = 0.85
-    val proc = new BalaurProcessor(seasonPathOpt = Some("/org/clulab/numeric/custom/SEASON.tsv"))
-    val ner = NumericEntityRecognizer(seasonPath = "/org/clulab/numeric/custom/SEASON.tsv")
-    val actualFscore = EvalTimeNorm.test(proc, ner)
+    val proc = new BalaurProcessor(seasonPathOpt = Some(seasonPath))
+    val actualFscore = EvalTimeNorm.run(proc, timeNormEvalDir, testFile)
+
     actualFscore should be >= expectedFscore
   }
-}
-

From 8b1c2f3bda9857dd8faec899ca9d8a54770ff84b Mon Sep 17 00:00:00 2001
From: Keith Alcock
Date: Tue, 27 May 2025 09:42:41 -0700
Subject: [PATCH 32/42] Make DocumentAttachments immutable

---
 .../org/clulab/processors/Document.scala      | 45 +++++---------
 .../org/clulab/processors/Processor.scala     | 22 ++++---
 .../serialization/DocumentSerializer.scala    | 10 ++--
 .../serialization/json/JSONSerializer.scala   | 16 ++---
 .../clulab/serialization/json/package.scala   |  5 +-
 .../struct/TestDocumentAttachment.scala       | 60 ++++++++++---------
 6 files changed, 71 insertions(+), 87 deletions(-)

diff --git a/library/src/main/scala/org/clulab/processors/Document.scala b/library/src/main/scala/org/clulab/processors/Document.scala
index 34db68688..7dd9bcfd3 100644
--- a/library/src/main/scala/org/clulab/processors/Document.scala
+++ b/library/src/main/scala/org/clulab/processors/Document.scala
@@ -8,7 +8,6 @@ import org.json4s.JValue
 import org.json4s.jackson.prettyJson
 
 import java.io.PrintWriter
-import scala.collection.mutable
 
 /**
   * Stores all annotations for one document.
@@ -24,9 +23,13 @@ class Document(
   /** The original text corresponding to this document, if it was preserved by the corresponding processor */
   val text: Option[String] = None,
   /** Map of any arbitrary document attachments such as document creation time */
-  protected val attachments: Option[mutable.HashMap[String, DocumentAttachment]] = None,
-  /** DCT is Document Creation Time */
-  protected val dct: Option[String] = None
+  val attachments: Option[DocumentAttachments.Type] = None,
+  /**
+    * The document creation time using the CoreNLP format
+    * See useFixedDate here for more details: https://stanfordnlp.github.io/CoreNLP/ner.html#setting-document-date
+    * The DCT will impact how Sentence.norms are generated for DATE expressions.
+    */
+  val dct: Option[String] = None
 ) extends Serializable {
 
   def copy(
@@ -34,19 +37,18 @@ class Document(
     id: Option[String] = id,
    coreferenceChains: Option[CorefChains] = coreferenceChains,
    text: Option[String] = text,
-    attachments: Option[mutable.HashMap[String, DocumentAttachment]] = None,
+    attachments: Option[DocumentAttachments.Type] = None,
    dct: Option[String] = dct
  ): Document = new Document(sentences, id, coreferenceChains, text, attachments, dct)

  /** Clears any internal state potentially constructed by the annotators */
-  // def clear(): Unit = { }
+  def clear(): Unit = { } // This is for subclass support.

  /**
    * Used to compare Documents.
    * @return a hash (Int) based primarily on the sentences, ignoring attachments
    */
  def equivalenceHash: Int = {
-
    val stringCode = "org.clulab.processors.Document"

    // Hash representing the sentences.
@@ -72,30 +74,6 @@ class Document(
    Hash.ordered(sentences.map(_.ambivalenceHash))
  )

-  /** Retrieves the attachment with the given name */
-  def getAttachment(name: String): Option[DocumentAttachment] = attachments.flatMap(_.get(name))
-
-  /** Retrieves keys to all attachments so that the entire collection can be read
-    * for purposes including but not limited to serialization.  If there are no
-    * attachments, that is attachments == None, an empty set is returned.
-    * This does not distinguish between None and Some(HashMap.empty), especially
-    * since the latter should not be possible because of the lazy initialization.
-    */
-  def getAttachmentKeys: collection.Set[String] = {
-    attachments.map { attachments =>
-      attachments.keySet
-    }.getOrElse(collection.Set.empty[String])
-  }
-
-  /**
-    * Sets the document creation time using the CoreNLP format.
-    * See useFixedDate here for more details: https://stanfordnlp.github.io/CoreNLP/ner.html#setting-document-date
-    * The DCT will impacts how Sentence.norms are generated for DATE expressions
-    * @param dct Document creation time
-    */
-
-  def getDCT: Option[String] = dct
-
  def prettyPrint(pw: PrintWriter): Unit = {
@@ -312,6 +290,11 @@ trait JsonSerializerAble {
  */
 trait DocumentAttachment extends DocumentAble with DocumentSerializerAble with JsonSerializerAble
 
+object DocumentAttachments {
+  type Type = Map[String, DocumentAttachment]
+}
+
+
 /**
   * Designed to store intermediate attachments that are only used to pass information between processor components.
   * Thus, these do not need to be serialized
diff --git a/library/src/main/scala/org/clulab/processors/Processor.scala b/library/src/main/scala/org/clulab/processors/Processor.scala
index 9d84e1527..e856f20a5 100644
--- a/library/src/main/scala/org/clulab/processors/Processor.scala
+++ b/library/src/main/scala/org/clulab/processors/Processor.scala
@@ -26,23 +26,21 @@ trait Processor {
     val headId = headDocument.id
     require(tailDocuments.forall(_.id == headId))
 
-    val headDctOpt = headDocument.getDCT
-    require(documents.tail.forall(_.getDCT == headDctOpt))
+    val headDctOpt = headDocument.dct
+    require(documents.tail.forall(_.dct == headDctOpt))
 
     // Coreference chains involve Mentions that include references to documents.  The Mentions are being
     // moved to a new Document and it would be infeasible to move the chains.
     require(documents.forall(_.coreferenceChains.isEmpty))
 
-    val attachments = mutable.HashMap[String, DocumentAttachment]()
-
-    documents.foreach { document =>
-      document.getAttachmentKeys.foreach { attachmentKey =>
-        val valueOpt = attachments.get(attachmentKey)
-        val isValid = valueOpt.forall(_ == document.getAttachment(attachmentKey).get)
-
-        require(isValid, "The attachments cannot contradict each other.")
-        attachments(attachmentKey) = document.getAttachment(attachmentKey).get
-      }
+    val allAttachments = documents.flatMap { document =>
+      document.attachments.getOrElse(Map.empty).toSeq
     }
+    // This will remove duplicate (key, value) pairs.
+    val distinctAttachments = allAttachments.distinct
+    // If for any key, there are different, contradictory values, only one value will make it into the map.
+    val attachments = distinctAttachments.toMap
+
+    require(attachments.size == distinctAttachments.length, "Attachments can't contradict each other.  Each key needs to map onto the same value.")
 
     val combinedSentences = documents.flatMap(_.sentences)
     val combinedDocument = new Document(
diff --git a/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala b/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala
index cfae7e40b..093185b7f 100644
--- a/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala
+++ b/library/src/main/scala/org/clulab/serialization/DocumentSerializer.scala
@@ -116,10 +116,7 @@ class DocumentSerializer extends Logging {
      */
     val attachmentsOpt = namedDocumentAttachmentsOpt.map { namedDocumentAttachments =>
-      val attachments = mutable.HashMap[String, DocumentAttachment]()
-
-      attachments ++= namedDocumentAttachments
-      attachments
+      namedDocumentAttachments.toMap
     }
 
     val doc = new Document(
@@ -334,11 +331,12 @@ class DocumentSerializer extends Logging {
     }
 
     // Sort these so that serialization is the same each time.
-    val attachmentKeys = doc.getAttachmentKeys.toList.sorted
+    val attachments = doc.attachments.getOrElse(Map.empty)
+    val attachmentKeys = attachments.keySet
     if (attachmentKeys.nonEmpty) {
       os.println(START_ATTACHMENTS + SEP + attachmentKeys.size)
       attachmentKeys.foreach { key =>
-        val value = doc.getAttachment(key).get
+        val value = attachments(key)
         os.print(escapeAttachment(key))
         os.print(SEP)
         os.print(escapeAttachment(value.documentAttachmentBuilderFromTextClassName))
diff --git a/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala b/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala
index 2e66d1f76..ebc20d8b7 100644
--- a/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala
+++ b/library/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala
@@ -1,9 +1,9 @@
 package org.clulab.serialization.json
 
 import java.io.File
-import org.clulab.processors.{Document, DocumentAttachment, DocumentAttachmentBuilderFromJson, Sentence}
+import org.clulab.processors.{Document, DocumentAttachment, DocumentAttachmentBuilderFromJson, DocumentAttachments, Sentence}
 import org.clulab.struct.Edge
-import org.clulab.struct.{DirectedGraph, GraphMap}
+import org.clulab.struct.DirectedGraph
 import org.clulab.utils.FileUtils
 import org.json4s
 import org.json4s.JsonDSL._
@@ -24,13 +24,12 @@ object JSONSerializer {
 
   def jsonAST(f: File): JValue = jsonAST(FileUtils.getTextFromFile(f))
 
-  protected def getDocumentAttachments(jValue: JValue): Option[mutable.HashMap[String, DocumentAttachment]] = {
+  protected def getDocumentAttachments(jValue: JValue): Option[DocumentAttachments.Type] = {
     // See also DocumentSerializer for text version of nearly the same thing.
     (jValue \ DOCUMENT_ATTACHMENTS_KEY) match {
       case jObject: JObject =>
-        val attachments = new mutable.HashMap[String, DocumentAttachment]()
         val keys = jObject.values.keys
 
-        keys.foreach { (key: String) =>
+        val keyAndDocumentAttachmentPairs = keys.flatMap { (key: String) =>
           (jObject \ key) match {
             case jObject: JObject =>
               val documentAttachmentBuilderFromJsonClassName = (jObject \ DOCUMENT_ATTACHMENTS_BUILDER_KEY).extract[String]
@@ -40,15 +39,16 @@ object JSONSerializer {
               val documentAttachmentBuilder = obj.asInstanceOf[DocumentAttachmentBuilderFromJson]
               val value = (jObject \ DOCUMENT_ATTACHMENTS_VALUE_KEY)
               val documentAttachment = documentAttachmentBuilder.mkDocumentAttachment(value)
-              attachments(key) = documentAttachment
+
+              Some((key, documentAttachment))
             case jValue: JValue =>
               val text = prettyJson(jValue)
               throw new RuntimeException(s"ERROR: While deserializing document attachments expected JObject but found this: $text")
             // case _ => // noop. It should never get here. (Famous last words.)
-            case null => // noop. It should never get here. (Famous last words.) Scala 3 prefers null over _.
+            case null => None // noop. It should never get here. (Famous last words.) Scala 3 prefers null over _.
           }
         }
-        Some(attachments)
+        Some(keyAndDocumentAttachmentPairs.toMap)
       case _ =>
         // Leave documentAttachments as is: None
         None
     }
   }
diff --git a/library/src/main/scala/org/clulab/serialization/json/package.scala b/library/src/main/scala/org/clulab/serialization/json/package.scala
index 88276826b..a0fbf4f0e 100644
--- a/library/src/main/scala/org/clulab/serialization/json/package.scala
+++ b/library/src/main/scala/org/clulab/serialization/json/package.scala
@@ -61,10 +61,11 @@ package object json {
 
     def jsonAST: JValue = {
       // See also DocumentSerializer for a similar text implementation.
-      val attachmentKeys = doc.getAttachmentKeys.toList.sorted
+      val attachments = doc.attachments.getOrElse(Map.empty)
+      val attachmentKeys = attachments.keySet.toList.sorted
       val documentAttachments: JValue = if (attachmentKeys.nonEmpty) {
         val jFields = attachmentKeys.map { key =>
-          val value = doc.getAttachment(key).get
+          val value = attachments(key)
           JField(key, (DOCUMENT_ATTACHMENTS_BUILDER_KEY -> JString(value.documentAttachmentBuilderFromJsonClassName)) ~
               (DOCUMENT_ATTACHMENTS_VALUE_KEY -> value.toJsonSerializer)
diff --git a/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala b/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala
index 8bf2c792c..a820e26fa 100644
--- a/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala
+++ b/library/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala
@@ -16,7 +16,6 @@ import java.io.ByteArrayInputStream
 import java.io.ByteArrayOutputStream
 import java.io.ObjectInputStream
 import java.io.ObjectOutputStream
-import scala.collection.mutable
 import scala.util.Using
 
 class TestDocumentAttachment extends Test {
@@ -124,7 +123,7 @@ class TestDocumentAttachment extends Test {
   // }
 
   "Document with TextNameDocumentAttachment" should "serialize as text" in {
-    val oldAttachments = mutable.HashMap[String, DocumentAttachment](
+    val oldAttachments = Map[String, DocumentAttachment](
       (FIRST_KEY, new TextNameDocumentAttachment(FIRST_NAME)),
       (MIDDLE_KEY, new TextNameDocumentAttachment(MIDDLE_NAME)),
       (LAST_KEY, new TextNameDocumentAttachment(LAST_NAME)),
@@ -136,18 +135,20 @@ class TestDocumentAttachment extends Test {
     val documentString = documentSerializer.save(oldDocument)
     val newDocument = documentSerializer.load(documentString)
-    require(newDocument.getAttachment(FIRST_KEY) == oldDocument.getAttachment(FIRST_KEY))
-    require(newDocument.getAttachment(MIDDLE_KEY) == oldDocument.getAttachment(MIDDLE_KEY))
-    require(newDocument.getAttachment(LAST_KEY) == oldDocument.getAttachment(LAST_KEY))
-    require(newDocument.getAttachment(ALIAS_KEY).get.asInstanceOf[NameDocumentAttachment].name ==
-      oldDocument.getAttachment(ALIAS_KEY).get.asInstanceOf[NameDocumentAttachment].name)
+    val newAttachments = newDocument.attachments.get
+
+    require(newAttachments(FIRST_KEY) == oldAttachments(FIRST_KEY))
+    require(newAttachments(MIDDLE_KEY) == oldAttachments(MIDDLE_KEY))
+    require(newAttachments(LAST_KEY) == oldAttachments(LAST_KEY))
+    require(newAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name ==
+      oldAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name)
 
     // This one must be avoided.
     /*require(newDocument == oldDocument)*/
   }
 
   "Document with ObjectNameDocumentAttachment" should "serialize as text" in {
-    val oldAttachments = mutable.HashMap[String, DocumentAttachment](
+    val oldAttachments = Map[String, DocumentAttachment](
       (FIRST_KEY, new ObjectNameDocumentAttachment(FIRST_NAME)),
       (MIDDLE_KEY, new ObjectNameDocumentAttachment(MIDDLE_NAME)),
       (LAST_KEY, new ObjectNameDocumentAttachment(LAST_NAME)),
@@ -158,20 +159,21 @@ class TestDocumentAttachment extends Test {
     val documentSerializer = new DocumentSerializer()
 
     // This should be a messy string.
     val documentString = documentSerializer.save(oldDocument)
-
     val newDocument = documentSerializer.load(documentString)
-    require(newDocument.getAttachment(FIRST_KEY) == oldDocument.getAttachment(FIRST_KEY))
-    require(newDocument.getAttachment(MIDDLE_KEY) == oldDocument.getAttachment(MIDDLE_KEY))
-    require(newDocument.getAttachment(LAST_KEY) == oldDocument.getAttachment(LAST_KEY))
-    require(newDocument.getAttachment(ALIAS_KEY).get.asInstanceOf[NameDocumentAttachment].name ==
-      oldDocument.getAttachment(ALIAS_KEY).get.asInstanceOf[NameDocumentAttachment].name)
+    val newAttachments = newDocument.attachments.get
+
+    require(newAttachments(FIRST_KEY) == oldAttachments(FIRST_KEY))
+    require(newAttachments(MIDDLE_KEY) == oldAttachments(MIDDLE_KEY))
+    require(newAttachments(LAST_KEY) == oldAttachments(LAST_KEY))
+    require(newAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name ==
+      oldAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name)
 
     // This one must be avoided.
     /*require(newDocument == oldDocument)*/
   }
 
   "Document with TextNameDocumentAttachments" should "serialize as json" in {
-    val oldAttachments = mutable.HashMap[String, DocumentAttachment](
+    val oldAttachments = Map[String, DocumentAttachment](
       (FIRST_KEY, new TextNameDocumentAttachment(FIRST_NAME)),
       (MIDDLE_KEY, new TextNameDocumentAttachment(MIDDLE_NAME)),
       (LAST_KEY, new TextNameDocumentAttachment(LAST_NAME)),
@@ -183,13 +185,14 @@ class TestDocumentAttachment extends Test {
     /*oldDocument.addAttachment("wrong", new NameMethodAttachment("name"))*/
 
     val documentString = prettyJson(renderJValue(oldDocument.jsonAST))
-
     val newDocument: Document = JSONSerializer.toDocument(parseJson(documentString))
+    val newAttachments = newDocument.attachments.get
 
-    newDocument.getAttachment(FIRST_KEY) should be (oldDocument.getAttachment(FIRST_KEY))
-    newDocument.getAttachment(MIDDLE_KEY) should be (oldDocument.getAttachment(MIDDLE_KEY))
-    newDocument.getAttachment(LAST_KEY) should be (oldDocument.getAttachment(LAST_KEY))
-    newDocument.getAttachment(ALIAS_KEY).asInstanceOf[Option[NameDocumentAttachment]].get.name should be (
-      oldDocument.getAttachment(ALIAS_KEY).asInstanceOf[Option[NameDocumentAttachment]].get.name
+    newAttachments(FIRST_KEY) should be (oldAttachments(FIRST_KEY))
+    newAttachments(MIDDLE_KEY) should be (oldAttachments(MIDDLE_KEY))
+    newAttachments(LAST_KEY) should be (oldAttachments(LAST_KEY))
+    newAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name should be (
+      oldAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name
     )
 
     // This one must be avoided.
@@ -197,7 +200,7 @@ class TestDocumentAttachment extends Test {
   }
 
   "Document with ObjectNameDocumentAttachment" should "serialize as json" in {
-    val oldAttachments = mutable.HashMap[String, DocumentAttachment](
+    val oldAttachments = Map[String, DocumentAttachment](
       (FIRST_KEY, new ObjectNameDocumentAttachment(FIRST_NAME)),
       (MIDDLE_KEY, new ObjectNameDocumentAttachment(MIDDLE_NAME)),
       (LAST_KEY, new ObjectNameDocumentAttachment(LAST_NAME)),
@@ -207,13 +210,14 @@ class TestDocumentAttachment extends Test {
 
     // This should be a messy string.
     val documentString = prettyJson(renderJValue(oldDocument.jsonAST))
-
     val newDocument: Document = JSONSerializer.toDocument(parseJson(documentString))
-    require(newDocument.getAttachment(FIRST_KEY) == oldDocument.getAttachment(FIRST_KEY))
-    require(newDocument.getAttachment(MIDDLE_KEY) == oldDocument.getAttachment(MIDDLE_KEY))
-    require(newDocument.getAttachment(LAST_KEY) == oldDocument.getAttachment(LAST_KEY))
-    require(newDocument.getAttachment(ALIAS_KEY).get.asInstanceOf[NameDocumentAttachment].name ==
-      oldDocument.getAttachment(ALIAS_KEY).get.asInstanceOf[NameDocumentAttachment].name)
+    val newAttachments = newDocument.attachments.get
+
+    require(newAttachments(FIRST_KEY) == oldAttachments(FIRST_KEY))
+    require(newAttachments(MIDDLE_KEY) == oldAttachments(MIDDLE_KEY))
+    require(newAttachments(LAST_KEY) == oldAttachments(LAST_KEY))
+    require(newAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name ==
+      oldAttachments(ALIAS_KEY).asInstanceOf[NameDocumentAttachment].name)
 
     // This one must be avoided.
     /*require(newDocument == oldDocument)*/

From e9864206085631bbf7b8a559948562474ad16922 Mon Sep 17 00:00:00 2001
From: Keith Alcock
Date: Tue, 27 May 2025 09:58:32 -0700
Subject: [PATCH 33/42] Fix test compilation warning

---
 .../src/test/scala/org/clulab/processors/TestHashTrie.scala | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/library/src/test/scala/org/clulab/processors/TestHashTrie.scala b/library/src/test/scala/org/clulab/processors/TestHashTrie.scala
index d304713b7..4ec8ee171 100644
--- a/library/src/test/scala/org/clulab/processors/TestHashTrie.scala
+++ b/library/src/test/scala/org/clulab/processors/TestHashTrie.scala
@@ -19,7 +19,7 @@ class TestHashTrie extends Test {
 
     //println("TRIE:\n" + trie)
 
-    val tokens = Array("a", "a", "b", "d", "a", "b", "d", "b", "b", "b")
+    val tokens = Seq("a", "a", "b", "d", "a", "b", "d", "b", "b", "b")
     val labels = trie.find(tokens, "O")
 
     //println("TOKENS: " + tokens.mkString(" "))
     //println("LABELS: " + labels.mkString(" "))
@@ -44,7 +44,7 @@ class TestHashTrie extends Test {
     trie.add(Array("this", "is", "c", "test"))
     trie.add(Array("this", "is", "b", "test"))
 
-    val labels = trie.find(Array("this", "is", "c", "test"), "o")
+    val labels = trie.find(Seq("this", "is", "c", "test"), "o")
 
     sameLabels(Array("B-hello", "I-hello", "I-hello", "I-hello"), labels)
   }
@@ -55,7 +55,7 @@ class TestHashTrie extends Test {
     trie.add(Array("this", "is", "c", "test"))
     trie.add(Array("this", "is", "d", "test"))
 
-    val labels = trie.find(Array("this", "is", "b", "test"), "o")
+    val labels = trie.find(Seq("this", "is", "b", "test"), "o")
 
     sameLabels(Array("o", "o", "o", "o"), labels)
   }

From 7d4fec111c892efd7fd69d6952ef35d8c33bd6db Mon Sep 17 00:00:00 2001
From: Keith Alcock
Date: Thu, 29 May 2025 08:39:23 -0700
Subject: [PATCH 34/42] Use Option.when

---
 .../apps/InfiniteParallelProcessorsExample.scala          | 9 ++++-----
 .../scala/org/clulab/processors/apps/OdinStarter.scala    | 3 ++-
 .../scala/org/clulab/odin/impl/MarkdownGeneration.scala   | 7 ++++---
 .../scala/org/clulab/odin/impl/OdinResourceManager.scala  | 4 ++--
 .../src/main/scala/org/clulab/odin/impl/RuleReader.scala  | 4 ++--
 5 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/apps/src/main/scala/org/clulab/processors/apps/InfiniteParallelProcessorsExample.scala b/apps/src/main/scala/org/clulab/processors/apps/InfiniteParallelProcessorsExample.scala
index f320b6f4b..3465d8758 100644
--- a/apps/src/main/scala/org/clulab/processors/apps/InfiniteParallelProcessorsExample.scala
+++ b/apps/src/main/scala/org/clulab/processors/apps/InfiniteParallelProcessorsExample.scala
@@ -2,23 +2,22 @@ package org.clulab.processors.apps
 
 import org.clulab.processors.Document
 import org.clulab.processors.Processor
+import org.clulab.processors.clu.BalaurProcessor
 import org.clulab.serialization.DocumentSerializer
 import org.clulab.utils.{FileUtils, StringUtils, ThreadUtils, Timer}
 
-import java.io.BufferedOutputStream
 import java.io.File
-import java.io.FileOutputStream
 import java.io.PrintWriter
+import scala.collection.compat._
 import scala.collection.parallel.ParSeq
 import scala.util.Using
-import org.clulab.processors.clu.BalaurProcessor
+
 
 object InfiniteParallelProcessorsExample {
 
   class ProcessorProvider(reuseProcessor: Boolean) {
     protected val processorOpt: Option[Processor] =
-      if (reuseProcessor) Some(new BalaurProcessor())
-      else None
+      Option.when(reuseProcessor)(new BalaurProcessor())
 
     def newOrReusedProcessor: Processor =
       if (reuseProcessor) processorOpt.get
diff --git a/apps/src/main/scala/org/clulab/processors/apps/OdinStarter.scala b/apps/src/main/scala/org/clulab/processors/apps/OdinStarter.scala
index 54abb3b5e..106f7d09b 100644
--- a/apps/src/main/scala/org/clulab/processors/apps/OdinStarter.scala
+++ b/apps/src/main/scala/org/clulab/processors/apps/OdinStarter.scala
@@ -6,6 +6,7 @@ import org.clulab.sequences.LexiconNER
 import org.clulab.utils.FileUtils
 
 import java.io.File
+import scala.collection.compat._
 
 object OdinStarter extends App {
   // When using an IDE rather than sbt, make sure the working directory for the run
@@ -20,7 +21,7 @@ object OdinStarter extends App {
     val kbs = kbsAndCaseInsensitiveMatchings.map(_._1)
     val caseInsensitiveMatchings = kbsAndCaseInsensitiveMatchings.map(_._2)
     val isLocal = kbs.forall(new File(resourceDir, _).exists)
-    val baseDirOpt = if (isLocal) Some(resourceDir) else None
+    val baseDirOpt = Option.when(isLocal)(resourceDir)
 
     LexiconNER(kbs, caseInsensitiveMatchings, baseDirOpt)
   }
diff --git a/library/src/main/scala/org/clulab/odin/impl/MarkdownGeneration.scala b/library/src/main/scala/org/clulab/odin/impl/MarkdownGeneration.scala
index f6e282934..26c2252e8 100644
--- a/library/src/main/scala/org/clulab/odin/impl/MarkdownGeneration.scala
+++ b/library/src/main/scala/org/clulab/odin/impl/MarkdownGeneration.scala
@@ -3,6 +3,7 @@ package org.clulab.odin.impl
 import org.clulab.odin.impl.MarkdownGeneration._
 import org.clulab.odin.impl.RuleReader.{DefaultAction, Rule}
 
+import scala.collection.compat._
 import scala.collection.mutable.ArrayBuffer
 
 case class RuleSchema(
@@ -180,7 +181,7 @@ object MarkdownGeneration {
       extractorType = "CrossSentenceExtractor",
       labels = x.labels,
       priority = priorityString(x.priority),
-      action = if (r.action != DefaultAction) Some(r.action) else None,
+      action = Option.when(r.action != DefaultAction)(r.action),
       keep = x.keep,
       additional = Map(
         "leftWindow" -> x.leftWindow.toString,
@@ -198,7 +199,7 @@ object MarkdownGeneration {
       extractorType = "TokenExtractor",
       labels = x.labels,
       priority = priorityString(x.priority),
-      action = if (r.action != DefaultAction) Some(r.action) else None,
+      action = Option.when(r.action != DefaultAction)(r.action),
       keep = x.keep,
       additional = Map.empty,
       arguments = Seq.empty
@@ -213,7 +214,7 @@ object MarkdownGeneration {
       extractorType = "GraphExtractor",
       labels = x.labels,
       priority = priorityString(x.priority),
-      action = if (r.action != DefaultAction) Some(r.action) else None,
+      action = Option.when(r.action != DefaultAction)(r.action),
       keep = x.keep,
       additional = Map.empty,
arguments = toArgSchema(x.pattern.arguments) diff --git a/library/src/main/scala/org/clulab/odin/impl/OdinResourceManager.scala b/library/src/main/scala/org/clulab/odin/impl/OdinResourceManager.scala index f6b8c2c7c..817d93d52 100644 --- a/library/src/main/scala/org/clulab/odin/impl/OdinResourceManager.scala +++ b/library/src/main/scala/org/clulab/odin/impl/OdinResourceManager.scala @@ -1,6 +1,7 @@ package org.clulab.odin.impl import java.io.{BufferedInputStream, InputStream} +import scala.collection.compat._ import scala.io.Source /** @@ -22,8 +23,7 @@ object OdinResourceManager { val embeddingsOption: Option[OdinResource] = constructorMap("embeddings") // cast as EmbeddingsResources, if present val embeddings: Option[EmbeddingsResource] = - if (embeddingsOption.nonEmpty) Some(embeddingsOption.get.asInstanceOf[EmbeddingsResource]) - else None + Option.when(embeddingsOption.nonEmpty)(embeddingsOption.get.asInstanceOf[EmbeddingsResource]) new OdinResourceManager(embeddings) } diff --git a/library/src/main/scala/org/clulab/odin/impl/RuleReader.scala b/library/src/main/scala/org/clulab/odin/impl/RuleReader.scala index a349b4193..45f9a1e35 100644 --- a/library/src/main/scala/org/clulab/odin/impl/RuleReader.scala +++ b/library/src/main/scala/org/clulab/odin/impl/RuleReader.scala @@ -13,6 +13,7 @@ import java.net.URL import java.nio.charset.Charset import java.nio.charset.StandardCharsets import java.util.{Collection, Map => JMap} +import scala.collection.compat._ import scala.io.{Codec, Source} import scala.jdk.CollectionConverters._ import scala.util.Using @@ -28,8 +29,7 @@ class RuleReader(val actions: Actions, val charset: Charset, val ruleDir: Option private val mirror = new ActionMirror(actions) val ruleYamlOpt = - if (OdinConfig.keepRule) Some(new Yaml(new Constructor(classOf[Map[String, Any]]))) - else None + Option.when(OdinConfig.keepRule)(new Yaml(new Constructor(classOf[Map[String, Any]]))) def read(input: String): Vector[Extractor] = { val rules = getRules(input) From 4f06301cfba9caba8e02c8e29697026ce8545343 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Thu, 29 May 2025 10:17:35 -0700 Subject: [PATCH 35/42] Extract the DocumentPrinter --- .../InfiniteParallelProcessorsExample.scala | 8 +- .../apps/ParallelProcessorsExample.scala | 7 +- .../apps/ProcessorsDocSerializerExample.scala | 7 +- .../processors/apps/ProcessorsShell.scala | 7 +- .../org/clulab/processors/Document.scala | 75 --------------- .../processors/clu/DocumentPrinter.scala | 91 +++++++++++++++++++ 6 files changed, 102 insertions(+), 93 deletions(-) create mode 100644 library/src/main/scala/org/clulab/processors/clu/DocumentPrinter.scala diff --git a/apps/src/main/scala/org/clulab/processors/apps/InfiniteParallelProcessorsExample.scala b/apps/src/main/scala/org/clulab/processors/apps/InfiniteParallelProcessorsExample.scala index 3465d8758..6220f625e 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/InfiniteParallelProcessorsExample.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/InfiniteParallelProcessorsExample.scala @@ -2,7 +2,7 @@ package org.clulab.processors.apps import org.clulab.processors.Document import org.clulab.processors.Processor -import org.clulab.processors.clu.BalaurProcessor +import org.clulab.processors.clu.{BalaurProcessor, DocumentPrettyPrinter} import org.clulab.serialization.DocumentSerializer import org.clulab.utils.{FileUtils, StringUtils, ThreadUtils, Timer} @@ -12,7 +12,6 @@ import scala.collection.compat._ import scala.collection.parallel.ParSeq import 
scala.util.Using - object InfiniteParallelProcessorsExample { class ProcessorProvider(reuseProcessor: Boolean) { @@ -36,9 +35,6 @@ object InfiniteParallelProcessorsExample { val documentSerializer = new DocumentSerializer def processFiles(parFiles: ParSeq[File], processor: Processor): Unit = { - - def printDocument(document: Document, printWriter: PrintWriter): Unit = document.prettyPrint(printWriter) - parFiles.foreach { file => println(s"Processing ${file.getName}...") @@ -46,7 +42,7 @@ object InfiniteParallelProcessorsExample { val outputFile = new File(outputDir + "/" + file.getName) val document = processor.annotate(text) val printedDocument = StringUtils.viaPrintWriter { printWriter => - printDocument(document, printWriter) + new DocumentPrettyPrinter(printWriter).print(document) } val savedDocument = documentSerializer.save(document) val outputDocument = printedDocument + savedDocument diff --git a/apps/src/main/scala/org/clulab/processors/apps/ParallelProcessorsExample.scala b/apps/src/main/scala/org/clulab/processors/apps/ParallelProcessorsExample.scala index 6e514c3e1..bc28048e5 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/ParallelProcessorsExample.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ParallelProcessorsExample.scala @@ -2,7 +2,7 @@ package org.clulab.processors.apps import org.clulab.processors.Document import org.clulab.processors.Processor -import org.clulab.processors.clu.BalaurProcessor +import org.clulab.processors.clu.{BalaurProcessor, DocumentPrettyPrinter} import org.clulab.serialization.DocumentSerializer import org.clulab.utils.{FileUtils, StringUtils, ThreadUtils, Timer} @@ -13,9 +13,6 @@ import scala.util.Using object ParallelProcessorsExample { def mainWithCallback(args: Array[String])(callback: (File, String) => Unit): Unit = { - - def printDocument(document: Document, printWriter: PrintWriter): Unit = document.prettyPrint(printWriter) - val inputDir = args(0) val outputDir = args(1) val extension = args(2) @@ -56,7 +53,7 @@ object ParallelProcessorsExample { throw throwable } val printedDocument = StringUtils.viaPrintWriter { printWriter => - printDocument(document, printWriter) + new DocumentPrettyPrinter(printWriter).print(document) } val savedDocument = documentSerializer.save(document) val outputDocument = printedDocument + savedDocument diff --git a/apps/src/main/scala/org/clulab/processors/apps/ProcessorsDocSerializerExample.scala b/apps/src/main/scala/org/clulab/processors/apps/ProcessorsDocSerializerExample.scala index 8bc6aa608..518e0f667 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/ProcessorsDocSerializerExample.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ProcessorsDocSerializerExample.scala @@ -1,5 +1,6 @@ package org.clulab.processors.apps +import org.clulab.processors.clu.DocumentPrettyPrinter import org.clulab.processors.{Document, Processor} import org.clulab.serialization.DocumentSerializer @@ -13,6 +14,7 @@ import java.io.PrintWriter */ object ProcessorsDocSerializerExample { def main(args:Array[String]): Unit = { + val documentPrinter = new DocumentPrettyPrinter(new PrintWriter(System.out)) // create the processor val proc = Processor() @@ -20,14 +22,11 @@ object ProcessorsDocSerializerExample { val doc = proc.annotate("John Smith went to China. He visited Beijing, on January 10th, 2013.") // you are basically done. 
the rest of this code simply prints out the annotations - printDoc(doc) + documentPrinter.print(doc) // serialize the doc using our custom serializer val ser = new DocumentSerializer val out = ser.save(doc) println("SERIALIZED DOC:\n" + out) } - - def printDoc(doc:Document): Unit = { doc.prettyPrint(new PrintWriter(System.out)) } - } diff --git a/apps/src/main/scala/org/clulab/processors/apps/ProcessorsShell.scala b/apps/src/main/scala/org/clulab/processors/apps/ProcessorsShell.scala index 012949e4a..903cf6113 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/ProcessorsShell.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ProcessorsShell.scala @@ -1,7 +1,7 @@ package org.clulab.processors.apps import org.clulab.processors.Processor -import org.clulab.processors.clu.BalaurProcessor +import org.clulab.processors.clu.{BalaurProcessor, DocumentPrettyPrinter} import org.clulab.utils.CliReader import org.clulab.utils.ExitMenuItem import org.clulab.utils.HelpMenuItem @@ -27,6 +27,7 @@ class ProcessorsShell extends Shell { val lineReader = new CliReader(proc.prompt, "user.home", ".processorshellhistory") val printWriter = new PrintWriter(System.out) + val documentPrinter = new DocumentPrettyPrinter(printWriter) def prepareProcessor(message: String, promptedReloadableProcessor: PromptedReloadableProcessor): Unit = { lineReader.setPrompt(promptedReloadableProcessor.prompt) @@ -40,8 +41,8 @@ class ProcessorsShell extends Shell { override def work(text: String): Unit = { val doc = proc.get.annotate(text) - doc.prettyPrint(printWriter) - printWriter.flush() + + documentPrinter.print(doc) } // We inherit now just from Shell, so no reloading is performed. diff --git a/library/src/main/scala/org/clulab/processors/Document.scala b/library/src/main/scala/org/clulab/processors/Document.scala index 7dd9bcfd3..eb379e3a1 100644 --- a/library/src/main/scala/org/clulab/processors/Document.scala +++ b/library/src/main/scala/org/clulab/processors/Document.scala @@ -74,81 +74,6 @@ class Document( Hash.ordered(sentences.map(_.ambivalenceHash)) ) - def prettyPrint(pw: PrintWriter): Unit = { - // let's print the sentence-level annotations - sentences.zipWithIndex.foreach { case (sentence, sentenceCount) => - pw.println("Sentence #" + sentenceCount + ":") - pw.println("Tokens: " + sentence.words.zipWithIndex.mkString(" ")) - pw.println("Start character offsets: " + sentence.startOffsets.mkString(" ")) - pw.println("End character offsets: " + sentence.endOffsets.mkString(" ")) - - // these annotations are optional, so they are stored using Option objects, hence the foreach statement - sentence.lemmas.foreach(lemmas => pw.println(s"Lemmas: ${lemmas.mkString(" ")}")) - sentence.tags.foreach(tags => pw.println(s"POS tags: ${tags.mkString(" ")}")) - sentence.chunks.foreach(chunks => pw.println(s"Chunks: ${chunks.mkString(" ")}")) - sentence.entities.foreach(entities => pw.println(s"Named entities: ${entities.mkString(" ")}")) - sentence.norms.foreach(norms => pw.println(s"Normalized entities: ${norms.mkString(" ")}")) - sentence.universalBasicDependencies.foreach(dependencies => { - pw.println("Basic syntactic dependencies:") - val iterator = new DirectedGraphEdgeIterator[String](dependencies) - while(iterator.hasNext) { - val dep = iterator.next() - // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) - pw.println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) - } - }) - sentence.universalEnhancedDependencies.foreach(dependencies => { - 
pw.println("Enhanced syntactic dependencies:") - val iterator = new DirectedGraphEdgeIterator[String](dependencies) - while(iterator.hasNext) { - val dep = iterator.next() - // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) - pw.println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) - } - }) - sentence.semanticRoles.foreach(dependencies => { - pw.println("Semantic dependencies:") - val iterator = new DirectedGraphEdgeIterator[String](dependencies) - while(iterator.hasNext) { - val dep = iterator.next() - // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) - pw.println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) - } - }) - sentence.enhancedSemanticRoles.foreach(dependencies => { - pw.println("Enhanced semantic dependencies:") - val iterator = new DirectedGraphEdgeIterator[String](dependencies) - while(iterator.hasNext) { - val dep = iterator.next() - // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) - pw.println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) - } - }) - sentence.syntacticTree.foreach(tree => { - pw.println("Constituent tree: " + tree.toStringDepth(showHead = false)) - // see the org.clulab.struct.Tree class for more information - // on syntactic trees, including access to head phrases/words - }) - - pw.println("\n") - } - - // let's print the coreference chains - coreferenceChains.foreach(chains => { - for (chain <- chains.getChains) { - pw.println("Found one coreference chain containing the following mentions:") - for (mention <- chain) { - // note that all these offsets start at 0 too - pw.println("\tsentenceIndex:" + mention.sentenceIndex + - " headIndex:" + mention.headIndex + - " startTokenOffset:" + mention.startOffset + - " endTokenOffset:" + mention.endOffset + - " text: " + sentences(mention.sentenceIndex).words.slice(mention.startOffset, mention.endOffset).mkString("[", " ", "]")) - } - } - }) - } - def offset(offset: Int): Document = // If a subclass of Document constructs itself with an attachment or a documentCreationTime that // would be overwritten on the copy(), then it should provide its own copy() method(s). 
diff --git a/library/src/main/scala/org/clulab/processors/clu/DocumentPrinter.scala b/library/src/main/scala/org/clulab/processors/clu/DocumentPrinter.scala new file mode 100644 index 000000000..22c9845c7 --- /dev/null +++ b/library/src/main/scala/org/clulab/processors/clu/DocumentPrinter.scala @@ -0,0 +1,91 @@ +package org.clulab.processors.clu + +import org.clulab.processors.Document +import org.clulab.struct.DirectedGraphEdgeIterator + +import java.io.PrintWriter + +trait DocumentPrinter { + def print(document: Document): Unit +} + +class DocumentPrettyPrinter(printWriter: PrintWriter) extends DocumentPrinter { + + def println(string: String): Unit = printWriter.println(string) + + def print(document: Document): Unit = { + // let's print the sentence-level annotations + document.sentences.zipWithIndex.foreach { case (sentence, sentenceCount) => + println("Sentence #" + sentenceCount + ":") + println("Tokens: " + sentence.words.zipWithIndex.mkString(" ")) + println("Start character offsets: " + sentence.startOffsets.mkString(" ")) + println("End character offsets: " + sentence.endOffsets.mkString(" ")) + + // these annotations are optional, so they are stored using Option objects, hence the foreach statement + sentence.lemmas.foreach(lemmas => println(s"Lemmas: ${lemmas.mkString(" ")}")) + sentence.tags.foreach(tags => println(s"POS tags: ${tags.mkString(" ")}")) + sentence.chunks.foreach(chunks => println(s"Chunks: ${chunks.mkString(" ")}")) + sentence.entities.foreach(entities => println(s"Named entities: ${entities.mkString(" ")}")) + sentence.norms.foreach(norms => println(s"Normalized entities: ${norms.mkString(" ")}")) + sentence.universalBasicDependencies.foreach(dependencies => { + println("Basic syntactic dependencies:") + val iterator = new DirectedGraphEdgeIterator[String](dependencies) + while (iterator.hasNext) { + val dep = iterator.next() + // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) + println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) + } + }) + sentence.universalEnhancedDependencies.foreach(dependencies => { + println("Enhanced syntactic dependencies:") + val iterator = new DirectedGraphEdgeIterator[String](dependencies) + while (iterator.hasNext) { + val dep = iterator.next() + // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) + println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) + } + }) + sentence.semanticRoles.foreach(dependencies => { + println("Semantic dependencies:") + val iterator = new DirectedGraphEdgeIterator[String](dependencies) + while (iterator.hasNext) { + val dep = iterator.next() + // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) + println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) + } + }) + sentence.enhancedSemanticRoles.foreach(dependencies => { + println("Enhanced semantic dependencies:") + val iterator = new DirectedGraphEdgeIterator[String](dependencies) + while (iterator.hasNext) { + val dep = iterator.next() + // note that we use offsets starting at 0 (unlike CoreNLP, which uses offsets starting at 1) + println(" head:" + dep._1 + " modifier:" + dep._2 + " label:" + dep._3) + } + }) + sentence.syntacticTree.foreach(tree => { + println("Constituent tree: " + tree.toStringDepth(showHead = false)) + // see the org.clulab.struct.Tree class for more information + // on syntactic trees, including access to head phrases/words + }) + + println("\n") + } + + 
// let's print the coreference chains + document.coreferenceChains.foreach(chains => { + for (chain <- chains.getChains) { + println("Found one coreference chain containing the following mentions:") + for (mention <- chain) { + // note that all these offsets start at 0 too + println("\tsentenceIndex:" + mention.sentenceIndex + + " headIndex:" + mention.headIndex + + " startTokenOffset:" + mention.startOffset + + " endTokenOffset:" + mention.endOffset + + " text: " + document.sentences(mention.sentenceIndex).words.slice(mention.startOffset, mention.endOffset).mkString("[", " ", "]")) + } + } + }) + printWriter.flush() + } +} From 0b33f209be12926826a4b980f9c6ea48dd31e5a4 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Thu, 29 May 2025 11:02:38 -0700 Subject: [PATCH 36/42] Clean up DocumentMaker Clean it up more Get rid of debug files --- .../org/clulab/processors/Document.scala | 4 +- .../clulab/processors/clu/DocumentMaker.scala | 88 +++++++++++-------- 2 files changed, 51 insertions(+), 41 deletions(-) diff --git a/library/src/main/scala/org/clulab/processors/Document.scala b/library/src/main/scala/org/clulab/processors/Document.scala index eb379e3a1..1c9e1ece3 100644 --- a/library/src/main/scala/org/clulab/processors/Document.scala +++ b/library/src/main/scala/org/clulab/processors/Document.scala @@ -1,14 +1,12 @@ package org.clulab.processors -import org.clulab.struct.{CorefChains, DirectedGraphEdgeIterator} +import org.clulab.struct.CorefChains import org.clulab.utils.Hash import org.clulab.utils.Serializer import org.json4s.JString import org.json4s.JValue import org.json4s.jackson.prettyJson -import java.io.PrintWriter - /** * Stores all annotations for one document. * Written by: Mihai Surdeanu and Gus Hahn-Powell. diff --git a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala index 2e228c6f3..0a303701e 100644 --- a/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala +++ b/library/src/main/scala/org/clulab/processors/clu/DocumentMaker.scala @@ -3,7 +3,6 @@ package org.clulab.processors.clu import org.clulab.processors.Document import org.clulab.processors.Sentence import org.clulab.processors.clu.tokenizer.Tokenizer -import org.clulab.scala.WrappedArrayBuffer._ import org.clulab.utils.WrappedArraySeq import org.slf4j.Logger import org.slf4j.LoggerFactory @@ -14,26 +13,29 @@ import scala.collection.mutable.ArrayBuffer class DocumentMaker object DocumentMaker { - val logger:Logger = LoggerFactory.getLogger(classOf[DocumentMaker]) + val logger: Logger = LoggerFactory.getLogger(classOf[DocumentMaker]) /** Constructs a document of tokens from free text; includes sentence splitting and tokenization */ - def mkDocument(tokenizer:Tokenizer, - text:String, - keepText:Boolean): Document = { - val sents = tokenizer.tokenize(text) + def mkDocument( // TODO: mkDocumentFromText + tokenizer: Tokenizer, + text: String, + keepText: Boolean + ): Document = { + val sentences = tokenizer.tokenize(text) val textOpt = Option.when(keepText)(text) - val doc = Document(sents, textOpt) + val document = Document(sentences, textOpt) - doc + document } /** Constructs a document of tokens from an array of untokenized sentences */ - def mkDocumentFromSentences( + def mkDocumentFromSentences( // TODO: mkDocumentFromTexts tokenizer: Tokenizer, texts: Iterable[String], keepText: Boolean, charactersBetweenSentences: Int ): Document = { + val sentenceSep = " " * charactersBetweenSentences var characterOffset = 0 
val sentencesArray = texts.map { text =>
       val sentence = tokenizer.tokenize(text, sentenceSplit = false, characterOffset).head // We produce a single sentence here!
@@ -42,47 +44,57 @@ object DocumentMaker {
       sentence
     }.toArray
     val sentences = WrappedArraySeq(sentencesArray).toImmutableSeq
-    val textOpt = Option.when(keepText)(texts.mkString(mkSep(charactersBetweenSentences)))
+    val textOpt = Option.when(keepText)(texts.mkString(sentenceSep))
     val document = Document(sentences, textOpt)
 
     document
   }
 
   /** Constructs a document of tokens from an array of tokenized sentences */
-  def mkDocumentFromTokens(sentences:Iterable[Iterable[String]],
-                           keepText:Boolean,
-                           charactersBetweenSentences:Int,
-                           charactersBetweenTokens:Int): Document = {
+  def mkDocumentFromTokens( // TODO: mkDocumentFromTokenizedTexts
+    tokenizedTexts: Iterable[Iterable[String]],
+    keepText: Boolean,
+    charactersBetweenSentences: Int,
+    charactersBetweenTokens: Int
+  ): Document = {
+    val sentenceSep = " " * charactersBetweenSentences
+    val tokenSep = " " * charactersBetweenTokens
     var charOffset = 0
-    val sents = new ArrayBuffer[Sentence]()
     val text = new StringBuilder
-    for(sentence <- sentences) {
-      val startOffsets = new ArrayBuffer[Int]()
-      val endOffsets = new ArrayBuffer[Int]()
-      for(word <- sentence) {
-        startOffsets += charOffset
-        charOffset += word.length
-        endOffsets += charOffset
+    // Just use one buffer for each but clear them as necessary.
+    val startOffsetsBuffer = new ArrayBuffer[Int]()
+    val endOffsetsBuffer = new ArrayBuffer[Int]()
+    val sentencesArray = tokenizedTexts.map { tokenizedTextIterable =>
+      // We are going to need the tokens in an array anyway, so make them now.
+      val tokenizedTextArray = tokenizedTextIterable.toArray
+
+      tokenizedTextArray.foreach { token =>
+        startOffsetsBuffer += charOffset
+        charOffset += token.length
+        endOffsetsBuffer += charOffset
         charOffset += charactersBetweenTokens
       }
-      // note: NO postprocessing happens in this case, so use it carefully!
-      sents += new Sentence(sentence.toSeq, startOffsets, endOffsets, sentence.toSeq)
-      charOffset += charactersBetweenSentences - charactersBetweenTokens
-      if(keepText) {
-        text.append(sentence.mkString(mkSep(charactersBetweenTokens)))
-        text.append(mkSep(charactersBetweenSentences))
-      }
-    }
+      // The simple version of this doesn't work if there were no tokens.
+      charOffset += charactersBetweenSentences - (if (tokenizedTextArray.nonEmpty) charactersBetweenTokens else 0)
 
-    val textOpt = Option.when(keepText)(text.toString)
-    val doc = Document(sents, textOpt)
+      // Note: NO postprocessing happens in this case, so use it carefully!
+ val startOffsets = WrappedArraySeq(startOffsetsBuffer.toArray).toImmutableSeq + startOffsetsBuffer.clear() + val endOffsets = WrappedArraySeq(endOffsetsBuffer.toArray).toImmutableSeq + endOffsetsBuffer.clear() + val tokens = WrappedArraySeq(tokenizedTextArray).toImmutableSeq + val sentence = new Sentence(tokens, startOffsets, endOffsets, tokens) - doc - } + if (keepText) { + text.append(tokens.mkString(tokenSep)) + text.append(sentenceSep) + } + sentence + }.toArray + val sentences = WrappedArraySeq(sentencesArray).toImmutableSeq + val textOpt = Option.when(keepText)(text.toString) + val document = Document(sentences, textOpt) - private def mkSep(size:Int):String = { - val os = new StringBuilder - for (_ <- 0 until size) os.append(" ") - os.toString() + document } } From 276e89467d4caf45e5124dd2118f610d5ee5ebfa Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Thu, 29 May 2025 17:29:29 -0700 Subject: [PATCH 37/42] Fix ColumnsToDocument --- .../processors/apps/ColumnsToDocument.scala | 134 +++++++++--------- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala b/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala index e38b14615..e412c6be4 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala @@ -3,6 +3,7 @@ package org.clulab.processors.apps import org.clulab.processors.{Document, Processor, Sentence} import org.clulab.processors.clu.BalaurProcessor import org.clulab.scala.WrappedArrayBuffer._ +import org.clulab.utils.WrappedArraySeq import org.slf4j.{Logger, LoggerFactory} import java.io.InputStream @@ -18,6 +19,8 @@ class ColumnsToDocument * Last Modified: Fix compiler issue: import scala.io.Source. 
*/ object ColumnsToDocument { + type LabelSetter = (Sentence, Seq[String]) => Sentence + type Annotator = (Document) => Document val logger:Logger = LoggerFactory.getLogger(classOf[ColumnsToDocument]) val WORD_POS_CONLLX = 1 @@ -25,91 +28,94 @@ object ColumnsToDocument { val WORD_POS_CONLLU = 1 val TAG_POS_CONLLU = 3 - var proc:Processor = new BalaurProcessor() + var proc: Processor = new BalaurProcessor() var prevLang: String = "en" - def readFromFile(fn:String, - wordPos:Int = WORD_POS_CONLLX, - labelPos:Int = TAG_POS_CONLLX, - setLabels: (Sentence, Array[String]) => Unit, - annotate: (Document) => Unit, - filterOutContractions:Boolean = false, - lang: String = "en" - ): Document = { - - // redefine proc acording to the language used + protected def setProcessor(lang: String): Unit = { if (lang != prevLang) { if (lang == "pt") { println("Using Portuguese processors") throw new RuntimeException(s"ERROR: language '$lang' not supported!") //this.proc = new PortugueseCluProcessor() - } else if (lang == "es") { + } + else if (lang == "es") { println("Using Spanish processors") //this.proc = new SpanishCluProcessor() throw new RuntimeException(s"ERROR: language '$lang' not supported!") - } else { + } + else { println("Using English processors") this.proc = new BalaurProcessor() } this.prevLang = lang } + } + def readFromFile( + fn: String, + wordPos: Int = WORD_POS_CONLLX, + labelPos: Int = TAG_POS_CONLLX, + setLabels: LabelSetter, + annotate: Annotator, + filterOutContractions: Boolean = false, + lang: String = "en" + ): Document = { + setProcessor(lang) Using.resource(Source.fromFile(fn)) { source => readFromSource(source, wordPos, labelPos, setLabels, annotate, filterOutContractions) } } - def readFromStream(stream:InputStream, - wordPos:Int = WORD_POS_CONLLX, - labelPos:Int = TAG_POS_CONLLX, - setLabels: (Sentence, Array[String]) => Unit, - annotate: (Document) => Unit, - filterOutContractions:Boolean = false, - lang: String = "en"): Document = { - - // redefine proc acording to the language used - if (lang == "pt"){ - println("Using Portuguese processors") - //this.proc = new PortugueseCluProcessor() - throw new RuntimeException(s"ERROR: language '$lang' not supported!") - } else if(lang == "es") { - println("Using Spanish processors") - //this.proc = new SpanishCluProcessor() - throw new RuntimeException(s"ERROR: language '$lang' not supported!") - } else { - println("Using English processors") - this.proc = new BalaurProcessor() - } - + def readFromStream( + stream: InputStream, + wordPos: Int = WORD_POS_CONLLX, + labelPos: Int = TAG_POS_CONLLX, + setLabels: LabelSetter, + annotate: Annotator, + filterOutContractions: Boolean = false, + lang: String = "en" + ): Document = { + setProcessor(lang) Using.resource(Source.fromInputStream(stream)) { source => readFromSource(source, wordPos, labelPos, setLabels, annotate, filterOutContractions) } } - def readFromSource(source:Source, - wordPos:Int, - labelPos:Int, - setLabels: (Sentence, Array[String]) => Unit, - annotate: (Document) => Unit, - filterOutContractions:Boolean): Document = { - var words = new ArrayBuffer[String]() - var startOffsets = new ArrayBuffer[Int]() - var endOffsets = new ArrayBuffer[Int]() - var labels = new ArrayBuffer[String]() - var charOffset = 0 + def readFromSource( + source: Source, + wordPos: Int, + labelPos: Int, + setLabels: LabelSetter, + annotate: Annotator, + filterOutContractions:Boolean + ): Document = { + val words = new ArrayBuffer[String]() + val startOffsets = new ArrayBuffer[Int]() + val endOffsets = 
new ArrayBuffer[Int]()
+    val labels = new ArrayBuffer[String]()
     val sentences = new ArrayBuffer[Sentence]()
-    for(line <- source.getLines()) {
+    var charOffset = 0
+
+    def mkSentence(): Sentence = {
+      val wordsSeq = WrappedArraySeq(words.toArray).toImmutableSeq
+      val unlabeledSentence = new Sentence(wordsSeq, startOffsets, endOffsets, wordsSeq)
+
+      words.clear()
+      startOffsets.clear()
+      endOffsets.clear()
+
+      val labeledSentence = setLabels(unlabeledSentence, labels.toSeq)
+
+      labels.clear()
+      labeledSentence
+    }
+
+    for (line <- source.getLines()) {
       val l = line.trim
       if (l.isEmpty) {
         // end of sentence
         if (words.nonEmpty) {
-          val s = new Sentence(words, startOffsets, endOffsets, words)
-          setLabels(s, labels.toArray)
-          sentences += s
-          words = new ArrayBuffer[String]()
-          startOffsets = new ArrayBuffer[Int]()
-          endOffsets = new ArrayBuffer[Int]()
-          labels = new ArrayBuffer[String]()
+          sentences += mkSentence()
          charOffset += 1
        }
      } else {
@@ -126,7 +132,7 @@ object ColumnsToDocument {
        // 10	as	o	DET	_	Gender=Fem|Number=Plur	11	det	_	_
        //
        val offset = bits(0) // we assume token offsets are always in column 0!
-        if(! filterOutContractions || ! offset.contains("-")) {
+        if (!filterOutContractions || ! offset.contains("-")) {
          words += bits(wordPos)
          labels += bits(labelPos)
          startOffsets += charOffset
@@ -138,21 +144,15 @@ object ColumnsToDocument {
        }
      }
    }
-    if(words.nonEmpty) {
-      val s = new Sentence(
-        words, startOffsets, endOffsets, words,
-        tags = Some(labels)
-      )
-      sentences += s
-    }
+    if (words.nonEmpty)
+      sentences += mkSentence()
 
     logger.debug(s"Loaded ${sentences.size} sentences.")
 
-    val d = new Document(sentences)
-    annotate(d)
-
-    d
+    val unannotatedDocument = new Document(sentences)
+    val annotatedDocument = annotate(unannotatedDocument)
+    annotatedDocument
   }
 
-  def annotateNil(doc:Document): Unit = {}
+  def annotateNil(document: Document): Document = document
 }

From 0b933796e715a417a18174b63dee5a2e1011ca0e Mon Sep 17 00:00:00 2001
From: Keith Alcock
Date: Thu, 29 May 2025 17:30:23 -0700
Subject: [PATCH 38/42] Remove unused and duplicate code in NumericUtils

---
 .../processors/apps/ColumnsToDocument.scala   |  9 ++--
 .../org/clulab/numeric/NumericUtils.scala     | 47 -------------------
 2 files changed, 5 insertions(+), 51 deletions(-)

diff --git a/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala b/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala
index e412c6be4..23ad73ca1 100644
--- a/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala
+++ b/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala
@@ -110,15 +110,15 @@ object ColumnsToDocument {
       labeledSentence
     }
 
-    for (line <- source.getLines()) {
-      val l = line.trim
+    source.getLines().map(_.trim).foreach { l =>
       if (l.isEmpty) {
         // end of sentence
         if (words.nonEmpty) {
           sentences += mkSentence()
           charOffset += 1
         }
-      } else {
+      }
+      else {
         // within the same sentence
         val bits = l.split("\\s+")
         if (bits.length < 2)
@@ -139,7 +139,8 @@ object ColumnsToDocument {
           charOffset += bits(wordPos).length
           endOffsets += charOffset
           charOffset += 1
-        } else {
+        }
+        else {
           // println("Skipped line: " + l)
         }
       }
diff --git a/library/src/main/scala/org/clulab/numeric/NumericUtils.scala b/library/src/main/scala/org/clulab/numeric/NumericUtils.scala
index e60bb225d..ba9bcd84b 100644
--- a/library/src/main/scala/org/clulab/numeric/NumericUtils.scala
+++ b/library/src/main/scala/org/clulab/numeric/NumericUtils.scala
@@ -8,7 +8,6 @@ import org.clulab.struct.Interval
 import 
org.clulab.utils.WrappedArraySeq import scala.collection.mutable -import _root_.scala.util.control.Breaks._ object NumericUtils { def displayMentions(mentions: Seq[Mention], doc: Document): Unit = { @@ -128,54 +127,8 @@ object NumericUtils { triggered = false } } - - - // removes entities and norms for unallowable entity sequences, e.g., don't extract 'in' as 'inch' before B-LOC in '... Sahal 108 in Senegal' - // toBeRemovedShortened is entity without BIO- - val zippedEntities = entities.zipWithIndex - - // So remove all consecutive MEASREMENT-LENGTH in front of a B-LOC - // Can it just be done backwards in one pass in a state matchine? - - zippedEntities.foreach { case (outerEntity, outerIndex) => - if (outerIndex > 0 && outerEntity == triggerEntity && entities(outerIndex - 1).endsWith(toBeRemovedShortened)) { - // Go in reverse replacing indices and norms in the immediate preceding mention. - breakable { // TODO: rewrite - for ((innerEntity, innerIndex) <- zippedEntities.slice(0, outerIndex).reverse) { - if (innerEntity.endsWith(toBeRemovedShortened)) { - entities(innerIndex) = "O" - norms(innerIndex) = "" - } else break() - } - } - } - } - } - - def removeOneEntityBeforeAnother2(entities: mutable.Seq[String], norms: mutable.Seq[String], triggerEntity: String, toBeRemovedShortened: String): Unit = { - // removes entities and norms for unallowable entity sequences, e.g., don't extract 'in' as 'inch' before B-LOC in '... Sahal 108 in Senegal' - // toBeRemovedShortened is entity without BIO- - val zippedEntities = entities.zipWithIndex - - // So remove all consecutive MEASREMENT-LENGTH in front of a B-LOC - // Can it just be done backwards in one pass in a state matchine? - - zippedEntities.foreach { case (outerEntity, outerIndex) => - if (outerIndex > 0 && outerEntity == triggerEntity && entities(outerIndex - 1).endsWith(toBeRemovedShortened)) { - // Go in reverse replacing indices and norms in the immediate preceding mention. - breakable { // TODO: rewrite - for ((innerEntity, innerIndex) <- zippedEntities.slice(0, outerIndex).reverse) { - if (innerEntity.endsWith(toBeRemovedShortened)) { - entities(innerIndex) = "O" - norms(innerIndex) = "" - } else break() - } - } - } - } } - // TODO: These need to be mutable private def addLabelsAndNorms(label: String, norm: String, tokenInt: Interval, entities: mutable.Seq[String], norms: mutable.Seq[String]): Unit = { // careful here: we may override some existing entities and norms // but, given that the numeric entity rules tend to be high precision, this is probably Ok... 
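The duplicated block deleted above carried a TODO asking whether the scan could be done backwards in one pass as a small state machine. For reference, a hedged sketch of that formulation; this is not code from this series, and the method name and example labels are illustrative:

    import scala.collection.mutable

    // Walk right to left: arm on the trigger label (e.g., "B-LOC"), then clear the
    // consecutive run of labels ending in toBeRemovedShortened (the entity without
    // its BIO- prefix, e.g., "MEASUREMENT-LENGTH") sitting directly before it.
    def removeOneEntityBeforeAnotherSketch(
      entities: mutable.Seq[String],
      norms: mutable.Seq[String],
      triggerEntity: String,
      toBeRemovedShortened: String
    ): Unit = {
      var armed = false // true while still inside a run that precedes a trigger

      entities.indices.reverse.foreach { i =>
        if (entities(i) == triggerEntity)
          armed = true
        else if (armed && entities(i).endsWith(toBeRemovedShortened)) {
          entities(i) = "O"
          norms(i) = ""
        }
        else
          armed = false
      }
    }
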
From e8262754fb4bc45be0188aef4db8beea5315e595 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Fri, 30 May 2025 08:23:27 -0700 Subject: [PATCH 39/42] Fix typos --- .../scala/org/clulab/processors/apps/ColumnsToDocument.scala | 2 +- .../main/scala/org/clulab/sequences/MEMMSequenceTagger.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala b/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala index 23ad73ca1..506486e88 100644 --- a/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala +++ b/apps/src/main/scala/org/clulab/processors/apps/ColumnsToDocument.scala @@ -21,7 +21,7 @@ class ColumnsToDocument object ColumnsToDocument { type LabelSetter = (Sentence, Seq[String]) => Sentence type Annotator = (Document) => Document - val logger:Logger = LoggerFactory.getLogger(classOf[ColumnsToDocument]) + val logger: Logger = LoggerFactory.getLogger(classOf[ColumnsToDocument]) val WORD_POS_CONLLX = 1 val TAG_POS_CONLLX = 4 diff --git a/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala b/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala index 7cba53724..ff2dacaab 100644 --- a/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala +++ b/library/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala @@ -14,7 +14,7 @@ import scala.reflect.ClassTag import scala.util.Using /** - * Sequence tagger using a maximum entrop Markov model (MEMM) + * Sequence tagger using a maximum entropy Markov model (MEMM) * User: mihais * Date: 8/26/17 */ From b355af7e0cb4de684e68092fcc1a6bc2f2a9409a Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Mon, 2 Jun 2025 10:21:16 -0700 Subject: [PATCH 40/42] Combine named entity without exposing array --- .../org/clulab/processors/clu/BalaurProcessor.scala | 9 +++------ .../main/scala/org/clulab/sequences/NamedEntity.scala | 10 +++++++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala index 8404ed13f..2af5a1d3a 100644 --- a/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala +++ b/library/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala @@ -250,15 +250,12 @@ class BalaurProcessor protected ( if (customNamedEntities.isEmpty) generic else { - val genericNamedEntities = NamedEntity.collect(generic) - val result = generic.toArray // A copy of the generic labels is created here. - //println(s"Generic NamedEntity: ${genericNamedEntities.mkString(", ")}") //println(s"Custom NamedEntity: ${customNamedEntities.mkString(", ")}") + val genericNamedEntities = NamedEntity.collect(generic) + val combinedNamedEntities = NamedEntity.combine(generic, genericNamedEntities, customNamedEntities) - // The custom labels override the generic ones! 
- NamedEntity.combine(result, genericNamedEntities, customNamedEntities) - WrappedArraySeq(result).toImmutableSeq + combinedNamedEntities } } diff --git a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala index 2b74c5b6d..a8f2a8da8 100644 --- a/library/src/main/scala/org/clulab/sequences/NamedEntity.scala +++ b/library/src/main/scala/org/clulab/sequences/NamedEntity.scala @@ -1,5 +1,7 @@ package org.clulab.sequences +import org.clulab.utils.WrappedArraySeq + import scala.collection.mutable // This is definitely not the most efficient as far as number of objects @@ -43,11 +45,12 @@ object NamedEntity { namedEntities } - def combine(bioLabels: Array[String], genericNamedEntities: Seq[NamedEntity], customNamedEntities: Seq[NamedEntity]): Unit = { + def combine(bioLabels: Seq[String], genericNamedEntities: Seq[NamedEntity], customNamedEntities: Seq[NamedEntity]): Seq[String] = { + val bioLabelsArray = bioLabels.toArray // Neither named entities sequence can contain overlapping elements within the sequence. // At most, there is overlap between sequences. Use is made of that fact. // The NamedEntities never have empty Ranges, so end - 1 is always at least start. - val outsides = bioLabels.indices.filter(bioLabels(_) == OUTSIDE) + val outsides = bioLabelsArray.indices.filter(bioLabelsArray(_) == OUTSIDE) val validStarts = (genericNamedEntities.map(_.range.start) ++ outsides).toSet // The -1 is used to coordinate ends (exclusive) with the OUTSIDE positions (inclusive). val validEnds = (genericNamedEntities.map(_.range.end - 1) ++ outsides).toSet @@ -56,8 +59,9 @@ object NamedEntity { } validCustomNamedEntities.foreach { customNamedEntity => - customNamedEntity.fill(bioLabels) + customNamedEntity.fill(bioLabelsArray) } + WrappedArraySeq(bioLabelsArray).toImmutableSeq } // Only INSIDEs can be invalid, and they are made valid by From 000f0ed06ea1175ddbb00a03272809caa79610a6 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Mon, 2 Jun 2025 10:21:24 -0700 Subject: [PATCH 41/42] Update sbt again --- project/build.properties | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/project/build.properties b/project/build.properties index 29f5dd953..75ac47aaa 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1,9 +1,9 @@ -# This was last checked on 2025-05-26. +# This was last checked on 2025-06-02. # Version 1.7.2+ will cause problems when combined with the play plug-in used for the webapp! # [error] * org.scala-lang.modules:scala-xml_2.12:2.1.0 (early-semver) is selected over {1.2.0, 1.1.1} # [error] +- org.scala-lang:scala-compiler:2.12.17 (depends on 2.1.0) # [error] +- com.typesafe.sbt:sbt-native-packager:1.5.2 (scalaVersion=2.12, sbtVersion=1.0) (depends on 1.1.1) # [error] +- com.typesafe.play:twirl-api_2.12:1.5.1 (depends on 1.2.0) # This error is solved by adding a VersionScheme.Always to plugins.sbt. 
-# up to 1.11.0 -sbt.version = 1.11.0 +# up to 1.11.1 +sbt.version = 1.11.1 From 143298578c2447da25e65d0c2fe8e2b6b25606a4 Mon Sep 17 00:00:00 2001 From: Keith Alcock Date: Mon, 2 Jun 2025 11:01:47 -0700 Subject: [PATCH 42/42] Fix test --- .../src/test/scala/org/clulab/sequences/TestNamedEntity.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/library/src/test/scala/org/clulab/sequences/TestNamedEntity.scala b/library/src/test/scala/org/clulab/sequences/TestNamedEntity.scala index cd731635f..08a774400 100644 --- a/library/src/test/scala/org/clulab/sequences/TestNamedEntity.scala +++ b/library/src/test/scala/org/clulab/sequences/TestNamedEntity.scala @@ -45,8 +45,7 @@ class TestNamedEntity extends Test { val customBioLabels = customBioLabelString.split(" +") val genericNamedEntities = NamedEntity.collect(genericBioLabels) val customNamedEntities = NamedEntity.collect(customBioLabels) - NamedEntity.combine(genericBioLabels, genericNamedEntities, customNamedEntities) - val actualCombinedBioLabels = genericBioLabels + val actualCombinedBioLabels = NamedEntity.combine(genericBioLabels, genericNamedEntities, customNamedEntities) val actualCombinedBioLabelString = actualCombinedBioLabels.mkString(" ") val formattedExpectedCombinedBioLabelString = expectedCombinedBioLabelString.split(" +").mkString(" ")
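
The test now captures combine()'s return value because, as of patch 40, NamedEntity.combine no longer mutates its first argument in place; it returns the merged labels as an immutable Seq. A hedged usage sketch with illustrative labels, not data from the suite:

    import org.clulab.sequences.NamedEntity

    val genericBioLabels = Seq("B-PER", "I-PER", "O", "O", "B-LOC")
    val customBioLabels  = Seq("O", "O", "B-ORG", "O", "O")
    val genericNamedEntities = NamedEntity.collect(genericBioLabels)
    val customNamedEntities = NamedEntity.collect(customBioLabels)
    // combine() leaves genericBioLabels untouched and returns the merged sequence,
    // with custom entities overriding generic ones where their boundaries are valid.
    val combinedBioLabels = NamedEntity.combine(genericBioLabels, genericNamedEntities, customNamedEntities)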