From 140c7ec088a758a9a885e64b71580a3eb1a40a5c Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 22 Jul 2022 11:44:25 +0900 Subject: [PATCH 01/23] implement style for paragraphs --- .../grobid/core/document/TEIFormatter.java | 213 ++++++++++++++++-- .../core/document/TEIFormatterTest.java | 105 ++++++++- 2 files changed, 289 insertions(+), 29 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index ff5a3783be..5a273b4d50 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1,8 +1,11 @@ package org.grobid.core.document; import com.google.common.base.Joiner; +import com.google.common.collect.Iterables; import com.google.common.collect.Sets; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.tuple.Pair; +import org.apache.commons.lang3.tuple.Triple; import org.apache.commons.lang3.StringUtils; import nu.xom.Attribute; @@ -18,12 +21,11 @@ import org.grobid.core.engines.FullTextParser; import org.grobid.core.engines.label.SegmentationLabels; import org.grobid.core.engines.config.GrobidAnalysisConfig; -import org.grobid.core.engines.counters.ReferenceMarkerMatcherCounters; import org.grobid.core.engines.label.TaggingLabel; import org.grobid.core.engines.label.TaggingLabels; import org.grobid.core.exceptions.GrobidException; import org.grobid.core.lang.Language; -import org.grobid.core.utilities.SentenceUtilities; +import org.grobid.core.utilities.*; import org.grobid.core.layout.BoundingBox; import org.grobid.core.layout.GraphicObject; import org.grobid.core.layout.LayoutToken; @@ -31,8 +33,6 @@ import org.grobid.core.layout.Page; import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; -import org.grobid.core.utilities.*; -import 
org.grobid.core.utilities.counters.CntManager; import org.grobid.core.utilities.matching.EntityMatcherException; import org.grobid.core.utilities.matching.ReferenceMarkerMatcher; import org.grobid.core.engines.citations.CalloutAnalyzer.MarkerType; @@ -45,7 +45,6 @@ import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.io.*; import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId; @@ -1155,16 +1154,24 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, TaggingLabel clusterLabel = cluster.getTaggingLabel(); Engine.getCntManager().i(clusterLabel); if (clusterLabel.equals(TaggingLabels.SECTION)) { - String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens()); + List dehyphenized = LayoutTokensUtil.dehyphenize(cluster.concatTokens()); + String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " "); + curDiv = teiElement("div"); Element head = teiElement("head"); // section numbers - org.grobid.core.utilities.Pair numb = getSectionNumber(clusterContent); + Pair, String> numb = getSectionNumber(dehyphenized); if (numb != null) { - head.addAttribute(new Attribute("n", numb.b)); - head.appendChild(numb.a); + head.addAttribute(new Attribute("n", numb.getRight())); + dehyphenized = numb.getLeft(); + text = LayoutTokensUtil.toText(dehyphenized); + } + List> stylesList = extractStylesList(dehyphenized); + + if (CollectionUtils.isNotEmpty(stylesList)) { + applyStyleList(head, text, stylesList); } else { - head.appendChild(clusterContent); + head.appendChild(text); } if (config.isGenerateTeiIds()) { @@ -1173,10 +1180,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, } if (config.isGenerateTeiCoordinates("head") ) { - String coords = LayoutTokensUtil.getCoordsString(cluster.concatTokens()); - if (coords != null) { - head.addAttribute(new Attribute("coords", coords)); - } + head.addAttribute(new 
Attribute("coords", LayoutTokensUtil.getCoordsString(cluster.concatTokens()))); } curDiv.appendChild(head); @@ -1185,7 +1189,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, clusterLabel.equals(TaggingLabels.EQUATION_LABEL)) { // get starting position of the cluster int start = -1; - if ( (cluster.concatTokens() != null) && (cluster.concatTokens().size() > 0) ) { + if ( CollectionUtils.isEmpty(cluster.concatTokens()) ) { start = cluster.concatTokens().get(0).getOffset(); } // get the corresponding equation @@ -1210,9 +1214,17 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, } } } else if (clusterLabel.equals(TaggingLabels.ITEM)) { - String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens()); - //curDiv.appendChild(teiElement("item", clusterContent)); - Element itemNode = teiElement("item", clusterContent); + String text = LayoutTokensUtil.toText(cluster.concatTokens()).replace("\n", " "); + Element itemNode = teiElement("item"); + + List> stylesList = extractStylesList(cluster.concatTokens()); + + if (CollectionUtils.isNotEmpty(stylesList)) { + applyStyleList(itemNode, text, stylesList); + } else { + itemNode.appendChild(text); + } + if (!MARKER_LABELS.contains(lastClusterLabel) && (lastClusterLabel != TaggingLabels.ITEM)) { curList = teiElement("list"); curDiv.appendChild(curList); @@ -1230,7 +1242,9 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, } curDiv.appendChild(note); } else if (clusterLabel.equals(TaggingLabels.PARAGRAPH)) { - String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens()); + List dehyphenized = LayoutTokensUtil.dehyphenize(cluster.concatTokens()); + String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " "); + if (isNewParagraph(lastClusterLabel, curParagraph)) { if (curParagraph != null && config.isWithSentenceSegmentation()) { segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage()); @@ -1243,7 +1257,14 @@ public 
StringBuilder toTEITextPiece(StringBuilder buffer, curDiv.appendChild(curParagraph); curParagraphTokens = new ArrayList<>(); } - curParagraph.appendChild(clusterContent); + + List> stylesList = extractStylesList(dehyphenized); + + if (CollectionUtils.isEmpty(stylesList)) { + + } else { + applyStyleList(curParagraph, text, stylesList); + } curParagraphTokens.addAll(cluster.concatTokens()); } else if (MARKER_LABELS.contains(clusterLabel)) { List refTokens = cluster.concatTokens(); @@ -1356,6 +1377,32 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, return buffer; } + private Element applyStyleList(Element paragraphElem, String paragraphText, List> stylesList) { +// if (CollectionUtils.isEmpty(stylesList)) { +// paragraphElem.appendChild(StringUtils.normalizeSpace(paragraphText)); +// return paragraphElem; +// } + + int lastPosition = 0; + for (Triple style : stylesList) { + OffsetPosition offsetStyle = style.getRight(); + String subString = paragraphText.substring(lastPosition, offsetStyle.start); + String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : ""; + String suffixSpace = StringUtils.endsWith(subString, " ") ? " " : ""; + paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString) + suffixSpace); + Element rend = teiElement("hi"); + rend.addAttribute(new Attribute("rend", style.getLeft())); + rend.appendChild(StringUtils.normalizeSpace(paragraphText.substring(offsetStyle.start, offsetStyle.end))); + lastPosition = offsetStyle.end; + paragraphElem.appendChild(rend); + } + String subString = paragraphText.substring(lastPosition); + String prefixSpace = StringUtils.startsWith(subString, " ") ? 
" " : ""; + paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString)); + + return paragraphElem; + } + public static boolean isNewParagraph(TaggingLabel lastClusterLabel, Element curParagraph) { return (!MARKER_LABELS.contains(lastClusterLabel) && lastClusterLabel != TaggingLabels.FIGURE && lastClusterLabel != TaggingLabels.TABLE) || curParagraph == null; @@ -1537,7 +1584,103 @@ public void segmentIntoSentences(Element curParagraph, List curPara } } - } + } + + private List> segmentLayoutTokenLists(List curParagraphTokens, String text, List sentencesOffsetPosition) { + int pos; + List> segmentedParagraphTokens = new ArrayList<>(); + List currentSentenceTokens = new ArrayList<>(); + pos = 0; + + int currentSentenceIndex = 0; +//System.out.println(text); +//System.out.println("theSentences.size(): " + theSentences.size()); + String sentenceChunk = text.substring(sentencesOffsetPosition.get(currentSentenceIndex).start, + sentencesOffsetPosition.get(currentSentenceIndex).end); + + for (LayoutToken token : curParagraphTokens) { + if (StringUtils.isEmpty(token.getText())) + continue; + + int newPos = sentenceChunk.indexOf(token.getText(), pos); + if ((newPos != -1) || SentenceUtilities.toSkipToken(token.getText())) { + // just move on + currentSentenceTokens.add(token); + if (newPos != -1 && !SentenceUtilities.toSkipToken(token.getText())) + pos = newPos; + } else { + if (currentSentenceTokens.size() > 0) { + segmentedParagraphTokens.add(currentSentenceTokens); + currentSentenceIndex++; + if (currentSentenceIndex >= sentencesOffsetPosition.size()) { + currentSentenceTokens = new ArrayList<>(); + break; + } + sentenceChunk = text.substring(sentencesOffsetPosition.get(currentSentenceIndex).start, sentencesOffsetPosition.get(currentSentenceIndex).end); + } + currentSentenceTokens = new ArrayList<>(); + currentSentenceTokens.add(token); + pos = 0; + } + + if (currentSentenceIndex >= sentencesOffsetPosition.size()) + break; + } + // last sentence + if 
(currentSentenceTokens.size() > 0) { + // check sentence index too ? + segmentedParagraphTokens.add(currentSentenceTokens); + } + return segmentedParagraphTokens; + } + + protected List> extractStylesList(List tokenList) { + List> styleList = new ArrayList<>(); + String previousStyleName = ""; + StringBuilder temporaryText = new StringBuilder(); + StringBuilder value = new StringBuilder(); + + for (int index = 0; index < tokenList.size(); index++) { + LayoutToken token = tokenList.get(index); + int startOffset = temporaryText.toString().length(); + temporaryText.append(token.getText()); + int endOffset = temporaryText.toString().length(); + + StringBuilder styleName = new StringBuilder(); + if (token.isBold()) { + styleName.append("bold").append(" "); + } + + if (token.isItalic()) { + styleName.append("italic").append(" "); + } + + if(token.isSuperscript()) { + styleName.append("superscript"); + } else if(token.isSubscript()) { + styleName.append("subscript"); + } + + String styleNameTrimmed = StringUtils.trim(styleName.toString()); + value.append(token.getText()); + + if (StringUtils.isEmpty(styleNameTrimmed)) { + previousStyleName = styleNameTrimmed; + value = new StringBuilder(); + continue; + } + + if (styleNameTrimmed.equals(previousStyleName)) { + Iterables.getLast(styleList).getRight().end = endOffset; + } else { + styleList.add(Triple.of(styleNameTrimmed, value.toString(), new OffsetPosition(startOffset, endOffset))); + } + + previousStyleName = styleNameTrimmed; + } + + return styleList; + } /** * Return the graphic objects in a given interval position in the document. 
@@ -1555,26 +1698,46 @@ private List getGraphicObject(List graphicObjects, return result; } - private org.grobid.core.utilities.Pair getSectionNumber(String text) { + protected Pair, String> getSectionNumber(List tokens) { + + String text = LayoutTokensUtil.toText(tokens); + Matcher m1 = BasicStructureBuilder.headerNumbering1.matcher(text); Matcher m2 = BasicStructureBuilder.headerNumbering2.matcher(text); Matcher m3 = BasicStructureBuilder.headerNumbering3.matcher(text); Matcher m = null; + OffsetPosition position = null; String numb = null; if (m1.find()) { numb = m1.group(0); + position = new OffsetPosition(m1.start(), m1.end()); m = m1; } else if (m2.find()) { numb = m2.group(0); + position = new OffsetPosition(m2.start(), m2.end()); m = m2; } else if (m3.find()) { numb = m3.group(0); + position = new OffsetPosition(m3.start(), m3.end()); m = m3; } if (numb != null) { - text = text.replace(numb, "").trim(); + int lastPosition = 0; + StringBuilder acc = new StringBuilder(); + List tokensWithoutSectionNumbers = new ArrayList<>(); + for (int idx=0; idx < tokens.size(); idx++) { + if (!(lastPosition >= position.start && lastPosition < position.end )) { + if (!(tokensWithoutSectionNumbers.size() == 0 && tokens.get(idx).getText().equals(" "))) { + //adding a space at the beginning of the accumulator should be ignored + tokensWithoutSectionNumbers.add(tokens.get(idx)); + } + } + acc.append(tokens.get(idx).getText()); + lastPosition = acc.toString().length(); + } + numb = numb.replace(" ", ""); - return new org.grobid.core.utilities.Pair<>(text, numb); + return Pair.of(tokensWithoutSectionNumbers, numb); } else { return null; } @@ -1640,7 +1803,7 @@ public List markReferencesTEILuceneBased(List refTokens, if ( (refTokens == null) || (refTokens.size() == 0) ) return null; String text = LayoutTokensUtil.toText(refTokens); - if (text == null || text.trim().length() == 0 || text.endsWith("") || text.startsWith("") || text.startsWith("singletonList(new Text(text)); boolean 
spaceEnd = false; diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index bd4d3d118a..b7a1052538 100644 --- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -1,15 +1,22 @@ package org.grobid.core.document; +import nu.xom.Element; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.commons.lang3.tuple.Triple; +import org.grobid.core.analyzers.GrobidAnalyzer; +import org.grobid.core.document.xml.XmlBuilderUtils; +import org.grobid.core.engines.config.GrobidAnalysisConfig; +import org.grobid.core.layout.LayoutToken; import org.grobid.core.utilities.GrobidProperties; +import org.grobid.core.utilities.LayoutTokensUtil; +import org.grobid.core.utilities.OffsetPosition; import org.junit.BeforeClass; import org.junit.Test; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.List; import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.core.IsNull.notNullValue; -import static org.hamcrest.core.IsNull.nullValue; +import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.assertThat; public class TEIFormatterTest { @@ -19,4 +26,94 @@ public static void setInitialContext() throws Exception { GrobidProperties.getInstance(); } + @Test + public void testSegmentIntoSentences_simpleText_ShouldSplitIntoSentencesAndAddSTag() throws Exception { + String text = "One sentence. 
Second sentence."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + currentParagraph.appendChild(text); + + new TEIFormatter(null, null) + .segmentIntoSentences(currentParagraph, currentParagraphTokens, config, "en"); + + assertThat(currentParagraph.toXML(), is("

One sentence.Second sentence.

")); + assertThat(currentParagraph.getChildElements().size(), is(2)); + } + + @Test + public void testSegmentIntoSentences_Bold_ShouldSplitIntoSentencesAndAddSTag() throws Exception { + String text = "One sentence. Second sentence."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + currentParagraphTokens.get(0).setBold(true); + currentParagraphTokens.get(2).setBold(true); + currentParagraphTokens.get(2).setItalic(true); + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + currentParagraph.appendChild(text); + + new TEIFormatter(null, null) + .segmentIntoSentences(currentParagraph, currentParagraphTokens, config, "en"); + + assertThat(currentParagraph.toXML(), is("

One sentence.Second sentence.

")); + assertThat(currentParagraph.getChildElements().size(), is(2)); + } + + @Test + public void testExtractStylesList_1_shouldWork() throws Exception { + String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + currentParagraphTokens.get(26).setSubscript(true); + currentParagraphTokens.get(30).setSubscript(true); + + List> pairs = new TEIFormatter(null, null).extractStylesList(currentParagraphTokens); + + assertThat(pairs, hasSize(2)); + assertThat(pairs.get(0).getLeft(), is("subscript")); + assertThat(pairs.get(0).getMiddle(), is("2")); + assertThat(pairs.get(0).getRight().start, is(79)); + assertThat(pairs.get(0).getRight().end, is(80)); + + assertThat(pairs.get(1).getLeft(), is("subscript")); + assertThat(pairs.get(1).getMiddle(), is("14")); + assertThat(pairs.get(1).getRight().start, is(84)); + assertThat(pairs.get(1).getRight().end, is(86)); + + + } + + @Test + public void testGetSectionNumber_simple_ShouldWork() throws Exception { + String text = "3 Supercon 2"; + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + currentParagraphTokens.get(4).setSubscript(true); + Pair, String> sectionNumber = new TEIFormatter(null, null) + .getSectionNumber(currentParagraphTokens); + + String output = LayoutTokensUtil.toText(sectionNumber.getLeft()); + assertThat(output, is("Supercon 2")); + assertThat(sectionNumber.getRight(), is("3")); + } + + @Test + public void testGetSectionNumber_doubleSpace_ShouldWork() throws Exception { + String text = "3 Supercon 2"; + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List< LayoutToken> currentParagraphTokens = 
GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + currentParagraphTokens.get(6).setSubscript(true); + Pair, String> sectionNumber = new TEIFormatter(null, null) + .getSectionNumber(currentParagraphTokens); + + String output = LayoutTokensUtil.toText(sectionNumber.getLeft()); + assertThat(output, is("Supercon 2")); + assertThat(sectionNumber.getRight(), is("3")); + } + } \ No newline at end of file From e3986861b88c584267a8a149b4092546ed0324ff Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 22 Jul 2022 13:59:54 +0900 Subject: [PATCH 02/23] correct missing paragraphs --- .../org/grobid/core/document/TEIFormatter.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 5a273b4d50..197828c3d4 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1171,7 +1171,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, if (CollectionUtils.isNotEmpty(stylesList)) { applyStyleList(head, text, stylesList); } else { - head.appendChild(text); + head.appendChild(StringUtils.normalizeSpace(text.replace("\n", ""))); } if (config.isGenerateTeiIds()) { @@ -1222,7 +1222,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, if (CollectionUtils.isNotEmpty(stylesList)) { applyStyleList(itemNode, text, stylesList); } else { - itemNode.appendChild(text); + itemNode.appendChild(StringUtils.normalizeSpace(text)); } if (!MARKER_LABELS.contains(lastClusterLabel) && (lastClusterLabel != TaggingLabels.ITEM)) { @@ -1260,10 +1260,10 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, List> stylesList = extractStylesList(dehyphenized); - if (CollectionUtils.isEmpty(stylesList)) { - - } else { + if (CollectionUtils.isNotEmpty(stylesList)) { applyStyleList(curParagraph, text, 
stylesList); + } else { + curParagraph.appendChild(StringUtils.normalizeSpace(text)); } curParagraphTokens.addAll(cluster.concatTokens()); } else if (MARKER_LABELS.contains(clusterLabel)) { @@ -1389,16 +1389,16 @@ private Element applyStyleList(Element paragraphElem, String paragraphText, List String subString = paragraphText.substring(lastPosition, offsetStyle.start); String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : ""; String suffixSpace = StringUtils.endsWith(subString, " ") ? " " : ""; - paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString) + suffixSpace); + paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " ")) + suffixSpace); Element rend = teiElement("hi"); rend.addAttribute(new Attribute("rend", style.getLeft())); - rend.appendChild(StringUtils.normalizeSpace(paragraphText.substring(offsetStyle.start, offsetStyle.end))); + rend.appendChild(StringUtils.normalizeSpace(paragraphText.substring(offsetStyle.start, offsetStyle.end).replace("\n", " "))); lastPosition = offsetStyle.end; paragraphElem.appendChild(rend); } String subString = paragraphText.substring(lastPosition); String prefixSpace = StringUtils.startsWith(subString, " ") ? 
" " : ""; - paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString)); + paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " "))); return paragraphElem; } From e6ba12be4d28e699de745c1013189a8ca356face Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 22 Jul 2022 15:21:50 +0900 Subject: [PATCH 03/23] add style to figure and table caption --- .../java/org/grobid/core/data/Figure.java | 22 +++++++++++++------ .../main/java/org/grobid/core/data/Table.java | 17 ++++++++++++-- .../grobid/core/document/TEIFormatter.java | 5 ++--- .../core/document/xml/XmlBuilderUtils.java | 2 +- 4 files changed, 33 insertions(+), 13 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java index e9417e9217..b4784a8e70 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Figure.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Figure.java @@ -5,6 +5,7 @@ import com.google.common.collect.Lists; import com.google.common.base.Joiner; +import org.apache.commons.lang3.tuple.Triple; import org.grobid.core.GrobidModels; import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; @@ -17,12 +18,9 @@ import org.grobid.core.layout.GraphicObjectType; import org.grobid.core.layout.LayoutToken; import org.grobid.core.layout.VectorGraphicBoxCalculator; -import org.grobid.core.utilities.BoundingBoxCalculator; -import org.grobid.core.utilities.LayoutTokensUtil; -import org.grobid.core.utilities.TextUtilities; +import org.grobid.core.utilities.*; import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; -import org.grobid.core.utilities.KeyGen; import org.grobid.core.engines.label.TaggingLabels; import org.grobid.core.engines.label.TaggingLabel; import org.grobid.core.engines.citations.CalloutAnalyzer.MarkerType; @@ -41,6 +39,8 @@ import 
java.util.SortedSet; import java.util.Collections; +import static org.grobid.core.document.TEIFormatter.applyStyleList; +import static org.grobid.core.document.TEIFormatter.extractStylesList; import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId; import static org.grobid.core.document.xml.XmlBuilderUtils.textNode; @@ -388,7 +388,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form // if the segment has been parsed with the full text model we further extract the clusters // to get the bibliographical references - if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) { + if (StringUtils.isNotEmpty(labeledCaption)) { TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens); List clusters = clusteror.cluster(); @@ -404,7 +404,9 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form TaggingLabel clusterLabel = cluster.getTaggingLabel(); //String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens()); - String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens()); + List dehyphenized = LayoutTokensUtil.dehyphenize(cluster.concatTokens()); + String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " "); + if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) { try { List refNodes = formatter.markReferencesTEILuceneBased( @@ -422,7 +424,13 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form LOGGER.warn("Problem when serializing TEI fragment for figure caption", e); } } else { - desc.appendChild(textNode(clusterContent)); + List> stylesList = extractStylesList(dehyphenized); + + if (CollectionUtils.isNotEmpty(stylesList)) { + applyStyleList(desc, text, stylesList); + } else { + desc.appendChild(StringUtils.normalizeSpace(text)); + } } } } else { diff --git 
a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 95b0bf8704..9a9aa6cf3e 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -1,5 +1,7 @@ package org.grobid.core.data; +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.tuple.Triple; import org.grobid.core.GrobidModels; import org.apache.commons.lang3.StringUtils; import org.grobid.core.data.table.Cell; @@ -15,6 +17,7 @@ import org.grobid.core.layout.LayoutToken; import org.grobid.core.utilities.BoundingBoxCalculator; import org.grobid.core.utilities.LayoutTokensUtil; +import org.grobid.core.utilities.OffsetPosition; import org.grobid.core.utilities.counters.CntManager; import org.grobid.core.engines.counters.TableRejectionCounters; import org.grobid.core.tokenization.TaggingTokenCluster; @@ -32,6 +35,8 @@ import nu.xom.Node; import nu.xom.Text; +import static org.grobid.core.document.TEIFormatter.applyStyleList; +import static org.grobid.core.document.TEIFormatter.extractStylesList; import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId; import static org.grobid.core.document.xml.XmlBuilderUtils.textNode; @@ -119,7 +124,9 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form TaggingLabel clusterLabel = cluster.getTaggingLabel(); //String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens()); - String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens()); + List dehyphenized = LayoutTokensUtil.dehyphenize(cluster.concatTokens()); + String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " "); + if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) { try { List refNodes = formatter.markReferencesTEILuceneBased( @@ -137,7 +144,13 @@ public String 
toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form LOGGER.warn("Problem when serializing TEI fragment for table caption", e); } } else { - desc.appendChild(textNode(clusterContent)); + List> stylesList = extractStylesList(dehyphenized); + + if (CollectionUtils.isNotEmpty(stylesList)) { + applyStyleList(desc, text, stylesList); + } else { + desc.appendChild(StringUtils.normalizeSpace(text)); + } } if (desc != null && config.isWithSentenceSegmentation()) { diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 197828c3d4..1b7c12dacd 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1377,7 +1377,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, return buffer; } - private Element applyStyleList(Element paragraphElem, String paragraphText, List> stylesList) { + public static Element applyStyleList(Element paragraphElem, String paragraphText, List> stylesList) { // if (CollectionUtils.isEmpty(stylesList)) { // paragraphElem.appendChild(StringUtils.normalizeSpace(paragraphText)); // return paragraphElem; @@ -1515,7 +1515,6 @@ public void segmentIntoSentences(Element curParagraph, List curPara k++; } }*/ - } // update the xml paragraph element int currenChildIndex = 0; @@ -1634,7 +1633,7 @@ private List> segmentLayoutTokenLists(List curPar return segmentedParagraphTokens; } - protected List> extractStylesList(List tokenList) { + public static List> extractStylesList(List tokenList) { List> styleList = new ArrayList<>(); String previousStyleName = ""; StringBuilder temporaryText = new StringBuilder(); diff --git a/grobid-core/src/main/java/org/grobid/core/document/xml/XmlBuilderUtils.java b/grobid-core/src/main/java/org/grobid/core/document/xml/XmlBuilderUtils.java index 5d4850f94e..0c549078df 100644 --- 
a/grobid-core/src/main/java/org/grobid/core/document/xml/XmlBuilderUtils.java +++ b/grobid-core/src/main/java/org/grobid/core/document/xml/XmlBuilderUtils.java @@ -102,7 +102,7 @@ public static void main(String[] args) throws ParsingException, IOException { } public static String stripNonValidXMLCharacters(String in) { - StringBuffer out = new StringBuffer(); // Used to hold the output. + StringBuilder out = new StringBuilder(); // Used to hold the output. char current; // Used to reference the current character. if (in == null || ("".equals(in))) From 17a914666bf24732454ae6f8d58eb279d5a54b2e Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 22 Jul 2022 15:38:34 +0900 Subject: [PATCH 04/23] add style to title --- .../grobid/core/document/TEIFormatter.java | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 1b7c12dacd..1b30dac357 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -155,7 +155,33 @@ public StringBuilder toTEIHeader(BiblioItem biblio, } if (biblio.getTitle() != null) { - tei.append(TextUtilities.HTMLEncode(biblio.getTitle())); + List layoutTokens = biblio.getLayoutTokens(TaggingLabels.HEADER_TITLE); + + String text = LayoutTokensUtil.toText(layoutTokens).replace("\n", " "); + + List> stylesList = extractStylesList(layoutTokens); + + if (CollectionUtils.isNotEmpty(stylesList)) { + int lastPosition = 0; + for (Triple style : stylesList) { + OffsetPosition offsetStyle = style.getRight(); + String subString = text.substring(lastPosition, offsetStyle.start); + String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : ""; + String suffixSpace = StringUtils.endsWith(subString, " ") ? 
" " : ""; + tei.append(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " ")) + suffixSpace); + tei.append("") + .append(StringUtils.normalizeSpace(text.substring(offsetStyle.start, offsetStyle.end).replace("\n", " "))) + .append(""); + lastPosition = offsetStyle.end; + } + String subString = text.substring(lastPosition); + String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : ""; + tei.append(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " "))); + + } else { + String title = biblio.getTitle(); + tei.append(TextUtilities.HTMLEncode(title)); + } } tei.append("\n\t\t\t\n"); From cc3a0e5494ace00b71ef1fe2ee6a5cda5c4c5d53 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 25 Jul 2022 08:48:10 +0900 Subject: [PATCH 05/23] wrongly inverted if --- .../src/main/java/org/grobid/core/document/TEIFormatter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 1b30dac357..dd7f1959f0 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1215,7 +1215,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, clusterLabel.equals(TaggingLabels.EQUATION_LABEL)) { // get starting position of the cluster int start = -1; - if ( CollectionUtils.isEmpty(cluster.concatTokens()) ) { + if ( CollectionUtils.isNotEmpty(cluster.concatTokens()) ) { start = cluster.concatTokens().get(0).getOffset(); } // get the corresponding equation From d5ae544febd66739b48a847e1f9c64677467986f Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 25 Jul 2022 08:57:34 +0900 Subject: [PATCH 06/23] missing parenthesis --- .../java/org/grobid/core/document/TEIFormatter.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index dd7f1959f0..8c6f37fd5c 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1486,7 +1486,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara pos = 0; if (config.isGenerateTeiCoordinates("s")) { - + int currentSentenceIndex = 0; //System.out.println(text); //System.out.println("theSentences.size(): " + theSentences.size()); @@ -1494,7 +1494,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara for(int i=0; i curPara currentSentenceTokens.add(token); pos = 0; } - + if (currentSentenceIndex >= theSentences.size()) break; } @@ -1541,7 +1541,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara k++; } }*/ - + } // update the xml paragraph element int currenChildIndex = 0; pos = 0; @@ -1564,12 +1564,12 @@ public void segmentIntoSentences(Element curParagraph, List curPara } } } - + int sentenceLength = theSentences.get(i).end - pos; // check if we have a ref between pos and pos+sentenceLength for(int j=refIndex; j= pos+posInSentence && refPos <= pos+sentenceLength) { From 064c2f6bcee3978e9b7ba96cf824c8225f0a2ad8 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 25 Jul 2022 10:04:27 +0900 Subject: [PATCH 07/23] add decoration in equation/formula --- .../java/org/grobid/core/data/Equation.java | 32 +++++++++-- .../grobid/core/document/TEIFormatter.java | 25 +++++---- .../grobid/core/engines/FullTextParser.java | 7 +-- .../core/document/TEIFormatterTest.java | 54 ++++++++++++++++++- 4 files changed, 99 insertions(+), 19 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Equation.java b/grobid-core/src/main/java/org/grobid/core/data/Equation.java index 141660d848..753e3d5908 100644 --- 
a/grobid-core/src/main/java/org/grobid/core/data/Equation.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Equation.java @@ -2,7 +2,9 @@ import nu.xom.Attribute; import nu.xom.Element; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Triple; import org.grobid.core.document.xml.XmlBuilderUtils; import org.grobid.core.engines.Engine; import org.grobid.core.engines.config.GrobidAnalysisConfig; @@ -10,13 +12,17 @@ import org.grobid.core.layout.LayoutToken; import org.grobid.core.utilities.BoundingBoxCalculator; import org.grobid.core.utilities.LayoutTokensUtil; +import org.grobid.core.utilities.OffsetPosition; import org.grobid.core.utilities.counters.CntManager; import org.grobid.core.utilities.TextUtilities; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.SortedSet; +import static org.grobid.core.document.TEIFormatter.*; + /** * Class for representing an equation. 
* @@ -56,9 +62,15 @@ public Element toTEIElement(GrobidAnalysisConfig config) { XmlBuilderUtils.addCoords(formulaElement, LayoutTokensUtil.getCoordsStringForOneBox(getLayoutTokens())); } - formulaElement.appendChild(LayoutTokensUtil.normalizeText(content.toString()).trim()); + List> stylesList = extractStylesList(getContentTokens(), Arrays.asList(TEI_STYLE_BOLD_NAME, TEI_STYLE_ITALIC_NAME)) ; + + if (CollectionUtils.isNotEmpty(stylesList)) { + applyStyleList(formulaElement, getContent(), stylesList); + } else { + formulaElement.appendChild(LayoutTokensUtil.normalizeText(content.toString()).trim()); + } - if ( (label != null) && (label.length()>0) ) { + if ( StringUtils.isNotEmpty(label) ) { Element labelEl = XmlBuilderUtils.teiElement("label", LayoutTokensUtil.normalizeText(label.toString())); formulaElement.appendChild(labelEl); @@ -79,6 +91,16 @@ public List getContentTokens() { return contentTokens; } + public void addContentTokens(List tokens) { + if (tokens == null) + return; + + if (contentTokens == null) + contentTokens = new ArrayList<>(); + + contentTokens.addAll(tokens); + } + public List getLabelTokens() { return labelTokens; } @@ -181,9 +203,9 @@ public void addLayoutTokens(List tokens) { if (tokens == null) return; if (layoutTokens == null) - layoutTokens = new ArrayList(); - for(LayoutToken token : tokens) - layoutTokens.add(token); + layoutTokens = new ArrayList<>(); + + layoutTokens.addAll(tokens); } public List getCoordinates() { diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 8c6f37fd5c..6845dccd96 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -57,6 +57,10 @@ @SuppressWarnings("StringConcatenationInsideStringBuilderAppend") public class TEIFormatter { private static final Logger LOGGER = 
LoggerFactory.getLogger(TEIFormatter.class); + public static final String TEI_STYLE_ITALIC_NAME = "italic"; + public static String TEI_STYLE_BOLD_NAME = "bold"; + public static String TEI_STYLE_SUPERSCRIPT_NAME = "superscript"; + public static String TEI_STYLE_SUBSCRIPT_NAME = "subscript"; private Document doc = null; private FullTextParser fullTextParser = null; @@ -1221,7 +1225,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, // get the corresponding equation if (start != -1) { Equation theEquation = null; - if (equations != null) { + if (CollectionUtils.isNotEmpty(equations)) { for(int i=0; i> segmentLayoutTokenLists(List curPar } public static List> extractStylesList(List tokenList) { + return extractStylesList(tokenList, new ArrayList<>()); + } + public static List> extractStylesList(List tokenList, List ignoreStyles) { List> styleList = new ArrayList<>(); String previousStyleName = ""; StringBuilder temporaryText = new StringBuilder(); @@ -1672,18 +1679,18 @@ public static List> extractStylesList(Lis int endOffset = temporaryText.toString().length(); StringBuilder styleName = new StringBuilder(); - if (token.isBold()) { - styleName.append("bold").append(" "); + if (token.isBold() && !ignoreStyles.contains(TEI_STYLE_BOLD_NAME)) { + styleName.append(TEI_STYLE_BOLD_NAME).append(" "); } - if (token.isItalic()) { - styleName.append("italic").append(" "); + if (token.isItalic() && !ignoreStyles.contains(TEI_STYLE_ITALIC_NAME)) { + styleName.append(TEI_STYLE_ITALIC_NAME).append(" "); } - if(token.isSuperscript()) { - styleName.append("superscript"); - } else if(token.isSubscript()) { - styleName.append("subscript"); + if(token.isSuperscript() && !ignoreStyles.contains(TEI_STYLE_SUPERSCRIPT_NAME)) { + styleName.append(TEI_STYLE_SUPERSCRIPT_NAME); + } else if(token.isSubscript() && !ignoreStyles.contains(TEI_STYLE_SUBSCRIPT_NAME)) { + styleName.append(TEI_STYLE_SUBSCRIPT_NAME); } String styleNameTrimmed = StringUtils.trim(styleName.toString()); diff 
--git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 5febe8ea14..980c17a14c 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -2272,7 +2272,7 @@ protected List processEquations(String rese, } List tokenizationEquation = cluster.concatTokens(); - String clusterContent = LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(cluster.concatTokens())); + String clusterContent = LayoutTokensUtil.toText(cluster.concatTokens()); if (currentResult == null) currentResult = new Equation(); @@ -2288,10 +2288,11 @@ protected List processEquations(String rese, currentResult = new Equation(); } currentResult.appendContent(clusterContent); - currentResult.addLayoutTokens(cluster.concatTokens()); + currentResult.addLayoutTokens(tokenizationEquation); + currentResult.addContentTokens(tokenizationEquation); } else if (clusterLabel.equals(TaggingLabels.EQUATION_LABEL)) { currentResult.appendLabel(clusterContent); - currentResult.addLayoutTokens(cluster.concatTokens()); + currentResult.addLayoutTokens(tokenizationEquation); } lastLabel = clusterLabel; diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index b7a1052538..e652846343 100644 --- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -13,8 +13,10 @@ import org.junit.BeforeClass; import org.junit.Test; +import java.util.Arrays; import java.util.List; +import static org.grobid.core.document.TEIFormatter.TEI_STYLE_BOLD_NAME; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.assertThat; @@ -62,7 +64,7 @@ public void 
testSegmentIntoSentences_Bold_ShouldSplitIntoSentencesAndAddSTag() t } @Test - public void testExtractStylesList_1_shouldWork() throws Exception { + public void testExtractStylesList_single_shouldWork() throws Exception { String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); @@ -70,7 +72,7 @@ public void testExtractStylesList_1_shouldWork() throws Exception { currentParagraphTokens.get(26).setSubscript(true); currentParagraphTokens.get(30).setSubscript(true); - List> pairs = new TEIFormatter(null, null).extractStylesList(currentParagraphTokens); + List> pairs = TEIFormatter.extractStylesList(currentParagraphTokens); assertThat(pairs, hasSize(2)); assertThat(pairs.get(0).getLeft(), is("subscript")); @@ -82,8 +84,56 @@ public void testExtractStylesList_1_shouldWork() throws Exception { assertThat(pairs.get(1).getMiddle(), is("14")); assertThat(pairs.get(1).getRight().start, is(84)); assertThat(pairs.get(1).getRight().end, is(86)); + } + + @Test + public void testExtractStylesList_combined_shouldWork() throws Exception { + String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + currentParagraphTokens.get(26).setSubscript(true); + currentParagraphTokens.get(26).setBold(true); + currentParagraphTokens.get(26).setItalic(true); + currentParagraphTokens.get(30).setSubscript(true); + + List> pairs = TEIFormatter.extractStylesList(currentParagraphTokens); + + assertThat(pairs, hasSize(2)); + assertThat(pairs.get(0).getLeft(), is("bold italic subscript")); + 
assertThat(pairs.get(0).getMiddle(), is("2")); + assertThat(pairs.get(0).getRight().start, is(79)); + assertThat(pairs.get(0).getRight().end, is(80)); + + assertThat(pairs.get(1).getLeft(), is("subscript")); + assertThat(pairs.get(1).getMiddle(), is("14")); + assertThat(pairs.get(1).getRight().start, is(84)); + assertThat(pairs.get(1).getRight().end, is(86)); + } + + @Test + public void testExtractStylesList_ignoreBold_shouldWork() throws Exception { + String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + currentParagraphTokens.get(26).setSubscript(true); + currentParagraphTokens.get(26).setBold(true); + currentParagraphTokens.get(26).setItalic(true); + currentParagraphTokens.get(30).setSubscript(true); + List> pairs = TEIFormatter.extractStylesList(currentParagraphTokens, Arrays.asList(TEI_STYLE_BOLD_NAME)); + assertThat(pairs, hasSize(2)); + assertThat(pairs.get(0).getLeft(), is("italic subscript")); + assertThat(pairs.get(0).getMiddle(), is("2")); + assertThat(pairs.get(0).getRight().start, is(79)); + assertThat(pairs.get(0).getRight().end, is(80)); + + assertThat(pairs.get(1).getLeft(), is("subscript")); + assertThat(pairs.get(1).getMiddle(), is("14")); + assertThat(pairs.get(1).getRight().start, is(84)); + assertThat(pairs.get(1).getRight().end, is(86)); } @Test From c5f607b33c64b2f5bff71fa862f146f01180f63b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 25 Jul 2022 11:42:55 +0900 Subject: [PATCH 08/23] Fix spaces --- .../grobid/core/document/TEIFormatter.java | 13 ++++-- .../grobid/core/engines/FullTextParser.java | 2 + .../core/document/TEIFormatterTest.java | 46 ++++++++++++++++--- 3 files changed, 49 insertions(+), 12 deletions(-) diff --git 
a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 6845dccd96..1a59321547 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1407,7 +1407,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, return buffer; } - public static Element applyStyleList(Element paragraphElem, String paragraphText, List> stylesList) { + public static Element applyStyleList(Element paragraphElem, String text, List> stylesList) { // if (CollectionUtils.isEmpty(stylesList)) { // paragraphElem.appendChild(StringUtils.normalizeSpace(paragraphText)); // return paragraphElem; @@ -1416,17 +1416,20 @@ public static Element applyStyleList(Element paragraphElem, String paragraphText int lastPosition = 0; for (Triple style : stylesList) { OffsetPosition offsetStyle = style.getRight(); - String subString = paragraphText.substring(lastPosition, offsetStyle.start); + String subString = text.substring(lastPosition, offsetStyle.start); String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : ""; - String suffixSpace = StringUtils.endsWith(subString, " ") ? " " : ""; + String suffixSpace = ""; + if (subString.length() > 1) { + suffixSpace = StringUtils.endsWith(subString, " ") ? 
" " : ""; + } paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " ")) + suffixSpace); Element rend = teiElement("hi"); rend.addAttribute(new Attribute("rend", style.getLeft())); - rend.appendChild(StringUtils.normalizeSpace(paragraphText.substring(offsetStyle.start, offsetStyle.end).replace("\n", " "))); + rend.appendChild(StringUtils.normalizeSpace(text.substring(offsetStyle.start, offsetStyle.end).replace("\n", " "))); lastPosition = offsetStyle.end; paragraphElem.appendChild(rend); } - String subString = paragraphText.substring(lastPosition); + String subString = text.substring(lastPosition); String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : ""; paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " "))); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 980c17a14c..4aaa818803 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -2272,6 +2272,8 @@ protected List processEquations(String rese, } List tokenizationEquation = cluster.concatTokens(); + //LF: I removed the normalisation to keep the content in sync with contentTokens. 
+ // The normalisation "StringUtils.normalizeSpace()" is called anyway when building the XML String clusterContent = LayoutTokensUtil.toText(cluster.concatTokens()); if (currentResult == null) diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index e652846343..bed87b58da 100644 --- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -13,6 +13,7 @@ import org.junit.BeforeClass; import org.junit.Test; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -33,7 +34,7 @@ public void testSegmentIntoSentences_simpleText_ShouldSplitIntoSentencesAndAddST String text = "One sentence. Second sentence."; GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); - List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); Element currentParagraph = XmlBuilderUtils.teiElement("p"); currentParagraph.appendChild(text); @@ -49,7 +50,7 @@ public void testSegmentIntoSentences_Bold_ShouldSplitIntoSentencesAndAddSTag() t String text = "One sentence. 
Second sentence."; GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); - List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); currentParagraphTokens.get(0).setBold(true); currentParagraphTokens.get(2).setBold(true); currentParagraphTokens.get(2).setItalic(true); @@ -67,7 +68,7 @@ public void testSegmentIntoSentences_Bold_ShouldSplitIntoSentencesAndAddSTag() t public void testExtractStylesList_single_shouldWork() throws Exception { String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); - List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); currentParagraphTokens.get(26).setSubscript(true); currentParagraphTokens.get(30).setSubscript(true); @@ -86,11 +87,42 @@ public void testExtractStylesList_single_shouldWork() throws Exception { assertThat(pairs.get(1).getRight().end, is(86)); } + @Test + public void applyStyleList_simpleStyles_shouldWork() throws Exception { + String text = "This is bold and italic."; + List> styles = new ArrayList<>(); + styles.add(Triple.of("bold", "bold", new OffsetPosition(8, 12))); + styles.add(Triple.of("italic", "italic", new OffsetPosition(17, 23))); + Element rootElement = XmlBuilderUtils.teiElement("p"); + TEIFormatter.applyStyleList(rootElement, text, styles); + + assertThat(rootElement.toXML(), is("

This is " + + "bold and italic.

")); + } + + @Test + public void applyStyleList_complexStyles_shouldWork() throws Exception { + String text = "This is bold and italic."; + List> styles = new ArrayList<>(); + styles.add(Triple.of("subscript", "is", new OffsetPosition(5, 7))); + styles.add(Triple.of("bold subscript", "bold", new OffsetPosition(8, 12))); + styles.add(Triple.of("italic superscript", "and", new OffsetPosition(13, 16))); + styles.add(Triple.of("italic", "italic", new OffsetPosition(17, 23))); + Element rootElement = XmlBuilderUtils.teiElement("p"); + TEIFormatter.applyStyleList(rootElement, text, styles); + + assertThat(rootElement.toXML(), is("

This " + + "is " + + "bold " + + "and " + + "italic.

")); + } + @Test public void testExtractStylesList_combined_shouldWork() throws Exception { String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); - List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); currentParagraphTokens.get(26).setSubscript(true); currentParagraphTokens.get(26).setBold(true); @@ -115,7 +147,7 @@ public void testExtractStylesList_combined_shouldWork() throws Exception { public void testExtractStylesList_ignoreBold_shouldWork() throws Exception { String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); - List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); currentParagraphTokens.get(26).setSubscript(true); currentParagraphTokens.get(26).setBold(true); @@ -140,7 +172,7 @@ public void testExtractStylesList_ignoreBold_shouldWork() throws Exception { public void testGetSectionNumber_simple_ShouldWork() throws Exception { String text = "3 Supercon 2"; GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); - List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); currentParagraphTokens.get(4).setSubscript(true); Pair, String> sectionNumber = new TEIFormatter(null, null) @@ -155,7 +187,7 @@ public void testGetSectionNumber_simple_ShouldWork() throws Exception { public void testGetSectionNumber_doubleSpace_ShouldWork() throws Exception { String 
text = "3 Supercon 2"; GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); - List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); currentParagraphTokens.get(6).setSubscript(true); Pair, String> sectionNumber = new TEIFormatter(null, null) From 386e6b4e48ec8effaba43188870da16866c0afa7 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 25 Jul 2022 11:48:23 +0900 Subject: [PATCH 09/23] add comments --- .../java/org/grobid/core/document/TEIFormatter.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 1a59321547..579ccb722c 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1407,6 +1407,10 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, return buffer; } + /** + * Apply the styles as described in the stylesList. + * This method modifies the input paragraphElem. + */ public static Element applyStyleList(Element paragraphElem, String text, List> stylesList) { // if (CollectionUtils.isEmpty(stylesList)) { // paragraphElem.appendChild(StringUtils.normalizeSpace(paragraphText)); @@ -1669,6 +1673,12 @@ private List> segmentLayoutTokenLists(List curPar public static List> extractStylesList(List tokenList) { return extractStylesList(tokenList, new ArrayList<>()); } + + + /** + * Extracts the styles from the list of tokens. The additional parameter can ignore certain styles + * (e.g. 
to restrict only superscript/subscript when decorating formulas) + */ public static List> extractStylesList(List tokenList, List ignoreStyles) { List> styleList = new ArrayList<>(); String previousStyleName = ""; From 44e70f383c721361e69de65b6f24e85bb39c817d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 25 Jul 2022 15:06:46 +0900 Subject: [PATCH 10/23] add some more tests --- .../core/document/TEIFormatterTest.java | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index bed87b58da..9a64b8141f 100644 --- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -64,6 +64,34 @@ public void testSegmentIntoSentences_Bold_ShouldSplitIntoSentencesAndAddSTag() t assertThat(currentParagraph.getChildElements().size(), is(2)); } + @Test + public void testSegmentIntoSentences_Bold_ShouldWork() throws Exception { + String text = "One sentence (Foppiano et al.). Second sentence (Lopez et al.). "; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + currentParagraphTokens.get(0).setBold(true); + currentParagraphTokens.get(2).setBold(true); + currentParagraphTokens.get(2).setItalic(true); + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + currentParagraph.appendChild("One sentence"); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Foppiano et al.)")); + currentParagraph.appendChild(". 
"); + currentParagraph.appendChild("Second sentence"); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); + currentParagraph.appendChild("."); + + System.out.println(currentParagraph.toXML()); + + new TEIFormatter(null, null) + .segmentIntoSentences(currentParagraph, currentParagraphTokens, config, "en"); + + assertThat(currentParagraph.toXML(), + is("

One sentence (Foppiano et al.).Second sentence (Lopez et al.).

")); + } + @Test public void testExtractStylesList_single_shouldWork() throws Exception { String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; From a090297f761ce35181cb8ad398e0c16872d28362 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 25 Jul 2022 15:22:10 +0900 Subject: [PATCH 11/23] some refactoring --- .../grobid/core/document/TEIFormatter.java | 65 ++++--------------- 1 file changed, 13 insertions(+), 52 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 579ccb722c..aa98bd1019 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1454,7 +1454,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara // in xom, the following gives all the text under the element, for the whole subtree String text = curParagraph.getValue(); - if (text == null || text.length() == 0) + if (StringUtils.isEmpty(text)) return; // identify ref nodes, ref spans and ref positions @@ -1481,7 +1481,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara } } - List theSentences = + List sentencesOffsetPosition = SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang)); /*if (theSentences.size() == 0) { @@ -1492,50 +1492,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara }*/ // segment the list of layout tokens according to the sentence segmentation if the coordinates are needed - List> segmentedParagraphTokens = new ArrayList<>(); - List currentSentenceTokens = new ArrayList<>(); - pos = 0; - - if (config.isGenerateTeiCoordinates("s")) { - - int currentSentenceIndex = 0; -//System.out.println(text); -//System.out.println("theSentences.size(): " + theSentences.size()); - 
String sentenceChunk = text.substring(theSentences.get(currentSentenceIndex).start, theSentences.get(currentSentenceIndex).end); - - for(int i=0; i 0) { - segmentedParagraphTokens.add(currentSentenceTokens); - currentSentenceIndex++; - if (currentSentenceIndex >= theSentences.size()) { - currentSentenceTokens = new ArrayList<>(); - break; - } - sentenceChunk = text.substring(theSentences.get(currentSentenceIndex).start, theSentences.get(currentSentenceIndex).end); - } - currentSentenceTokens = new ArrayList<>(); - currentSentenceTokens.add(token); - pos = 0; - } - - if (currentSentenceIndex >= theSentences.size()) - break; - } - // last sentence - if (currentSentenceTokens.size() > 0) { - // check sentence index too ? - segmentedParagraphTokens.add(currentSentenceTokens); - } + List> segmentedParagraphTokens = segmentLayoutTokenLists(curParagraphTokens, text, sentencesOffsetPosition); /*if (segmentedParagraphTokens.size() != theSentences.size()) { System.out.println("ERROR, segmentedParagraphTokens size:" + segmentedParagraphTokens.size() + " vs theSentences size: " + theSentences.size()); @@ -1552,16 +1509,20 @@ public void segmentIntoSentences(Element curParagraph, List curPara k++; } }*/ - } + + // update the xml paragraph element int currenChildIndex = 0; pos = 0; int posInSentence = 0; int refIndex = 0; - for(int i=0; i currentSentenceTokens = segmentedParagraphTokens.get(i); + if (config.isGenerateTeiIds()) { String sID = KeyGen.getKey().substring(0, 7); addXmlId(sentenceElement, "_" + sID); @@ -1576,7 +1537,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara } } - int sentenceLength = theSentences.get(i).end - pos; + int sentenceLength = sentencesOffsetPosition.get(i).end - pos; // check if we have a ref between pos and pos+sentenceLength for(int j=refIndex; j curPara } } - if (pos+posInSentence <= theSentences.get(i).end) { - String local_text_chunk = text.substring(pos+posInSentence, theSentences.get(i).end); + if (pos + posInSentence 
<= sentencesOffsetPosition.get(i).end) { + String local_text_chunk = text.substring(pos + posInSentence, sentencesOffsetPosition.get(i).end); local_text_chunk = XmlBuilderUtils.stripNonValidXMLCharacters(local_text_chunk); sentenceElement.appendChild(local_text_chunk); curParagraph.appendChild(sentenceElement); From 599559e6f4dfa14d089d31a9bccaa9021b63b96b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 25 Jul 2022 15:40:19 +0900 Subject: [PATCH 12/23] minor changes --- .../java/org/grobid/core/document/TEIFormatter.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index aa98bd1019..06b4360347 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1522,6 +1522,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara Element sentenceElement = teiElement("s"); List currentSentenceTokens = segmentedParagraphTokens.get(i); +// List> styleList = extractStylesList(currentSentenceTokens); if (config.isGenerateTeiIds()) { String sID = KeyGen.getKey().substring(0, 7); @@ -1547,9 +1548,9 @@ public void segmentIntoSentences(Element curParagraph, List curPara if (refPos >= pos+posInSentence && refPos <= pos+sentenceLength) { Node valueNode = mapRefNodes.get(new Integer(refPos)); if (pos+posInSentence < refPos) { - String local_text_chunk = text.substring(pos+posInSentence, refPos); - local_text_chunk = XmlBuilderUtils.stripNonValidXMLCharacters(local_text_chunk); - sentenceElement.appendChild(local_text_chunk); + String localTextChunk = text.substring(pos+posInSentence, refPos); + localTextChunk = XmlBuilderUtils.stripNonValidXMLCharacters(localTextChunk); + sentenceElement.appendChild(localTextChunk); } valueNode.detach(); sentenceElement.appendChild(valueNode); @@ -1562,9 
+1563,9 @@ public void segmentIntoSentences(Element curParagraph, List curPara } if (pos + posInSentence <= sentencesOffsetPosition.get(i).end) { - String local_text_chunk = text.substring(pos + posInSentence, sentencesOffsetPosition.get(i).end); - local_text_chunk = XmlBuilderUtils.stripNonValidXMLCharacters(local_text_chunk); - sentenceElement.appendChild(local_text_chunk); + String localTextChunk = text.substring(pos + posInSentence, sentencesOffsetPosition.get(i).end); + localTextChunk = XmlBuilderUtils.stripNonValidXMLCharacters(localTextChunk); + sentenceElement.appendChild(localTextChunk); curParagraph.appendChild(sentenceElement); } } From eee28ab2e0bdd3ada837a418ae39c7985bcd3faa Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 25 Jul 2022 16:06:41 +0900 Subject: [PATCH 13/23] implement change when segmenting paragraphs in sentences --- .../grobid/core/document/TEIFormatter.java | 74 +++++++++++-------- .../core/document/TEIFormatterTest.java | 24 ++++++ 2 files changed, 68 insertions(+), 30 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 06b4360347..00a54e28dd 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -45,6 +45,7 @@ import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId; @@ -1457,29 +1458,14 @@ public void segmentIntoSentences(Element curParagraph, List curPara if (StringUtils.isEmpty(text)) return; - // identify ref nodes, ref spans and ref positions - Map mapRefNodes = new HashMap<>(); - List refPositions = new ArrayList<>(); - List forbiddenPositions = new ArrayList<>(); - int pos = 0; - for(int i=0; i> 
mapRefNodes = identifyNestedNodes(curParagraph); - String chunk = theNode.getValue(); - forbiddenPositions.add(new OffsetPosition(pos, pos+chunk.length())); - pos += chunk.length(); - } - } - } + List forbiddenPositions = mapRefNodes.entrySet() + .stream() + .map(entry -> new OffsetPosition(entry.getKey(), entry.getValue().getRight().length() + entry.getKey())) + .collect(Collectors.toList()); + + List refPositions = mapRefNodes.keySet().stream().sorted().collect(Collectors.toList()); List sentencesOffsetPosition = SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang)); @@ -1513,7 +1499,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara // update the xml paragraph element int currenChildIndex = 0; - pos = 0; + int pos = 0; int posInSentence = 0; int refIndex = 0; for(int i=0; i curPara Element sentenceElement = teiElement("s"); List currentSentenceTokens = segmentedParagraphTokens.get(i); -// List> styleList = extractStylesList(currentSentenceTokens); if (config.isGenerateTeiIds()) { String sID = KeyGen.getKey().substring(0, 7); @@ -1531,22 +1516,21 @@ public void segmentIntoSentences(Element curParagraph, List curPara if (config.isGenerateTeiCoordinates("s")) { if (segmentedParagraphTokens.size()>=i+1) { currentSentenceTokens = segmentedParagraphTokens.get(i); - String coords = LayoutTokensUtil.getCoordsString(currentSentenceTokens); - if (coords != null) { - sentenceElement.addAttribute(new Attribute("coords", coords)); - } + sentenceElement.addAttribute(new Attribute("coords", LayoutTokensUtil.getCoordsString(currentSentenceTokens))); } } + List> styleList = extractStylesList(currentSentenceTokens); + int sentenceLength = sentencesOffsetPosition.get(i).end - pos; // check if we have a ref between pos and pos+sentenceLength for(int j=refIndex; j= pos+posInSentence && refPos <= pos+sentenceLength) { - Node valueNode = mapRefNodes.get(new Integer(refPos)); + Node valueNode = 
mapRefNodes.get(refPos).getLeft(); if (pos+posInSentence < refPos) { String localTextChunk = text.substring(pos+posInSentence, refPos); localTextChunk = XmlBuilderUtils.stripNonValidXMLCharacters(localTextChunk); @@ -1584,6 +1568,36 @@ public void segmentIntoSentences(Element curParagraph, List curPara } + protected Map> identifyNestedNodes(Element curParagraph) { + // identify ref nodes, ref spans and ref positions + Map> mapRefNodes = new HashMap<>(); + + int pos = 0; + for(int i = 0; i< curParagraph.getChildCount(); i++) { + Node theNode = curParagraph.getChild(i); + if (theNode instanceof Text) { + String chunk = theNode.getValue(); + pos += chunk.length(); + } else if (theNode instanceof Element) { + // for readability in another conditional + if (((Element) theNode).getLocalName().equals("ref")) { + String chunk = theNode.getValue(); + // map character offset of the node and the chunk text + mapRefNodes.put(pos, Pair.of(theNode, chunk)); + + pos += chunk.length(); + } else if (((Element) theNode).getLocalName().equals("hi")) { + String chunk = theNode.getValue(); + mapRefNodes.put(pos, Pair.of(theNode, chunk)); + + pos += chunk.length(); + } + } + } + + return mapRefNodes; + } + private List> segmentLayoutTokenLists(List curParagraphTokens, String text, List sentencesOffsetPosition) { int pos; List> segmentedParagraphTokens = new ArrayList<>(); diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index 9a64b8141f..56b2d26afa 100644 --- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -1,6 +1,7 @@ package org.grobid.core.document; import nu.xom.Element; +import nu.xom.Node; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.lang3.tuple.Triple; import org.grobid.core.analyzers.GrobidAnalyzer; @@ -16,6 +17,7 @@ import 
java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Map; import static org.grobid.core.document.TEIFormatter.TEI_STYLE_BOLD_NAME; import static org.hamcrest.CoreMatchers.is; @@ -92,6 +94,28 @@ public void testSegmentIntoSentences_Bold_ShouldWork() throws Exception { is("

One sentence (Foppiano et al.).Second sentence (Lopez et al.).

")); } + @Test + public void testIdentifyRefNotes() throws Exception { + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + currentParagraph.appendChild("One sentence"); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Foppiano et al.)")); + currentParagraph.appendChild(". "); + currentParagraph.appendChild("Second sentence"); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); + currentParagraph.appendChild("."); + + Map> integerPairMap = new TEIFormatter(null, null).identifyNestedNodes(currentParagraph); + + assertThat(integerPairMap.keySet(), hasSize(2)); + assertThat(integerPairMap.keySet().stream().toArray()[1], is(13)); + assertThat(integerPairMap.get(13).getRight(), is("(Foppiano et al.)")); + + assertThat(integerPairMap.keySet().stream().toArray()[0], is(48)); + assertThat(integerPairMap.get(48).getRight(), is("(Lopez et al.)")); + } + @Test public void testExtractStylesList_single_shouldWork() throws Exception { String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; From af9442846041b92afafbec9b88f5c02a8f44a860 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 25 Jul 2022 17:32:11 +0900 Subject: [PATCH 14/23] Test sentence segmentation with decoration and references --- .../core/document/TEIFormatterTest.java | 57 +++++++++++++++++-- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index 56b2d26afa..eda76d2cd7 100644 --- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -19,7 +19,7 @@ import java.util.List; import java.util.Map; -import static 
org.grobid.core.document.TEIFormatter.TEI_STYLE_BOLD_NAME; +import static org.grobid.core.document.TEIFormatter.*; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.assertThat; @@ -67,7 +67,7 @@ public void testSegmentIntoSentences_Bold_ShouldSplitIntoSentencesAndAddSTag() t } @Test - public void testSegmentIntoSentences_Bold_ShouldWork() throws Exception { + public void testSegmentIntoSentences_NoStyle_ShouldWork() throws Exception { String text = "One sentence (Foppiano et al.). Second sentence (Lopez et al.). "; GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); @@ -85,8 +85,6 @@ public void testSegmentIntoSentences_Bold_ShouldWork() throws Exception { currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); currentParagraph.appendChild("."); - System.out.println(currentParagraph.toXML()); - new TEIFormatter(null, null) .segmentIntoSentences(currentParagraph, currentParagraphTokens, config, "en"); @@ -94,6 +92,57 @@ public void testSegmentIntoSentences_Bold_ShouldWork() throws Exception { is("

One sentence (Foppiano et al.).Second sentence (Lopez et al.).

")); } + + @Test + public void testSegmentIntoSentences_Style_ShouldWork() throws Exception { + String text1_0 = "One sentence "; + String text1_1 = ". "; + String text2_0 = "Second sentence "; + String text2_1 = "."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .withSentenceSegmentation(true) + .build(); + + List tokens = new ArrayList<>(); + List currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0); + tokens.addAll(currentParagraphTokens1_0); + List currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1); + tokens.addAll(currentParagraphTokens1_1); + List currentParagraphTokens2_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text2_0); + tokens.addAll(currentParagraphTokens2_0); + List currentParagraphTokens2_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text2_1); + tokens.addAll(currentParagraphTokens2_1); + + currentParagraphTokens1_0.get(0).setBold(true); + currentParagraphTokens1_0.get(2).setBold(true); + currentParagraphTokens1_0.get(2).setItalic(true); + + List> styles1_0 = extractStylesList(currentParagraphTokens1_0); + List> styles1_1 = extractStylesList(currentParagraphTokens1_1); + List> styles2_0 = extractStylesList(currentParagraphTokens2_0); + List> styles2_1 = extractStylesList(currentParagraphTokens2_1); + + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + + applyStyleList(currentParagraph, text1_0, styles1_0); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Foppiano et al.)")); + applyStyleList(currentParagraph, text1_1, styles1_1); + currentParagraph.appendChild(" "); + applyStyleList(currentParagraph, text2_0, styles2_0); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); + applyStyleList(currentParagraph, text2_1, styles2_1); + + //Assuming these are injected correctly + + new TEIFormatter(null, 
null).segmentIntoSentences(currentParagraph, tokens, config, "en"); + + assertThat(currentParagraph.toXML(), + is("

One sentence (Foppiano et al.).Second sentence (Lopez et al.).

")); + } + @Test public void testIdentifyRefNotes() throws Exception { Element currentParagraph = XmlBuilderUtils.teiElement("p"); From e8a00fea993c2be4ce65342f4071ae7f7d5adeba Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 26 Jul 2022 14:14:32 +0900 Subject: [PATCH 15/23] Fix style extraction + adding more tests --- .../grobid/core/document/TEIFormatter.java | 10 ++- .../core/document/TEIFormatterTest.java | 72 +++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 00a54e28dd..e57edb4cdf 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1667,6 +1667,13 @@ public static List> extractStylesList(Lis temporaryText.append(token.getText()); int endOffset = temporaryText.toString().length(); + if (token.getText().equals(" ")) { + if (value.length() > 0) { + value.append(token.getText()); + } + continue; + } + StringBuilder styleName = new StringBuilder(); if (token.isBold() && !ignoreStyles.contains(TEI_STYLE_BOLD_NAME)) { styleName.append(TEI_STYLE_BOLD_NAME).append(" "); @@ -1692,7 +1699,8 @@ public static List> extractStylesList(Lis } if (styleNameTrimmed.equals(previousStyleName)) { - Iterables.getLast(styleList).getRight().end = endOffset; + Triple last = Iterables.getLast(styleList); + styleList.set(styleList.size()-1, Triple.of(last.getLeft(), value.toString(), new OffsetPosition(last.getRight().start, endOffset))); } else { styleList.add(Triple.of(styleNameTrimmed, value.toString(), new OffsetPosition(startOffset, endOffset))); } diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index eda76d2cd7..41ea80e4d5 100644 --- 
a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -143,6 +143,58 @@ public void testSegmentIntoSentences_Style_ShouldWork() throws Exception { is("

One sentence (Foppiano et al.).Second sentence (Lopez et al.).

")); } + @Test + public void testSegmentIntoSentences_StyleBetweenTwoSentences_ShouldWork() throws Exception { + String text1_0 = "One sentence"; + String text1_1 = ". "; + String text2_0 = "Second sentence"; + String text2_1 = "."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .withSentenceSegmentation(true) + .build(); + + List tokens = new ArrayList<>(); + List currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0); + tokens.addAll(currentParagraphTokens1_0); + List currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1); + tokens.addAll(currentParagraphTokens1_1); + List currentParagraphTokens2_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text2_0); + tokens.addAll(currentParagraphTokens2_0); + List currentParagraphTokens2_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text2_1); + tokens.addAll(currentParagraphTokens2_1); + + currentParagraphTokens1_0.get(0).setBold(true); //One + currentParagraphTokens1_0.get(2).setItalic(true); //sentence + currentParagraphTokens1_1.get(0).setItalic(true); //. 
+ currentParagraphTokens2_0.get(0).setItalic(true); //Second + currentParagraphTokens2_0.get(2).setItalic(true); //sentence + + List> styles1_0 = extractStylesList(currentParagraphTokens1_0); + List> styles1_1 = extractStylesList(currentParagraphTokens1_1); + List> styles2_0 = extractStylesList(currentParagraphTokens2_0); + List> styles2_1 = extractStylesList(currentParagraphTokens2_1); + + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + + applyStyleList(currentParagraph, text1_0, styles1_0); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Foppiano et al.)")); + applyStyleList(currentParagraph, text1_1, styles1_1); + currentParagraph.appendChild(" "); + applyStyleList(currentParagraph, text2_0, styles2_0); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); + applyStyleList(currentParagraph, text2_1, styles2_1); + + //Assuming these are injected correctly + + new TEIFormatter(null, null).segmentIntoSentences(currentParagraph, tokens, config, "en"); + + assertThat(currentParagraph.toXML(), + is("

One sentence (Foppiano et al.).Second sentence (Lopez et al.).

")); + } + @Test public void testIdentifyRefNotes() throws Exception { Element currentParagraph = XmlBuilderUtils.teiElement("p"); @@ -244,6 +296,26 @@ public void testExtractStylesList_combined_shouldWork() throws Exception { assertThat(pairs.get(1).getRight().end, is(86)); } + @Test + public void testExtractStylesList_continuousTokens_shouldWork() throws Exception { + String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + currentParagraphTokens.get(24).setBold(true); + currentParagraphTokens.get(26).setBold(true); + currentParagraphTokens.get(28).setBold(true); + currentParagraphTokens.get(30).setBold(true); + + List> pairs = TEIFormatter.extractStylesList(currentParagraphTokens); + + assertThat(pairs, hasSize(1)); + assertThat(pairs.get(0).getLeft(), is("bold")); + assertThat(pairs.get(0).getMiddle(), is("Nd 2 Fe 14")); + assertThat(pairs.get(0).getRight().start, is(76)); + assertThat(pairs.get(0).getRight().end, is(86)); + } + @Test public void testExtractStylesList_ignoreBold_shouldWork() throws Exception { String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; From ccce04939a2708215a17f6124b12820e9dad9976 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 27 Jul 2022 15:15:05 +0900 Subject: [PATCH 16/23] Split decoration between sentences if neeed --- .../grobid/core/document/TEIFormatter.java | 132 +++++++++++-- .../core/document/TEIFormatterTest.java | 186 ++++++++++++++++++ 2 files changed, 306 insertions(+), 12 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index e57edb4cdf..4e9c05513f 100755 --- 
a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1462,14 +1462,17 @@ public void segmentIntoSentences(Element curParagraph, List curPara List forbiddenPositions = mapRefNodes.entrySet() .stream() + .filter(entry -> ((Element) entry.getValue().getLeft()).getLocalName().equals("ref")) .map(entry -> new OffsetPosition(entry.getKey(), entry.getValue().getRight().length() + entry.getKey())) .collect(Collectors.toList()); - List refPositions = mapRefNodes.keySet().stream().sorted().collect(Collectors.toList()); - List sentencesOffsetPosition = SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang)); - + + mapRefNodes = splitMapNodesOverSentenceSplits(mapRefNodes, text, sentencesOffsetPosition); + + List refPositions = mapRefNodes.keySet().stream().sorted().collect(Collectors.toList()); + /*if (theSentences.size() == 0) { // this should normally not happen, but it happens (depending on sentence splitter, usually the text // is just a punctuation) @@ -1507,21 +1510,17 @@ public void segmentIntoSentences(Element curParagraph, List curPara posInSentence = 0; Element sentenceElement = teiElement("s"); - List currentSentenceTokens = segmentedParagraphTokens.get(i); - if (config.isGenerateTeiIds()) { String sID = KeyGen.getKey().substring(0, 7); addXmlId(sentenceElement, "_" + sID); } if (config.isGenerateTeiCoordinates("s")) { if (segmentedParagraphTokens.size()>=i+1) { - currentSentenceTokens = segmentedParagraphTokens.get(i); + List currentSentenceTokens = segmentedParagraphTokens.get(i); sentenceElement.addAttribute(new Attribute("coords", LayoutTokensUtil.getCoordsString(currentSentenceTokens))); } } - List> styleList = extractStylesList(currentSentenceTokens); - int sentenceLength = sentencesOffsetPosition.get(i).end - pos; // check if we have a ref between pos and pos+sentenceLength for(int j=refIndex; j 
curPara } } } + } + + /** + * Adjust the nodes that could be over a sentence split. + * We know that refs cannot be split over sentences, so we can ignore them happily + **/ + protected Map> splitMapNodesOverSentenceSplits(Map> mapRefNodes, String text, List sentencesOffsetPosition) { + Map> adjustedMap = new TreeMap<>(); + + StringBuilder textAccumulator = new StringBuilder(); + List refPositions = mapRefNodes.keySet().stream().sorted().collect(Collectors.toList()); + + int currentNodeIdx = 0; + for(int i=0; i= sentenceOffsetStart+posInSentence && refPos < sentenceOffsetEnd) { + + //adding what's before the refPos to the accumulator + if (refPos > sentenceOffsetStart + posInSentence) { + textAccumulator.append(text, sentenceOffsetStart + posInSentence, refPos); + sentenceAccumulator.append(text, sentenceOffsetStart + posInSentence, refPos); + } + + //the node finishes before sentence ends - all good here :-) + if (sentenceOffsetStart + posInSentence + currentNodeLength < sentenceOffsetEnd) { + adjustedMap.put(refPos, mapRefNodes.get(refPos)); + textAccumulator.append(mapRefNodes.get(refPos).getRight()); + sentenceAccumulator.append(mapRefNodes.get(refPos).getRight()); + posInSentence = refPos + currentNodeLength - sentenceOffsetStart; + continue; + } else { + //The node exceed the sentence, we are in trouble! Cut it! 
+ int splitElementSize = sentenceOffsetEnd - refPos; + + String substringPrefix = currentNode.getValue().substring(0, splitElementSize); + Element newElementPrefix = generateNewElement((Element) currentNode, substringPrefix); + adjustedMap.put(refPos, Pair.of(newElementPrefix, substringPrefix)); + textAccumulator.append(substringPrefix); + posInSentence = refPos + newElementPrefix.getValue().length() - sentenceOffsetStart; + currentNodeIdx = j; + break; + } + } else if (refPos > sentenceOffsetEnd) { + // add to accumulator the rest of the sentence and moving on to the next sentence + textAccumulator.append(text, sentenceOffsetStart + posInSentence, sentenceOffsetEnd); + sentenceAccumulator.append(text, sentenceOffsetStart + posInSentence, sentenceOffsetEnd); + break; + } else if (refPos < sentenceOffsetStart && textAccumulator.length() > refPos + && textAccumulator.length() < refPos + currentNodeLength) { + //The node is between this sentence and the previous one - trouble again dude + + String exceeded = textAccumulator.substring(0, refPos) + mapRefNodes.get(refPos).getLeft().getValue(); + + if (exceeded.length() > sentenceOffsetEnd) { + String previousNodeSuffix = exceeded.substring(sentenceOffsetStart, sentenceOffsetEnd); + Element newElementSuffix = generateNewElement((Element) currentNode, previousNodeSuffix); + adjustedMap.put(sentenceOffsetStart, Pair.of(newElementSuffix, previousNodeSuffix)); + if (textAccumulator.length() < sentenceOffsetStart) { + textAccumulator.append(exceeded, textAccumulator.length(), sentenceOffsetStart); + } + textAccumulator.append(previousNodeSuffix); + + posInSentence = sentenceOffsetStart + previousNodeSuffix.length(); + currentNodeIdx = j; + break; + } else { + String previousNodeSuffix = exceeded.substring(sentenceOffsetStart); + Element newElementSuffix = generateNewElement((Element) currentNode, previousNodeSuffix); + adjustedMap.put(sentenceOffsetStart, Pair.of(newElementSuffix, previousNodeSuffix)); + if 
(textAccumulator.length() < sentenceOffsetStart) { + textAccumulator.append(exceeded, textAccumulator.length(), sentenceOffsetStart); + } + textAccumulator.append(previousNodeSuffix); + posInSentence = sentenceOffsetStart + previousNodeSuffix.length(); + } + } + } + + if (sentenceOffsetStart + posInSentence <= sentenceOffsetEnd) { + textAccumulator.append(text, sentenceOffsetStart + posInSentence, sentencesOffsetPosition.get(i).end); + } + } + return adjustedMap; + } + + private Element generateNewElement(Element currentNode, String value) { + Element newElement = teiElement(currentNode.getLocalName(), value); + for (int i=0; i < currentNode.getAttributeCount(); i++) { + Attribute a = new Attribute(currentNode.getAttribute(i)); + newElement.addAttribute(a); + } + return newElement; } protected Map> identifyNestedNodes(Element curParagraph) { // identify ref nodes, ref spans and ref positions - Map> mapRefNodes = new HashMap<>(); + Map> mapNodes = new HashMap<>(); int pos = 0; for(int i = 0; i< curParagraph.getChildCount(); i++) { @@ -1583,19 +1691,19 @@ protected Map> identifyNestedNodes(Element curParagr if (((Element) theNode).getLocalName().equals("ref")) { String chunk = theNode.getValue(); // map character offset of the node and the chunk text - mapRefNodes.put(pos, Pair.of(theNode, chunk)); + mapNodes.put(pos, Pair.of(theNode, chunk)); pos += chunk.length(); } else if (((Element) theNode).getLocalName().equals("hi")) { String chunk = theNode.getValue(); - mapRefNodes.put(pos, Pair.of(theNode, chunk)); + mapNodes.put(pos, Pair.of(theNode, chunk)); pos += chunk.length(); } } } - return mapRefNodes; + return mapNodes; } private List> segmentLayoutTokenLists(List curParagraphTokens, String text, List sentencesOffsetPosition) { diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index 41ea80e4d5..dd3575fa67 100644 --- 
a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -7,10 +7,12 @@ import org.grobid.core.analyzers.GrobidAnalyzer; import org.grobid.core.document.xml.XmlBuilderUtils; import org.grobid.core.engines.config.GrobidAnalysisConfig; +import org.grobid.core.lang.Language; import org.grobid.core.layout.LayoutToken; import org.grobid.core.utilities.GrobidProperties; import org.grobid.core.utilities.LayoutTokensUtil; import org.grobid.core.utilities.OffsetPosition; +import org.grobid.core.utilities.SentenceUtilities; import org.junit.BeforeClass; import org.junit.Test; @@ -18,8 +20,10 @@ import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import static org.grobid.core.document.TEIFormatter.*; +import static org.hamcrest.CoreMatchers.any; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.assertThat; @@ -195,6 +199,188 @@ public void testSegmentIntoSentences_StyleBetweenTwoSentences_ShouldWork() throw is("

One sentence (Foppiano et al.).Second sentence (Lopez et al.).

")); } + @Test + public void testSegmentIntoSentences_StyleBetweenTwoSentences_oneRef_ShouldWork() throws Exception { + String text1_0 = "One sentence. Second sentence"; + String text1_1 = "."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .withSentenceSegmentation(true) + .build(); + + List tokens = new ArrayList<>(); + List currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0); + tokens.addAll(currentParagraphTokens1_0); + List currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1); + tokens.addAll(currentParagraphTokens1_1); + + currentParagraphTokens1_0.get(0).setBold(true); //One + currentParagraphTokens1_0.get(2).setItalic(true); //sentence + currentParagraphTokens1_0.get(3).setItalic(true); //. + currentParagraphTokens1_0.get(5).setItalic(true); //Second + + List> styles1_0 = extractStylesList(currentParagraphTokens1_0); + List> styles1_1 = extractStylesList(currentParagraphTokens1_1); + + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + + applyStyleList(currentParagraph, text1_0, styles1_0); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); + applyStyleList(currentParagraph, text1_1, styles1_1); + + new TEIFormatter(null, null).segmentIntoSentences(currentParagraph, tokens, config, "en"); + + assertThat(currentParagraph.toXML(), + is("

One sentence.Second sentence (Lopez et al.).

")); + } + + @Test + public void testSegmentIntoSentences_StyleBetweenTwoSentencesWithoutRefs_ShouldWork() throws Exception { + String text = "One sentence. Second sentence."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .withSentenceSegmentation(true) + .build(); + + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + tokens.get(0).setBold(true); //One + tokens.get(2).setItalic(true); //sentence + tokens.get(3).setItalic(true); //. + tokens.get(5).setItalic(true); //Second +// currentParagraphTokens.get(7).setItalic(true); //sentence + + List> styles = extractStylesList(tokens); + + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + + applyStyleList(currentParagraph, text, styles); + + //Assuming these are injected correctly + new TEIFormatter(null, null).segmentIntoSentences(currentParagraph, tokens, config, "en"); + + assertThat(currentParagraph.toXML(), + is("

One sentence.Second sentence.

")); + } + + @Test + public void testSplitMapNodesOverSentenceSplits_shouldAdjustNodes() { + TEIFormatter teiFormatter = new TEIFormatter(null, null); + + String text1_0 = "One sentence. Second sentence"; + String text1_1 = "."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .withSentenceSegmentation(true) + .build(); + + List tokens = new ArrayList<>(); + List currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0); + tokens.addAll(currentParagraphTokens1_0); + List currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1); + tokens.addAll(currentParagraphTokens1_1); + + currentParagraphTokens1_0.get(0).setBold(true); //One + currentParagraphTokens1_0.get(2).setItalic(true); //sentence + currentParagraphTokens1_0.get(3).setItalic(true); //. + currentParagraphTokens1_0.get(5).setItalic(true); //Second + + List> styles1_0 = extractStylesList(currentParagraphTokens1_0); + List> styles1_1 = extractStylesList(currentParagraphTokens1_1); + + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + + applyStyleList(currentParagraph, text1_0, styles1_0); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); + applyStyleList(currentParagraph, text1_1, styles1_1); + + String text = currentParagraph.getValue(); + + Map> nestedNodes = teiFormatter.identifyNestedNodes(currentParagraph); + List forbiddenPositions = nestedNodes.entrySet() + .stream() + .filter(entry -> ((Element) entry.getValue().getLeft()).getLocalName().equals("ref")) + .map(entry -> new OffsetPosition(entry.getKey(), entry.getValue().getRight().length() + entry.getKey())) + .collect(Collectors.toList()); + + List sentencesOffsetPosition = + SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, tokens, new Language("en")); + + Map> adjustedNestedNodes = teiFormatter.splitMapNodesOverSentenceSplits(nestedNodes, text, 
sentencesOffsetPosition); + + assertThat(adjustedNestedNodes.size(), is(4)); + + assertThat(new ArrayList<>(adjustedNestedNodes.keySet()), is(Arrays.asList(0, 4, 14, 30))); + + assertThat(adjustedNestedNodes.get(0).getRight(), is("One")); + assertThat(adjustedNestedNodes.get(4).getRight(), is("sentence.")); + assertThat(adjustedNestedNodes.get(14).getRight(), is("Second")); + assertThat(adjustedNestedNodes.get(30).getRight(), is("(Lopez et al.)")); + } + + @Test + public void testSplitMapNodesOverThreeSentenceSplits_shouldAdjustNodes() { + TEIFormatter teiFormatter = new TEIFormatter(null, null); + + String text1_0 = "One sentence. Second sentence. Third sentence"; + String text1_1 = "."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .withSentenceSegmentation(true) + .build(); + + List tokens = new ArrayList<>(); + List currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0); + tokens.addAll(currentParagraphTokens1_0); + List currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1); + tokens.addAll(currentParagraphTokens1_1); + + currentParagraphTokens1_0.get(0).setBold(true); //One + currentParagraphTokens1_0.get(2).setItalic(true); //sentence + currentParagraphTokens1_0.get(3).setItalic(true); //. + currentParagraphTokens1_0.get(5).setItalic(true); //Second + currentParagraphTokens1_0.get(7).setItalic(true); //sentence + currentParagraphTokens1_0.get(8).setItalic(true); //. 
+ currentParagraphTokens1_0.get(10).setItalic(true); //Third +// currentParagraphTokens1_0.get(12).setItalic(true); //sentence + + List> styles1_0 = extractStylesList(currentParagraphTokens1_0); + List> styles1_1 = extractStylesList(currentParagraphTokens1_1); + + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + + applyStyleList(currentParagraph, text1_0, styles1_0); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); + applyStyleList(currentParagraph, text1_1, styles1_1); + + String text = currentParagraph.getValue(); + + Map> nestedNodes = teiFormatter.identifyNestedNodes(currentParagraph); + List forbiddenPositions = nestedNodes.entrySet() + .stream() + .filter(entry -> ((Element) entry.getValue().getLeft()).getLocalName().equals("ref")) + .map(entry -> new OffsetPosition(entry.getKey(), entry.getValue().getRight().length() + entry.getKey())) + .collect(Collectors.toList()); + + List sentencesOffsetPosition = + SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, tokens, new Language("en")); + + Map> adjustedNestedNodes = teiFormatter.splitMapNodesOverSentenceSplits(nestedNodes, text, sentencesOffsetPosition); + + assertThat(adjustedNestedNodes.size(), is(5)); + + assertThat(new ArrayList<>(adjustedNestedNodes.keySet()), is(Arrays.asList(0, 4, 14, 31, 46))); + + assertThat(adjustedNestedNodes.get(0).getRight(), is("One")); + assertThat(adjustedNestedNodes.get(4).getRight(), is("sentence.")); + assertThat(adjustedNestedNodes.get(14).getRight(), is("Second sentence.")); + assertThat(adjustedNestedNodes.get(31).getRight(), is("Third")); + assertThat(adjustedNestedNodes.get(46).getRight(), is("(Lopez et al.)")); + } + @Test public void testIdentifyRefNotes() throws Exception { Element currentParagraph = XmlBuilderUtils.teiElement("p"); From aaf211d69d9528a4508b4b2f61ce85a61a85f36f Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 27 Jul 2022 16:10:56 
+0900 Subject: [PATCH 17/23] Fix bugs with the text accumulator --- .../grobid/core/document/TEIFormatter.java | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 4e9c05513f..442f77c81f 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1458,9 +1458,9 @@ public void segmentIntoSentences(Element curParagraph, List curPara if (StringUtils.isEmpty(text)) return; - Map> mapRefNodes = identifyNestedNodes(curParagraph); + Map> rawMapRefNodes = identifyNestedNodes(curParagraph); - List forbiddenPositions = mapRefNodes.entrySet() + List forbiddenPositions = rawMapRefNodes.entrySet() .stream() .filter(entry -> ((Element) entry.getValue().getLeft()).getLocalName().equals("ref")) .map(entry -> new OffsetPosition(entry.getKey(), entry.getValue().getRight().length() + entry.getKey())) @@ -1469,7 +1469,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara List sentencesOffsetPosition = SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang)); - mapRefNodes = splitMapNodesOverSentenceSplits(mapRefNodes, text, sentencesOffsetPosition); + Map> mapRefNodes = splitMapNodesOverSentenceSplits(rawMapRefNodes, text, sentencesOffsetPosition); List refPositions = mapRefNodes.keySet().stream().sorted().collect(Collectors.toList()); @@ -1582,18 +1582,25 @@ protected Map> splitMapNodesOverSentenceSplits(Map sentenceOffsetEnd) { + currentNodeIdx = j; + break; + } adjustedMap.put(refPos, mapRefNodes.get(refPos)); + if (textAccumulator.length() < refPos) { + textAccumulator.append(text, textAccumulator.length(), refPos); + } textAccumulator.append(mapRefNodes.get(refPos).getRight()); - 
sentenceAccumulator.append(mapRefNodes.get(refPos).getRight()); + posInSentence = refPos + currentNodeLength - sentenceOffsetStart; continue; } - int currentNodeLength = currentNode.getValue().length(); //The ref position is falling between sentence start and end if (refPos >= sentenceOffsetStart+posInSentence && refPos < sentenceOffsetEnd) { @@ -1601,16 +1608,14 @@ protected Map> splitMapNodesOverSentenceSplits(Map sentenceOffsetStart + posInSentence) { textAccumulator.append(text, sentenceOffsetStart + posInSentence, refPos); - sentenceAccumulator.append(text, sentenceOffsetStart + posInSentence, refPos); + posInSentence = refPos - sentenceOffsetStart; } //the node finishes before sentence ends - all good here :-) if (sentenceOffsetStart + posInSentence + currentNodeLength < sentenceOffsetEnd) { adjustedMap.put(refPos, mapRefNodes.get(refPos)); textAccumulator.append(mapRefNodes.get(refPos).getRight()); - sentenceAccumulator.append(mapRefNodes.get(refPos).getRight()); posInSentence = refPos + currentNodeLength - sentenceOffsetStart; - continue; } else { //The node exceed the sentence, we are in trouble! Cut it! 
int splitElementSize = sentenceOffsetEnd - refPos; @@ -1621,12 +1626,14 @@ protected Map> splitMapNodesOverSentenceSplits(Map sentenceOffsetEnd) { // add to accumulator the rest of the sentence and moving on to the next sentence - textAccumulator.append(text, sentenceOffsetStart + posInSentence, sentenceOffsetEnd); - sentenceAccumulator.append(text, sentenceOffsetStart + posInSentence, sentenceOffsetEnd); + String textChunk = text.substring(sentenceOffsetStart + posInSentence, sentenceOffsetEnd); + textAccumulator.append(textChunk); + posInSentence += textChunk.length(); + currentNodeIdx = j; break; } else if (refPos < sentenceOffsetStart && textAccumulator.length() > refPos && textAccumulator.length() < refPos + currentNodeLength) { @@ -1659,7 +1666,7 @@ protected Map> splitMapNodesOverSentenceSplits(Map> extractStylesList(Lis temporaryText.append(token.getText()); int endOffset = temporaryText.toString().length(); - if (token.getText().equals(" ")) { + if (token.getText().equals(" ") || token.getText().equals("\n")) { if (value.length() > 0) { value.append(token.getText()); } From b82cc4322f37eecd273cc14523ad7d48e1eaeb1f Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 28 Jul 2022 11:26:07 +0900 Subject: [PATCH 18/23] Fix incorrect split and position in sentence markers --- .../main/java/org/grobid/core/document/TEIFormatter.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 442f77c81f..2498ddb6e7 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1626,7 +1626,7 @@ protected Map> splitMapNodesOverSentenceSplits(Map sentenceOffsetEnd) { // add to accumulator the rest of the sentence and moving on to the next sentence @@ -1650,10 +1650,11 @@ protected Map> 
splitMapNodesOverSentenceSplits(Map> splitMapNodesOverSentenceSplits(Map Date: Mon, 12 Sep 2022 12:44:07 +0900 Subject: [PATCH 19/23] remove suffix space when there is no more text --- .../main/java/org/grobid/core/data/Table.java | 2 -- .../grobid/core/document/TEIFormatter.java | 10 +++++-- .../core/document/TEIFormatterTest.java | 30 +++++++++++++++++-- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 9a9aa6cf3e..a2adc428a2 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -33,11 +33,9 @@ import nu.xom.Attribute; import nu.xom.Element; import nu.xom.Node; -import nu.xom.Text; import static org.grobid.core.document.TEIFormatter.applyStyleList; import static org.grobid.core.document.TEIFormatter.extractStylesList; -import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId; import static org.grobid.core.document.xml.XmlBuilderUtils.textNode; diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 2498ddb6e7..671b887834 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1424,7 +1424,7 @@ public static Element applyStyleList(Element paragraphElem, String text, List 1) { + if (subString.length() > prefixSpace.length()) { suffixSpace = StringUtils.endsWith(subString, " ") ? " " : ""; } paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " ")) + suffixSpace); @@ -1435,7 +1435,11 @@ public static Element applyStyleList(Element paragraphElem, String text, List 0) { + prefixSpace = StringUtils.startsWith(subString, " ") ? 
" " : ""; + } paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " "))); return paragraphElem; @@ -1819,10 +1823,12 @@ public static List> extractStylesList(Lis styleList.set(styleList.size()-1, Triple.of(last.getLeft(), value.toString(), new OffsetPosition(last.getRight().start, endOffset))); } else { styleList.add(Triple.of(styleNameTrimmed, value.toString(), new OffsetPosition(startOffset, endOffset))); +// value = new StringBuilder(); } previousStyleName = styleNameTrimmed; } +// List> postProcessedStyleList = styleList.stream().map(s -> Triple.of(s.getLeft(), s.getMiddle().substring(s.getRight().start, s.getRight().end), s.getRight())).collect(Collectors.toList()); return styleList; } diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index dd3575fa67..9d2dc3d2cd 100644 --- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -14,6 +14,7 @@ import org.grobid.core.utilities.OffsetPosition; import org.grobid.core.utilities.SentenceUtilities; import org.junit.BeforeClass; +import org.junit.Ignore; import org.junit.Test; import java.util.ArrayList; @@ -23,7 +24,6 @@ import java.util.stream.Collectors; import static org.grobid.core.document.TEIFormatter.*; -import static org.hamcrest.CoreMatchers.any; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.assertThat; @@ -144,7 +144,7 @@ public void testSegmentIntoSentences_Style_ShouldWork() throws Exception { new TEIFormatter(null, null).segmentIntoSentences(currentParagraph, tokens, config, "en"); assertThat(currentParagraph.toXML(), - is("

One sentence (Foppiano et al.).Second sentence (Lopez et al.).

")); + is("

One sentence (Foppiano et al.).Second sentence (Lopez et al.).

")); } @Test @@ -527,6 +527,32 @@ public void testExtractStylesList_ignoreBold_shouldWork() throws Exception { assertThat(pairs.get(1).getRight().end, is(86)); } + @Ignore("The middle is actually not used") + public void testExtractStylesList_checkProducedText_ShouldWork() throws Exception { + String text = "I. Introduction 1.1. Généralités et rappels "; + List textTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + textTokens.get(0).setBold(true); + textTokens.get(1).setBold(true); + textTokens.get(3).setBold(true); + + textTokens.get(6).setItalic(true); + textTokens.get(7).setItalic(true); + textTokens.get(8).setItalic(true); + textTokens.get(9).setItalic(true); + textTokens.get(11).setItalic(true); + textTokens.get(13).setItalic(true); + textTokens.get(15).setItalic(true); + + List> pairs = TEIFormatter.extractStylesList(textTokens); + + assertThat(pairs, hasSize(2)); + assertThat(pairs.get(0).getLeft(), is("bold")); + assertThat(pairs.get(0).getMiddle(), is("I. Introduction")); + assertThat(pairs.get(1).getLeft(), is("italic")); + assertThat(pairs.get(1).getMiddle(), is("1.1. 
Généralités et rappels")); + } + @Test public void testGetSectionNumber_simple_ShouldWork() throws Exception { String text = "3 Supercon 2"; From 80b98c498da27139f6b5067d3638dc5ac9185021 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 13 Sep 2022 12:11:09 +0900 Subject: [PATCH 20/23] fix OOBE when applying sentence splitting --- .../java/org/grobid/core/document/TEIFormatter.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 671b887834..e3925f7eab 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1581,12 +1581,17 @@ protected Map> splitMapNodesOverSentenceSplits(Map refPositions = mapRefNodes.keySet().stream().sorted().collect(Collectors.toList()); int currentNodeIdx = 0; + int previousSentenceOffsetStart = 0; + int previousPosInSentence = 0; for(int i=0; i> splitMapNodesOverSentenceSplits(Map refPos + } else if (refPos < sentenceOffsetStart + && textAccumulator.length() > refPos && textAccumulator.length() < refPos + currentNodeLength) { //The node is between this sentence and the previous one - trouble again dude @@ -1670,6 +1676,8 @@ protected Map> splitMapNodesOverSentenceSplits(Map Date: Wed, 17 May 2023 14:47:04 +0900 Subject: [PATCH 21/23] avoid adding styles in head sections --- .../java/org/grobid/core/document/TEIFormatter.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 7872eefcb4..8ef3740304 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1321,13 +1321,13 @@ public 
StringBuilder toTEITextPiece(StringBuilder buffer, dehyphenized = numb.getLeft(); text = LayoutTokensUtil.toText(dehyphenized); } - List> stylesList = extractStylesList(dehyphenized); +// List> stylesList = extractStylesList(dehyphenized); - if (CollectionUtils.isNotEmpty(stylesList)) { - applyStyleList(head, text, stylesList); - } else { +// if (CollectionUtils.isNotEmpty(stylesList)) { +// applyStyleList(head, text, stylesList); +// } else { head.appendChild(StringUtils.normalizeSpace(text.replace("\n", ""))); - } +// } if (config.isGenerateTeiIds()) { String divID = KeyGen.getKey().substring(0, 7); From 9adb8d864198d0724a7ebda57d23f4e2c999ae6e Mon Sep 17 00:00:00 2001 From: lfoppiano Date: Wed, 17 May 2023 15:56:00 +0900 Subject: [PATCH 22/23] fix inconsistency when having notes in the same page --- .../grobid/core/document/TEIFormatter.java | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 8ef3740304..c22c4dff8a 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1398,7 +1398,8 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, curDiv.appendChild(note); } else if (clusterLabel.equals(TaggingLabels.PARAGRAPH)) { List clusterTokens = cluster.concatTokens(); - int clusterPage = Iterables.getLast(clusterTokens).getPage(); + List dehyphenized = LayoutTokensUtil.dehyphenize(clusterTokens); + int clusterPage = Iterables.getLast(dehyphenized).getPage(); List notesSamePage = null; if (notes != null && notes.size() > 0) { @@ -1408,7 +1409,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, } if (notesSamePage == null) { - List dehyphenized = LayoutTokensUtil.dehyphenize(clusterTokens); + String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " "); if 
(isNewParagraph(lastClusterLabel, curParagraph)) { @@ -1460,13 +1461,13 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, List> matchedLabelPosition = new ArrayList<>(); for (Note note : notesSamePage) { - Optional matching = clusterTokens + Optional matching = dehyphenized .stream() .filter(t -> t.getText().equals(note.getLabel()) && t.isSuperscript()) .findFirst(); if (matching.isPresent()) { - int idx = clusterTokens.indexOf(matching.get()); + int idx = dehyphenized.indexOf(matching.get()); note.setIgnored(true); OffsetPosition matchingPosition = new OffsetPosition(); matchingPosition.start = idx; @@ -1490,8 +1491,8 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, Note note = labels2Notes.get(matching.getLeft()); OffsetPosition matchingPosition = matching.getRight(); - List before = clusterTokens.subList(pos, matchingPosition.start); - String clusterContentBefore = LayoutTokensUtil.normalizeDehyphenizeText(before); + List before = dehyphenized.subList(pos, matchingPosition.start); + String clusterContentBefore = LayoutTokensUtil.toText(before); if (CollectionUtils.isNotEmpty(before) && before.get(0).getText().equals(" ")) { curParagraph.appendChild(new Text(" ")); @@ -1506,7 +1507,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, } curParagraphTokens.addAll(cluster.concatTokens()); - List calloutTokens = clusterTokens.subList(matchingPosition.start, matchingPosition.end); + List calloutTokens = dehyphenized.subList(matchingPosition.start, matchingPosition.end); Element ref = teiElement("ref"); ref.addAttribute(new Attribute("type", "foot")); @@ -1526,8 +1527,8 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, } // add last chunk of paragraph stuff (or whole paragraph if no note callout matching) - List remaining = clusterTokens.subList(pos, clusterTokens.size()); - String remainingClusterContent = LayoutTokensUtil.normalizeDehyphenizeText(remaining); + List remaining = dehyphenized.subList(pos, dehyphenized.size()); 
+ String remainingClusterContent = LayoutTokensUtil.toText(remaining); if (CollectionUtils.isNotEmpty(remaining) && remaining.get(0).getText().equals(" ")) { curParagraph.appendChild(new Text(" ")); From 188cda5841047f9f1307a3cad51bf97418ec9227 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 15 Apr 2024 15:00:40 +0900 Subject: [PATCH 23/23] Merge master into features/add-styles-xml --- .../src/main/java/org/grobid/core/document/TEIFormatter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 393c6f9531..a8df9f310b 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1206,7 +1206,7 @@ protected List makeNotes(List noteTokens, String footText, No // add possible subsequent notes concatenated in the same note sequence (this is a common error, // which is addressed here by heuristics, it may not be necessary in the future with a better - // segmentation model using more foot notes training data) + // segmentation model using more footnotes training data) if (currentNumber != -1) { String nextLabel = " " + (currentNumber+1); // sugar characters after note number must be consistent with the previous ones to avoid false match