diff --git a/grobid-core/src/main/java/org/grobid/core/data/Equation.java b/grobid-core/src/main/java/org/grobid/core/data/Equation.java index 141660d848..753e3d5908 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Equation.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Equation.java @@ -2,7 +2,9 @@ import nu.xom.Attribute; import nu.xom.Element; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Triple; import org.grobid.core.document.xml.XmlBuilderUtils; import org.grobid.core.engines.Engine; import org.grobid.core.engines.config.GrobidAnalysisConfig; @@ -10,13 +12,17 @@ import org.grobid.core.layout.LayoutToken; import org.grobid.core.utilities.BoundingBoxCalculator; import org.grobid.core.utilities.LayoutTokensUtil; +import org.grobid.core.utilities.OffsetPosition; import org.grobid.core.utilities.counters.CntManager; import org.grobid.core.utilities.TextUtilities; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.SortedSet; +import static org.grobid.core.document.TEIFormatter.*; + /** * Class for representing an equation. * @@ -56,9 +62,15 @@ public Element toTEIElement(GrobidAnalysisConfig config) { XmlBuilderUtils.addCoords(formulaElement, LayoutTokensUtil.getCoordsStringForOneBox(getLayoutTokens())); } - formulaElement.appendChild(LayoutTokensUtil.normalizeText(content.toString()).trim()); + List> stylesList = extractStylesList(getContentTokens(), Arrays.asList(TEI_STYLE_BOLD_NAME, TEI_STYLE_ITALIC_NAME)) ; + + if (CollectionUtils.isNotEmpty(stylesList)) { + applyStyleList(formulaElement, getContent(), stylesList); + } else { + formulaElement.appendChild(LayoutTokensUtil.normalizeText(content.toString()).trim()); + } - if ( (label != null) && (label.length()>0) ) { + if ( StringUtils.isNotEmpty(label) ) { Element labelEl = XmlBuilderUtils.teiElement("label", LayoutTokensUtil.normalizeText(label.toString())); formulaElement.appendChild(labelEl); @@ -79,6 +91,16 @@ public List getContentTokens() { return contentTokens; } + public void addContentTokens(List tokens) { + if (tokens == null) + return; + + if (contentTokens == null) + contentTokens = new ArrayList<>(); + + contentTokens.addAll(tokens); + } + public List getLabelTokens() { return labelTokens; } @@ -181,9 +203,9 @@ public void addLayoutTokens(List tokens) { if (tokens == null) return; if (layoutTokens == null) - layoutTokens = new ArrayList(); - for(LayoutToken token : tokens) - layoutTokens.add(token); + layoutTokens = new ArrayList<>(); + + layoutTokens.addAll(tokens); } public List getCoordinates() { diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java index e9417e9217..b4784a8e70 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Figure.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Figure.java @@ -5,6 +5,7 @@ import com.google.common.collect.Lists; import com.google.common.base.Joiner; +import org.apache.commons.lang3.tuple.Triple; import org.grobid.core.GrobidModels; import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; @@ -17,12 +18,9 @@ import org.grobid.core.layout.GraphicObjectType; import org.grobid.core.layout.LayoutToken; import org.grobid.core.layout.VectorGraphicBoxCalculator; -import org.grobid.core.utilities.BoundingBoxCalculator; -import org.grobid.core.utilities.LayoutTokensUtil; -import org.grobid.core.utilities.TextUtilities; +import org.grobid.core.utilities.*; import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; -import org.grobid.core.utilities.KeyGen; import org.grobid.core.engines.label.TaggingLabels; import org.grobid.core.engines.label.TaggingLabel; import org.grobid.core.engines.citations.CalloutAnalyzer.MarkerType; @@ -41,6 +39,8 @@ import java.util.SortedSet; import java.util.Collections; +import static org.grobid.core.document.TEIFormatter.applyStyleList; +import static org.grobid.core.document.TEIFormatter.extractStylesList; import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId; import static org.grobid.core.document.xml.XmlBuilderUtils.textNode; @@ -388,7 +388,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form // if the segment has been parsed with the full text model we further extract the clusters // to get the bibliographical references - if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) { + if (StringUtils.isNotEmpty(labeledCaption)) { TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens); List clusters = clusteror.cluster(); @@ -404,7 +404,9 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form TaggingLabel clusterLabel = cluster.getTaggingLabel(); //String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens()); - String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens()); + List dehyphenized = LayoutTokensUtil.dehyphenize(cluster.concatTokens()); + String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " "); + if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) { try { List refNodes = formatter.markReferencesTEILuceneBased( @@ -422,7 +424,13 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form LOGGER.warn("Problem when serializing TEI fragment for figure caption", e); } } else { - desc.appendChild(textNode(clusterContent)); + List> stylesList = extractStylesList(dehyphenized); + + if (CollectionUtils.isNotEmpty(stylesList)) { + applyStyleList(desc, text, stylesList); + } else { + desc.appendChild(StringUtils.normalizeSpace(text)); + } } } } else { diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 6356978837..abccb0bcd4 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -1,5 +1,7 @@ package org.grobid.core.data; +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.tuple.Triple; import org.grobid.core.GrobidModels; import org.apache.commons.lang3.StringUtils; import org.grobid.core.data.table.Cell; @@ -15,6 +17,7 @@ import org.grobid.core.layout.LayoutToken; import org.grobid.core.utilities.BoundingBoxCalculator; import org.grobid.core.utilities.LayoutTokensUtil; +import org.grobid.core.utilities.OffsetPosition; import org.grobid.core.utilities.counters.CntManager; import org.grobid.core.engines.counters.TableRejectionCounters; import org.grobid.core.tokenization.TaggingTokenCluster; @@ -30,9 +33,9 @@ import nu.xom.Attribute; import nu.xom.Element; import nu.xom.Node; -import nu.xom.Text; -import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; +import static org.grobid.core.document.TEIFormatter.applyStyleList; +import static org.grobid.core.document.TEIFormatter.extractStylesList; import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId; import static org.grobid.core.document.xml.XmlBuilderUtils.textNode; @@ -119,7 +122,9 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form TaggingLabel clusterLabel = cluster.getTaggingLabel(); //String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens()); - String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens()); + List dehyphenized = LayoutTokensUtil.dehyphenize(cluster.concatTokens()); + String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " "); + if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) { try { List refNodes = formatter.markReferencesTEILuceneBased( @@ -137,7 +142,13 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form LOGGER.warn("Problem when serializing TEI fragment for table caption", e); } } else { - desc.appendChild(textNode(clusterContent)); + List> stylesList = extractStylesList(dehyphenized); + + if (CollectionUtils.isNotEmpty(stylesList)) { + applyStyleList(desc, text, stylesList); + } else { + desc.appendChild(StringUtils.normalizeSpace(text)); + } } if (desc != null && config.isWithSentenceSegmentation()) { diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index f66baaa0c0..a8df9f310b 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -3,16 +3,14 @@ import com.google.common.base.Joiner; import com.google.common.collect.Iterables; import com.google.common.collect.Sets; - -import org.apache.commons.collections4.CollectionUtils; -import org.apache.commons.lang3.tuple.Pair; -import org.apache.commons.lang3.StringUtils; - import nu.xom.Attribute; import nu.xom.Element; import nu.xom.Node; import nu.xom.Text; - +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.commons.lang3.tuple.Triple; import org.grobid.core.GrobidModels; import org.grobid.core.data.CopyrightsLicense.License; import org.grobid.core.data.CopyrightsLicense.CopyrightsOwner; @@ -21,21 +19,19 @@ import org.grobid.core.document.xml.XmlBuilderUtils; import org.grobid.core.engines.Engine; import org.grobid.core.engines.FullTextParser; -import org.grobid.core.engines.label.SegmentationLabels; +import org.grobid.core.engines.citations.CalloutAnalyzer.MarkerType; import org.grobid.core.engines.config.GrobidAnalysisConfig; +import org.grobid.core.engines.label.SegmentationLabels; import org.grobid.core.engines.label.TaggingLabel; import org.grobid.core.engines.label.TaggingLabels; import org.grobid.core.exceptions.GrobidException; import org.grobid.core.lang.Language; import org.grobid.core.layout.*; -import org.grobid.core.utilities.SentenceUtilities; import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; import org.grobid.core.utilities.*; import org.grobid.core.utilities.matching.EntityMatcherException; import org.grobid.core.utilities.matching.ReferenceMarkerMatcher; -import org.grobid.core.engines.citations.CalloutAnalyzer.MarkerType; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,10 +42,8 @@ import java.util.regex.Pattern; import java.util.stream.Collectors; +import static org.grobid.core.document.xml.XmlBuilderUtils.*; -import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; -import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId; -import static org.grobid.core.document.xml.XmlBuilderUtils.textNode; /** * Class for generating a TEI representation of a document. @@ -58,6 +52,10 @@ @SuppressWarnings("StringConcatenationInsideStringBuilderAppend") public class TEIFormatter { private static final Logger LOGGER = LoggerFactory.getLogger(TEIFormatter.class); + public static final String TEI_STYLE_ITALIC_NAME = "italic"; + public static String TEI_STYLE_BOLD_NAME = "bold"; + public static String TEI_STYLE_SUPERSCRIPT_NAME = "superscript"; + public static String TEI_STYLE_SUBSCRIPT_NAME = "subscript"; private Document doc = null; private FullTextParser fullTextParser = null; @@ -169,7 +167,33 @@ public StringBuilder toTEIHeader(BiblioItem biblio, } if (biblio.getTitle() != null) { - tei.append(TextUtilities.HTMLEncode(biblio.getTitle())); + List layoutTokens = biblio.getLayoutTokens(TaggingLabels.HEADER_TITLE); + + String text = LayoutTokensUtil.toText(layoutTokens).replace("\n", " "); + + List> stylesList = extractStylesList(layoutTokens); + + if (CollectionUtils.isNotEmpty(stylesList)) { + int lastPosition = 0; + for (Triple style : stylesList) { + OffsetPosition offsetStyle = style.getRight(); + String subString = text.substring(lastPosition, offsetStyle.start); + String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : ""; + String suffixSpace = StringUtils.endsWith(subString, " ") ? " " : ""; + tei.append(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " ")) + suffixSpace); + tei.append("") + .append(StringUtils.normalizeSpace(text.substring(offsetStyle.start, offsetStyle.end).replace("\n", " "))) + .append(""); + lastPosition = offsetStyle.end; + } + String subString = text.substring(lastPosition); + String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : ""; + tei.append(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " "))); + + } else { + String title = biblio.getTitle(); + tei.append(TextUtilities.HTMLEncode(title)); + } } tei.append("\n"); @@ -268,8 +292,8 @@ public StringBuilder toTEIHeader(BiblioItem biblio, // We introduce something more meaningful with TEI customization to encode copyrights information: // - @resp with value "publisher", "authors", "unknown", we add a comment to clarify that @resp // should be interpreted as the copyrights owner - // - license related to copyrights exception is encoded via - // (note: I have no clue what can mean "free" as status for a document - there are always some sort of + // - license related to copyrights exception is encoded via + // (note: I have no clue what can mean "free" as status for a document - there are always some sort of // restrictions like moral rights even for public domain documents) if (copyrightsLicense != null) { tei.append("\t\t\t\t\n"); tei.append("\t\t\t\t\t\n"); - + if (defaultPublicationStatement != null) { tei.append("\t\t\t\t\t

" + TextUtilities.HTMLEncode(defaultPublicationStatement) + "

\n"); @@ -930,8 +954,8 @@ else if (biblio.getE_Year().length() == 4) bds, false, new LayoutTokenization(biblio.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT)), - null, - null, + null, + null, null, null, markerTypes, @@ -1075,7 +1099,7 @@ public StringBuilder toTEIBody(StringBuilder buffer, protected List getTeiNotes(Document doc) { // There are two types of structured notes currently supported, foot notes and margin notes. // We consider that head notes are always only presentation matter and are never references - // in a text body. + // in a text body. SortedSet documentNoteParts = doc.getDocumentPart(SegmentationLabels.FOOTNOTE); List notes = getTeiNotes(doc, documentNoteParts, Note.NoteType.FOOT); @@ -1119,7 +1143,7 @@ protected List getTeiNotes(Document doc, SortedSet document if (localNotes != null) notes.addAll(localNotes); } - + notes.stream() .forEach(n -> n.setText(TextUtilities.dehyphenize(n.getText()))); @@ -1175,13 +1199,13 @@ protected List makeNotes(List noteTokens, String footText, No Note localNote = null; if (currentNumber == -1) localNote = new Note(null, noteTokens, footText, noteType); - else + else localNote = new Note(""+currentNumber, noteTokens, footText, noteType); notes.add(localNote); // add possible subsequent notes concatenated in the same note sequence (this is a common error, - // which is addressed here by heuristics, it may not be necessary in the future with a better + // which is addressed here by heuristics, it may not be necessary in the future with a better // segmentation model using more footnotes training data) if (currentNumber != -1) { String nextLabel = " " + (currentNumber+1); @@ -1191,7 +1215,7 @@ protected List makeNotes(List noteTokens, String footText, No int nextFootnoteLabelIndex = footText.indexOf(nextLabel); if (nextFootnoteLabelIndex != -1) { - // optionally we could restrict here to superscript numbers + // optionally we could restrict here to superscript numbers // review local note localNote.setText(footText.substring(0, nextFootnoteLabelIndex)); int pos = 0; @@ -1233,9 +1257,9 @@ private StringBuilder toTEINote(StringBuilder tei, List markerTypes, GrobidAnalysisConfig config) throws Exception { // pattern is - // or + // or // pattern is - + // if no note label is found, no @n attribute but we generate a random xml:id (not be used currently) for (Note note : notes) { @@ -1247,20 +1271,20 @@ private StringBuilder toTEINote(StringBuilder tei, addXmlId(desc, note.getIdentifier()); - // this is a paragraph element for storing text content of the note, which is + // this is a paragraph element for storing text content of the note, which is // better practice than just putting the text under the element Element pNote = XmlBuilderUtils.teiElement("p"); if (config.isGenerateTeiIds()) { String pID = KeyGen.getKey().substring(0, 7); addXmlId(pNote, "_" + pID); } - + if (config.isGenerateTeiCoordinates("p")) { String coords = LayoutTokensUtil.getCoordsString(note.getTokens()); desc.addAttribute(new Attribute("coords", coords)); } - - // for labelling bibliographical references in notes + + // for labelling bibliographical references in notes List noteTokens = note.getTokens(); String coords = null; @@ -1349,7 +1373,7 @@ public StringBuilder processTEIDivSection(String xmlType, StringBuilder contentBuffer = new StringBuilder(); contentBuffer = toTEITextPiece(contentBuffer, text, null, biblioData, false, - new LayoutTokenization(tokens), null, null, null, + new LayoutTokenization(tokens), null, null, null, null, null, doc, config); String result = contentBuffer.toString(); String[] resultAsArray = result.split("\n"); @@ -1433,17 +1457,25 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, TaggingLabel clusterLabel = cluster.getTaggingLabel(); Engine.getCntManager().i(clusterLabel); if (clusterLabel.equals(TaggingLabels.SECTION)) { - String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens()); + List dehyphenized = LayoutTokensUtil.dehyphenize(cluster.concatTokens()); + String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " "); + curDiv = teiElement("div"); Element head = teiElement("head"); // section numbers - org.grobid.core.utilities.Pair numb = getSectionNumber(clusterContent); + Pair, String> numb = getSectionNumber(dehyphenized); if (numb != null) { - head.addAttribute(new Attribute("n", numb.b)); - head.appendChild(numb.a); - } else { - head.appendChild(clusterContent); + head.addAttribute(new Attribute("n", numb.getRight())); + dehyphenized = numb.getLeft(); + text = LayoutTokensUtil.toText(dehyphenized); } +// List> stylesList = extractStylesList(dehyphenized); + +// if (CollectionUtils.isNotEmpty(stylesList)) { +// applyStyleList(head, text, stylesList); +// } else { + head.appendChild(StringUtils.normalizeSpace(text.replace("\n", ""))); +// } if (config.isGenerateTeiIds()) { String divID = KeyGen.getKey().substring(0, 7); @@ -1451,10 +1483,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, } if (config.isGenerateTeiCoordinates("head") ) { - String coords = LayoutTokensUtil.getCoordsString(cluster.concatTokens()); - if (coords != null) { - head.addAttribute(new Attribute("coords", coords)); - } + head.addAttribute(new Attribute("coords", LayoutTokensUtil.getCoordsString(cluster.concatTokens()))); } curDiv.appendChild(head); @@ -1463,13 +1492,13 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, clusterLabel.equals(TaggingLabels.EQUATION_LABEL)) { // get starting position of the cluster int start = -1; - if ( (cluster.concatTokens() != null) && (cluster.concatTokens().size() > 0) ) { + if ( CollectionUtils.isNotEmpty(cluster.concatTokens()) ) { start = cluster.concatTokens().get(0).getOffset(); } // get the corresponding equation if (start != -1) { Equation theEquation = null; - if (equations != null) { + if (CollectionUtils.isNotEmpty(equations)) { for(int i=0; i> stylesList = extractStylesList(cluster.concatTokens()); + + if (CollectionUtils.isNotEmpty(stylesList)) { + applyStyleList(itemNode, text, stylesList); + } else { + itemNode.appendChild(StringUtils.normalizeSpace(text)); + } + if (!MARKER_LABELS.contains(lastClusterLabel) && (lastClusterLabel != TaggingLabels.ITEM)) { curList = teiElement("list"); curDiv.appendChild(curList); @@ -1509,17 +1546,20 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, curDiv.appendChild(note); } else if (clusterLabel.equals(TaggingLabels.PARAGRAPH)) { List clusterTokens = cluster.concatTokens(); - int clusterPage = Iterables.getLast(clusterTokens).getPage(); + List dehyphenized = LayoutTokensUtil.dehyphenize(clusterTokens); + int clusterPage = Iterables.getLast(dehyphenized).getPage(); List notesSamePage = null; if (CollectionUtils.isNotEmpty(notes)) { notesSamePage = notes.stream() - .filter(f -> !f.isIgnored() && f.getPageNumber() == clusterPage) - .collect(Collectors.toList()); + .filter(f -> !f.isIgnored() && f.getPageNumber() == clusterPage) + .collect(Collectors.toList()); } if (notesSamePage == null) { - String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens); + + String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " "); + if (isNewParagraph(lastClusterLabel, curParagraph)) { if (curParagraph != null && config.isWithSentenceSegmentation()) { segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage()); @@ -1529,12 +1569,12 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, String divID = KeyGen.getKey().substring(0, 7); addXmlId(curParagraph, "_" + divID); } - + if (config.isGenerateTeiCoordinates("p")) { String coords = LayoutTokensUtil.getCoordsString(clusterTokens); curParagraph.addAttribute(new Attribute("coords", coords)); } - + curDiv.appendChild(curParagraph); curParagraphTokens = new ArrayList<>(); } else { @@ -1545,8 +1585,15 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, } } } - curParagraph.appendChild(clusterContent); - curParagraphTokens.addAll(clusterTokens); + + List> stylesList = extractStylesList(dehyphenized); + + if (CollectionUtils.isNotEmpty(stylesList)) { + applyStyleList(curParagraph, text, stylesList); + } else { + curParagraph.appendChild(StringUtils.normalizeSpace(text)); + } + curParagraphTokens.addAll(cluster.concatTokens()); } else { if (isNewParagraph(lastClusterLabel, curParagraph)) { if (curParagraph != null && config.isWithSentenceSegmentation()) { @@ -1562,15 +1609,15 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, String coords = LayoutTokensUtil.getCoordsString(clusterTokens); curParagraph.addAttribute(new Attribute("coords", coords)); } - + curDiv.appendChild(curParagraph); curParagraphTokens = new ArrayList<>(); } // we need to cover several footnote callouts in the same paragraph segment - // we also can't assume notes are sorted and will appear first in the text as the same order - // they are defined in the note areas - this might not always be the case in + // we also can't assume notes are sorted and will appear first in the text as the same order + // they are defined in the note areas - this might not always be the case in // ill-formed documents // map the matched note labels to their corresponding note objects @@ -1581,13 +1628,13 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, List> matchedLabelPosition = new ArrayList<>(); for (Note note : notesSamePage) { - Optional matching = clusterTokens + Optional matching = dehyphenized .stream() .filter(t -> t.getText().equals(note.getLabel()) && t.isSuperscript()) .findFirst(); if (matching.isPresent()) { - int idx = clusterTokens.indexOf(matching.get()); + int idx = dehyphenized.indexOf(matching.get()); note.setIgnored(true); OffsetPosition matchingPosition = new OffsetPosition(); matchingPosition.start = idx; @@ -1611,8 +1658,8 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, Note note = labels2Notes.get(matching.getLeft()); OffsetPosition matchingPosition = matching.getRight(); - List before = clusterTokens.subList(pos, matchingPosition.start); - String clusterContentBefore = LayoutTokensUtil.normalizeDehyphenizeText(before); + List before = dehyphenized.subList(pos, matchingPosition.start); + String clusterContentBefore = LayoutTokensUtil.toText(before); if (CollectionUtils.isNotEmpty(before) && before.get(0).getText().equals(" ")) { curParagraph.appendChild(new Text(" ")); @@ -1625,10 +1672,18 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, curParagraph.addAttribute(new Attribute("coords", curParagraph.getAttributeValue("coords") + ";" + coords)); } } - + curParagraphTokens.addAll(before); + List> stylesList = extractStylesList(before); - List calloutTokens = clusterTokens.subList(matchingPosition.start, matchingPosition.end); + if (CollectionUtils.isNotEmpty(stylesList)) { + applyStyleList(curParagraph, clusterContentBefore, stylesList); + } else { + curParagraph.appendChild(StringUtils.normalizeSpace(clusterContentBefore)); + } + curParagraphTokens.addAll(cluster.concatTokens()); + + List calloutTokens = dehyphenized.subList(matchingPosition.start, matchingPosition.end); Element ref = teiElement("ref"); ref.addAttribute(new Attribute("type", "foot")); @@ -1644,12 +1699,12 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, ref.addAttribute(new Attribute("target", "#" + note.getIdentifier())); curParagraph.appendChild(ref); - pos = matchingPosition.end; + pos = matchingPosition.end; } // add last chunk of paragraph stuff (or whole paragraph if no note callout matching) - List remaining = clusterTokens.subList(pos, clusterTokens.size()); - String remainingClusterContent = LayoutTokensUtil.normalizeDehyphenizeText(remaining); + List remaining = dehyphenized.subList(pos, dehyphenized.size()); + String remainingClusterContent = LayoutTokensUtil.toText(remaining); if (CollectionUtils.isNotEmpty(remaining) && remaining.get(0).getText().equals(" ")) { curParagraph.appendChild(new Text(" ")); @@ -1664,6 +1719,16 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, curParagraph.appendChild(remainingClusterContent); curParagraphTokens.addAll(remaining); + List> stylesList = extractStylesList(remaining); + + if (CollectionUtils.isNotEmpty(stylesList)) { + applyStyleList(curParagraph, remainingClusterContent, stylesList); + } else { + curParagraph.appendChild(StringUtils.normalizeSpace(remainingClusterContent)); + } + curParagraphTokens.addAll(cluster.concatTokens()); + + } } else if (MARKER_LABELS.contains(clusterLabel)) { List refTokens = cluster.concatTokens(); @@ -1696,13 +1761,13 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, } else { throw new IllegalStateException("Unsupported marker type: " + clusterLabel); } - + if (refNodes != null) { boolean footNoteCallout = false; if (refNodes.size() == 1 && (refNodes.get(0) instanceof Text)) { - // filtered out superscript reference marker (based on the defined citationMarkerType) might - // be foot note callout - se we need in this particular case to try to match existing notes + // filtered out superscript reference marker (based on the defined citationMarkerType) might + // be footnote callout - se we need in this particular case to try to match existing notes // similarly as within paragraph if (citationMarkerType == null || citationMarkerType != MarkerType.SUPERSCRIPT_NUMBER) { // is refTokens superscript? @@ -1721,7 +1786,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, if (chunkRefString.trim().equals(note.getLabel())) { footNoteCallout = true; note.setIgnored(true); - + Element ref = teiElement("ref"); ref.addAttribute(new Attribute("type", "foot")); @@ -1743,16 +1808,16 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, } } } - } + } } if (!footNoteCallout) { for (Node n : refNodes) { parent.appendChild(n); } - } + } } - + if (curParagraph != null) curParagraphTokens.addAll(cluster.concatTokens()); } else if (clusterLabel.equals(TaggingLabels.FIGURE) || clusterLabel.equals(TaggingLabels.TABLE)) { @@ -1827,6 +1892,43 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, return buffer; } + /** + * Apply the styles as described in the stylesList. + * This method modifies the input paragraphElem. + */ + public static Element applyStyleList(Element paragraphElem, String text, List> stylesList) { +// if (CollectionUtils.isEmpty(stylesList)) { +// paragraphElem.appendChild(StringUtils.normalizeSpace(paragraphText)); +// return paragraphElem; +// } + + int lastPosition = 0; + for (Triple style : stylesList) { + OffsetPosition offsetStyle = style.getRight(); + String subString = text.substring(lastPosition, offsetStyle.start); + String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : ""; + String suffixSpace = ""; + if (subString.length() > prefixSpace.length()) { + suffixSpace = StringUtils.endsWith(subString, " ") ? " " : ""; + } + paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " ")) + suffixSpace); + Element rend = teiElement("hi"); + rend.addAttribute(new Attribute("rend", style.getLeft())); + rend.appendChild(StringUtils.normalizeSpace(text.substring(offsetStyle.start, offsetStyle.end).replace("\n", " "))); + lastPosition = offsetStyle.end; + paragraphElem.appendChild(rend); + } + String subString = text.substring(lastPosition); + String subStringNormalized = StringUtils.normalizeSpace(subString); + String prefixSpace = ""; + if (subStringNormalized.length() > 0) { + prefixSpace = StringUtils.startsWith(subString, " ") ? " " : ""; + } + paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " "))); + + return paragraphElem; + } + public static boolean isNewParagraph(TaggingLabel lastClusterLabel, Element curParagraph) { return (!MARKER_LABELS.contains(lastClusterLabel) && lastClusterLabel != TaggingLabels.FIGURE && lastClusterLabel != TaggingLabels.TABLE) || curParagraph == null; @@ -1841,36 +1943,24 @@ public void segmentIntoSentences(Element curParagraph, List curPara // in xom, the following gives all the text under the element, for the whole subtree String text = curParagraph.getValue(); - if (text == null || text.length() == 0) + if (StringUtils.isEmpty(text)) return; - // identify ref nodes, ref spans and ref positions - Map mapRefNodes = new HashMap<>(); - List refPositions = new ArrayList<>(); - List forbiddenPositions = new ArrayList<>(); - int pos = 0; - for(int i=0; i> rawMapRefNodes = identifyNestedNodes(curParagraph); - String chunk = theNode.getValue(); - forbiddenPositions.add(new OffsetPosition(pos, pos+chunk.length())); - pos += chunk.length(); - } - } - } + List forbiddenPositions = rawMapRefNodes.entrySet() + .stream() + .filter(entry -> ((Element) entry.getValue().getLeft()).getLocalName().equals("ref")) + .map(entry -> new OffsetPosition(entry.getKey(), entry.getValue().getRight().length() + entry.getKey())) + .collect(Collectors.toList()); - List theSentences = + List sentencesOffsetPosition = SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang)); - + + Map> mapRefNodes = splitMapNodesOverSentenceSplits(rawMapRefNodes, text, sentencesOffsetPosition); + + List refPositions = mapRefNodes.keySet().stream().sorted().collect(Collectors.toList()); + /*if (theSentences.size() == 0) { // this should normally not happen, but it happens (depending on sentence splitter, usually the text // is just a punctuation) @@ -1879,48 +1969,7 @@ public void segmentIntoSentences(Element curParagraph, List curPara }*/ // segment the list of layout tokens according to the sentence segmentation if the coordinates are needed - List> segmentedParagraphTokens = new ArrayList<>(); - List currentSentenceTokens = new ArrayList<>(); - pos = 0; - - if (config.isGenerateTeiCoordinates("s")) { - - int currentSentenceIndex = 0; - String sentenceChunk = text.substring(theSentences.get(currentSentenceIndex).start, theSentences.get(currentSentenceIndex).end); - - for(int i=0; i 0) { - segmentedParagraphTokens.add(currentSentenceTokens); - currentSentenceIndex++; - if (currentSentenceIndex >= theSentences.size()) { - currentSentenceTokens = new ArrayList<>(); - break; - } - sentenceChunk = text.substring(theSentences.get(currentSentenceIndex).start, theSentences.get(currentSentenceIndex).end); - } - currentSentenceTokens = new ArrayList<>(); - currentSentenceTokens.add(token); - pos = 0; - } - - if (currentSentenceIndex >= theSentences.size()) - break; - } - // last sentence - if (currentSentenceTokens.size() > 0) { - // check sentence index too ? - segmentedParagraphTokens.add(currentSentenceTokens); - } + List> segmentedParagraphTokens = segmentLayoutTokenLists(curParagraphTokens, text, sentencesOffsetPosition); /*if (segmentedParagraphTokens.size() != theSentences.size()) { System.out.println("ERROR, segmentedParagraphTokens size:" + segmentedParagraphTokens.size() + " vs theSentences size: " + theSentences.size()); @@ -1937,44 +1986,42 @@ public void segmentIntoSentences(Element curParagraph, List curPara k++; } }*/ - } + // update the xml paragraph element int currenChildIndex = 0; - pos = 0; + int pos = 0; int posInSentence = 0; int refIndex = 0; - for(int i=0; i=i+1) { - currentSentenceTokens = segmentedParagraphTokens.get(i); - String coords = LayoutTokensUtil.getCoordsString(currentSentenceTokens); - if (coords != null) { - sentenceElement.addAttribute(new Attribute("coords", coords)); - } + List currentSentenceTokens = segmentedParagraphTokens.get(i); + sentenceElement.addAttribute(new Attribute("coords", LayoutTokensUtil.getCoordsString(currentSentenceTokens))); } } - - int sentenceLength = theSentences.get(i).end - pos; + + int sentenceLength = sentencesOffsetPosition.get(i).end - pos; // check if we have a ref between pos and pos+sentenceLength for(int j=refIndex; j= pos+posInSentence && refPos <= pos+sentenceLength) { - Node valueNode = mapRefNodes.get(Integer.valueOf(refPos)); + Node valueNode = mapRefNodes.get(refPos).getLeft(); if (pos+posInSentence < refPos) { - String local_text_chunk = text.substring(pos+posInSentence, refPos); - local_text_chunk = XmlBuilderUtils.stripNonValidXMLCharacters(local_text_chunk); - sentenceElement.appendChild(local_text_chunk); + String localTextChunk = text.substring(pos+posInSentence, refPos); + localTextChunk = XmlBuilderUtils.stripNonValidXMLCharacters(localTextChunk); + sentenceElement.appendChild(localTextChunk); } valueNode.detach(); sentenceElement.appendChild(valueNode); @@ -1986,10 +2033,10 @@ public void segmentIntoSentences(Element curParagraph, List curPara } } - if (pos+posInSentence <= theSentences.get(i).end) { - String local_text_chunk = text.substring(pos+posInSentence, theSentences.get(i).end); - local_text_chunk = XmlBuilderUtils.stripNonValidXMLCharacters(local_text_chunk); - sentenceElement.appendChild(local_text_chunk); + if (pos + posInSentence <= sentencesOffsetPosition.get(i).end) { + String localTextChunk = text.substring(pos + posInSentence, sentencesOffsetPosition.get(i).end); + localTextChunk = XmlBuilderUtils.stripNonValidXMLCharacters(localTextChunk); + sentenceElement.appendChild(localTextChunk); curParagraph.appendChild(sentenceElement); } } @@ -2005,8 +2052,278 @@ public void segmentIntoSentences(Element curParagraph, List curPara } } } + } + + /** + * Adjust the nodes that could be over a sentence split. + * We know that refs cannot be split over sentences, so we can ignore them happily + **/ + protected Map> splitMapNodesOverSentenceSplits(Map> mapRefNodes, String text, List sentencesOffsetPosition) { + Map> adjustedMap = new TreeMap<>(); + + StringBuilder textAccumulator = new StringBuilder(); + List refPositions = mapRefNodes.keySet().stream().sorted().collect(Collectors.toList()); + + int currentNodeIdx = 0; + int previousSentenceOffsetStart = 0; + int previousPosInSentence = 0; + for(int i=0; i sentenceOffsetEnd) { + currentNodeIdx = j; + break; + } + adjustedMap.put(refPos, mapRefNodes.get(refPos)); + if (textAccumulator.length() < refPos) { + textAccumulator.append(text, textAccumulator.length(), refPos); + } + textAccumulator.append(mapRefNodes.get(refPos).getRight()); + posInSentence = refPos + currentNodeLength - sentenceOffsetStart; + continue; + } + + //The ref position is falling between sentence start and end + if (refPos >= sentenceOffsetStart+posInSentence && refPos < sentenceOffsetEnd) { + + //adding what's before the refPos to the accumulator + if (refPos > sentenceOffsetStart + posInSentence) { + textAccumulator.append(text, sentenceOffsetStart + posInSentence, refPos); + posInSentence = refPos - sentenceOffsetStart; + } + + //the node finishes before sentence ends - all good here :-) + if (sentenceOffsetStart + posInSentence + currentNodeLength < sentenceOffsetEnd) { + adjustedMap.put(refPos, mapRefNodes.get(refPos)); + textAccumulator.append(mapRefNodes.get(refPos).getRight()); + posInSentence = refPos + currentNodeLength - sentenceOffsetStart; + } else { + //The node exceed the sentence, we are in trouble! Cut it! + int splitElementSize = sentenceOffsetEnd - refPos; + + String substringPrefix = currentNode.getValue().substring(0, splitElementSize); + Element newElementPrefix = generateNewElement((Element) currentNode, substringPrefix); + adjustedMap.put(refPos, Pair.of(newElementPrefix, substringPrefix)); + textAccumulator.append(substringPrefix); + posInSentence = refPos + newElementPrefix.getValue().length() - sentenceOffsetStart; + currentNodeIdx = j; + break; + } + } else if (refPos > sentenceOffsetEnd) { + // add to accumulator the rest of the sentence and moving on to the next sentence + String textChunk = text.substring(sentenceOffsetStart + posInSentence, sentenceOffsetEnd); + textAccumulator.append(textChunk); + posInSentence += textChunk.length(); + currentNodeIdx = j; + break; + } else if (refPos < sentenceOffsetStart + && textAccumulator.length() > refPos + && textAccumulator.length() < refPos + currentNodeLength) { + //The node is between this sentence and the previous one - trouble again dude + + String exceeded = textAccumulator.substring(0, refPos) + mapRefNodes.get(refPos).getLeft().getValue(); + + if (exceeded.length() > sentenceOffsetEnd) { + String previousNodeSuffix = exceeded.substring(sentenceOffsetStart, sentenceOffsetEnd); + Element newElementSuffix = generateNewElement((Element) currentNode, previousNodeSuffix); + adjustedMap.put(sentenceOffsetStart, Pair.of(newElementSuffix, previousNodeSuffix)); + if (textAccumulator.length() < sentenceOffsetStart) { + textAccumulator.append(exceeded, textAccumulator.length(), sentenceOffsetStart); + } + textAccumulator.append(previousNodeSuffix); + + posInSentence = textAccumulator.length() - sentenceOffsetStart; + currentNodeIdx = j; + break; + } else { + //The item is within this sentence. Cool stuff. + String previousNodeSuffix = exceeded.substring(sentenceOffsetStart); + Element newElementSuffix = generateNewElement((Element) currentNode, previousNodeSuffix); + adjustedMap.put(sentenceOffsetStart, Pair.of(newElementSuffix, previousNodeSuffix)); + if (textAccumulator.length() < sentenceOffsetStart) { + textAccumulator.append(exceeded, textAccumulator.length(), sentenceOffsetStart); + } + textAccumulator.append(previousNodeSuffix); + posInSentence = textAccumulator.length() - sentenceOffsetStart; + } + } + } + previousSentenceOffsetStart = sentenceOffsetStart; + previousPosInSentence = posInSentence; + + if (sentenceOffsetStart + posInSentence < sentenceOffsetEnd) { + textAccumulator.append(text, sentenceOffsetStart + posInSentence, sentencesOffsetPosition.get(i).end); + } + } + + return adjustedMap; + } + + private Element generateNewElement(Element currentNode, String value) { + Element newElement = teiElement(currentNode.getLocalName(), value); + for (int i=0; i < currentNode.getAttributeCount(); i++) { + Attribute a = new Attribute(currentNode.getAttribute(i)); + newElement.addAttribute(a); + } + return newElement; + } + + protected Map> identifyNestedNodes(Element curParagraph) { + // identify ref nodes, ref spans and ref positions + Map> mapNodes = new HashMap<>(); + + int pos = 0; + for(int i = 0; i< curParagraph.getChildCount(); i++) { + Node theNode = curParagraph.getChild(i); + if (theNode instanceof Text) { + String chunk = theNode.getValue(); + pos += chunk.length(); + } else if (theNode instanceof Element) { + // for readability in another conditional + if (((Element) theNode).getLocalName().equals("ref")) { + String chunk = theNode.getValue(); + // map character offset of the node and the chunk text + mapNodes.put(pos, Pair.of(theNode, chunk)); + + pos += chunk.length(); + } else if (((Element) theNode).getLocalName().equals("hi")) { + String chunk = theNode.getValue(); + mapNodes.put(pos, Pair.of(theNode, chunk)); + + pos += chunk.length(); + } + } + } + + return mapNodes; + } + + private List> segmentLayoutTokenLists(List curParagraphTokens, String text, List sentencesOffsetPosition) { + int pos; + List> segmentedParagraphTokens = new ArrayList<>(); + List currentSentenceTokens = new ArrayList<>(); + pos = 0; + + int currentSentenceIndex = 0; +//System.out.println(text); +//System.out.println("theSentences.size(): " + theSentences.size()); + String sentenceChunk = text.substring(sentencesOffsetPosition.get(currentSentenceIndex).start, + sentencesOffsetPosition.get(currentSentenceIndex).end); + + for (LayoutToken token : curParagraphTokens) { + if (StringUtils.isEmpty(token.getText())) + continue; + + int newPos = sentenceChunk.indexOf(token.getText(), pos); + if ((newPos != -1) || SentenceUtilities.toSkipToken(token.getText())) { + // just move on + currentSentenceTokens.add(token); + if (newPos != -1 && !SentenceUtilities.toSkipToken(token.getText())) + pos = newPos; + } else { + if (currentSentenceTokens.size() > 0) { + segmentedParagraphTokens.add(currentSentenceTokens); + currentSentenceIndex++; + if (currentSentenceIndex >= sentencesOffsetPosition.size()) { + currentSentenceTokens = new ArrayList<>(); + break; + } + sentenceChunk = text.substring(sentencesOffsetPosition.get(currentSentenceIndex).start, sentencesOffsetPosition.get(currentSentenceIndex).end); + } + currentSentenceTokens = new ArrayList<>(); + currentSentenceTokens.add(token); + pos = 0; + } + + if (currentSentenceIndex >= sentencesOffsetPosition.size()) + break; + } + // last sentence + if (currentSentenceTokens.size() > 0) { + // check sentence index too ? + segmentedParagraphTokens.add(currentSentenceTokens); + } + return segmentedParagraphTokens; + } + + public static List> extractStylesList(List tokenList) { + return extractStylesList(tokenList, new ArrayList<>()); + } - } + + /** + * Extracts the stiles from the list of token. The additional parameter can ignore certain styles + * (e.g. to restrict only superscript/subscript when decorating formulas) + */ + public static List> extractStylesList(List tokenList, List ignoreStyles) { + List> styleList = new ArrayList<>(); + String previousStyleName = ""; + StringBuilder temporaryText = new StringBuilder(); + StringBuilder value = new StringBuilder(); + + for (int index = 0; index < tokenList.size(); index++) { + LayoutToken token = tokenList.get(index); + int startOffset = temporaryText.toString().length(); + temporaryText.append(token.getText()); + int endOffset = temporaryText.toString().length(); + + if (token.getText().equals(" ") || token.getText().equals("\n")) { + if (value.length() > 0) { + value.append(token.getText()); + } + continue; + } + + StringBuilder styleName = new StringBuilder(); + if (token.isBold() && !ignoreStyles.contains(TEI_STYLE_BOLD_NAME)) { + styleName.append(TEI_STYLE_BOLD_NAME).append(" "); + } + + if (token.isItalic() && !ignoreStyles.contains(TEI_STYLE_ITALIC_NAME)) { + styleName.append(TEI_STYLE_ITALIC_NAME).append(" "); + } + + if(token.isSuperscript() && !ignoreStyles.contains(TEI_STYLE_SUPERSCRIPT_NAME)) { + styleName.append(TEI_STYLE_SUPERSCRIPT_NAME); + } else if(token.isSubscript() && !ignoreStyles.contains(TEI_STYLE_SUBSCRIPT_NAME)) { + styleName.append(TEI_STYLE_SUBSCRIPT_NAME); + } + + String styleNameTrimmed = StringUtils.trim(styleName.toString()); + value.append(token.getText()); + + if (StringUtils.isEmpty(styleNameTrimmed)) { + previousStyleName = styleNameTrimmed; + value = new StringBuilder(); + continue; + } + + if (styleNameTrimmed.equals(previousStyleName)) { + Triple last = Iterables.getLast(styleList); + styleList.set(styleList.size()-1, Triple.of(last.getLeft(), value.toString(), new OffsetPosition(last.getRight().start, endOffset))); + } else { + styleList.add(Triple.of(styleNameTrimmed, value.toString(), new OffsetPosition(startOffset, endOffset))); +// value = new StringBuilder(); + } + + previousStyleName = styleNameTrimmed; + } +// List> postProcessedStyleList = styleList.stream().map(s -> Triple.of(s.getLeft(), s.getMiddle().substring(s.getRight().start, s.getRight().end), s.getRight())).collect(Collectors.toList()); + + return styleList; + } /** * Return the graphic objects in a given interval position in the document. @@ -2024,26 +2341,46 @@ private List getGraphicObject(List graphicObjects, return result; } - private org.grobid.core.utilities.Pair getSectionNumber(String text) { + protected Pair, String> getSectionNumber(List tokens) { + + String text = LayoutTokensUtil.toText(tokens); + Matcher m1 = BasicStructureBuilder.headerNumbering1.matcher(text); Matcher m2 = BasicStructureBuilder.headerNumbering2.matcher(text); Matcher m3 = BasicStructureBuilder.headerNumbering3.matcher(text); Matcher m = null; + OffsetPosition position = null; String numb = null; if (m1.find()) { numb = m1.group(0); + position = new OffsetPosition(m1.start(), m1.end()); m = m1; } else if (m2.find()) { numb = m2.group(0); + position = new OffsetPosition(m2.start(), m2.end()); m = m2; } else if (m3.find()) { numb = m3.group(0); + position = new OffsetPosition(m3.start(), m3.end()); m = m3; } if (numb != null) { - text = text.replace(numb, "").trim(); + int lastPosition = 0; + StringBuilder acc = new StringBuilder(); + List tokensWithoutSectionNumbers = new ArrayList<>(); + for (int idx=0; idx < tokens.size(); idx++) { + if (!(lastPosition >= position.start && lastPosition < position.end )) { + if (!(tokensWithoutSectionNumbers.size() == 0 && tokens.get(idx).getText().equals(" "))) { + //adding a space at the beginning of the accumulator should be ignored + tokensWithoutSectionNumbers.add(tokens.get(idx)); + } + } + acc.append(tokens.get(idx).getText()); + lastPosition = acc.toString().length(); + } + numb = numb.replace(" ", ""); - return new org.grobid.core.utilities.Pair<>(text, numb); + return Pair.of(tokensWithoutSectionNumbers, numb); } else { return null; } @@ -2109,7 +2446,7 @@ public List markReferencesTEILuceneBased(List refTokens, if ( (refTokens == null) || (refTokens.size() == 0) ) return null; String text = LayoutTokensUtil.toText(refTokens); - if (text == null || text.trim().length() == 0 || text.endsWith("") || text.startsWith("") || text.startsWith("singletonList(new Text(text)); boolean spaceEnd = false; @@ -2181,11 +2518,11 @@ public List markReferencesTEILuceneBased(List refTokens, } - public List markReferencesFigureTEI(String refText, + public List markReferencesFigureTEI(String refText, List allRefTokens, List
figures, boolean generateCoordinates) { - if (refText == null || + if (refText == null || refText.trim().isEmpty()) { return null; } @@ -2212,7 +2549,7 @@ public List markReferencesFigureTEI(String refText, } if (labels == null || labels.size() <= 1) { - org.grobid.core.utilities.Pair> localLabel = + org.grobid.core.utilities.Pair> localLabel = new org.grobid.core.utilities.Pair(refText, allRefTokens); labels = new ArrayList<>(); labels.add(localLabel); @@ -2260,7 +2597,7 @@ public List markReferencesFigureTEI(String refText, String andWordString = null; if (text.endsWith("and") || text.endsWith("&")) { - // the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk + // the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk if (text.endsWith("and")) { text = text.substring(0, text.length()-3); andWordString = "and"; @@ -2309,7 +2646,7 @@ else if (text.endsWith("&")) { public List markReferencesTableTEI(String refText, List allRefTokens, List tables, boolean generateCoordinates) { - if (refText == null || + if (refText == null || refText.trim().isEmpty()) { return null; } @@ -2336,7 +2673,7 @@ public List markReferencesTableTEI(String refText, List allRe } if (labels == null || labels.size() <= 1) { - org.grobid.core.utilities.Pair> localLabel = + org.grobid.core.utilities.Pair> localLabel = new org.grobid.core.utilities.Pair(refText, allRefTokens); labels = new ArrayList<>(); labels.add(localLabel); @@ -2384,7 +2721,7 @@ public List markReferencesTableTEI(String refText, List allRe String andWordString = null; if (text.endsWith("and") || text.endsWith("&")) { - // the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk + // the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk if (text.endsWith("and")) { text = text.substring(0, text.length()-3); andWordString = "and"; @@ -2422,7 +2759,7 @@ else if (text.endsWith("&")) { if (andWordString != null) { nodes.add(new Text(andWordString)); } - + if (spaceEnd) nodes.add(new Text(" ")); } diff --git a/grobid-core/src/main/java/org/grobid/core/document/xml/XmlBuilderUtils.java b/grobid-core/src/main/java/org/grobid/core/document/xml/XmlBuilderUtils.java index 5d4850f94e..0c549078df 100644 --- a/grobid-core/src/main/java/org/grobid/core/document/xml/XmlBuilderUtils.java +++ b/grobid-core/src/main/java/org/grobid/core/document/xml/XmlBuilderUtils.java @@ -102,7 +102,7 @@ public static void main(String[] args) throws ParsingException, IOException { } public static String stripNonValidXMLCharacters(String in) { - StringBuffer out = new StringBuffer(); // Used to hold the output. + StringBuilder out = new StringBuilder(); // Used to hold the output. char current; // Used to reference the current character. if (in == null || ("".equals(in))) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 28eda7e693..c0e7201fef 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -2382,7 +2382,9 @@ protected List processEquations(String rese, } List tokenizationEquation = cluster.concatTokens(); - String clusterContent = LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(cluster.concatTokens())); + //LF: I removed the normalisation to keep the content in sync with contentTokens. + // The normalisation "StringUtils.normaliseSpaces()" is called anyway when building the XML + String clusterContent = LayoutTokensUtil.toText(cluster.concatTokens()); if (currentResult == null) currentResult = new Equation(); @@ -2398,10 +2400,11 @@ protected List processEquations(String rese, currentResult = new Equation(); } currentResult.appendContent(clusterContent); - currentResult.addLayoutTokens(cluster.concatTokens()); + currentResult.addLayoutTokens(tokenizationEquation); + currentResult.addContentTokens(tokenizationEquation); } else if (clusterLabel.equals(TaggingLabels.EQUATION_LABEL)) { currentResult.appendLabel(clusterContent); - currentResult.addLayoutTokens(cluster.concatTokens()); + currentResult.addLayoutTokens(tokenizationEquation); } lastLabel = clusterLabel; diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index 1e794f8765..fa3669156b 100644 --- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -1,18 +1,34 @@ package org.grobid.core.document; +import nu.xom.Element; +import nu.xom.Node; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.commons.lang3.tuple.Triple; import org.grobid.core.analyzers.GrobidAnalyzer; import org.grobid.core.data.Note; +import org.grobid.core.document.xml.XmlBuilderUtils; +import org.grobid.core.engines.config.GrobidAnalysisConfig; +import org.grobid.core.lang.Language; import org.grobid.core.layout.LayoutToken; import org.grobid.core.utilities.GrobidProperties; import org.grobid.core.utilities.LayoutTokensUtil; +import org.grobid.core.utilities.OffsetPosition; +import org.grobid.core.utilities.SentenceUtilities; import org.junit.BeforeClass; +import org.junit.Ignore; import org.junit.Test; +import java.util.ArrayList; +import java.util.Arrays; import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import static org.grobid.core.document.TEIFormatter.*; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.assertThat; public class TEIFormatterTest { @@ -37,8 +53,8 @@ public void testMakeFootNote() throws Exception { assertThat(LayoutTokensUtil.toText(footnote.getTokens()), is("This is a footnote")); assertThat(footnote.getLabel(), is("1")); } - - + + @Test public void testMakeNotes() throws Exception { String text = "198 U.S. Const. art. I, § §9 & 10. \n199 To be sure, there are revisionist arguments that the Ex Post Facto clause itself extends to retroactive civil laws too. See Eastern Enterprises v. Apfel, 524 U.S. 498, 538-39 (1998) (Thomas, J., concurring). And as with bills of attainder, in the wake of the Civil War the Supreme Court held that Ironclad Oath requirements were ex post facto laws as well. Cummings, 71 U.S. at 326-332; Garland, 71 U.S. at 377-368. But as discussed in the text, even these principles do not ensnare Section Three going forward, on a non-ex-post-facto basis \n200 3 U.S. at 378-80 (arguments of counsel). \n201 Id. \n202 Id. at 382. See Baude & Sachs, Eleventh Amendment, supra note 9, at 626-627. Electronic copy available at: https://ssrn.com/abstract=4532751"; @@ -64,4 +80,552 @@ public void testMakeNotes() throws Exception { + @Test + public void testSegmentIntoSentences_simpleText_ShouldSplitIntoSentencesAndAddSTag() throws Exception { + String text = "One sentence. Second sentence."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + currentParagraph.appendChild(text); + + new TEIFormatter(null, null) + .segmentIntoSentences(currentParagraph, currentParagraphTokens, config, "en"); + + assertThat(currentParagraph.toXML(), is("

One sentence.Second sentence.

")); + assertThat(currentParagraph.getChildElements().size(), is(2)); + } + + @Test + public void testSegmentIntoSentences_Bold_ShouldSplitIntoSentencesAndAddSTag() throws Exception { + String text = "One sentence. Second sentence."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + currentParagraphTokens.get(0).setBold(true); + currentParagraphTokens.get(2).setBold(true); + currentParagraphTokens.get(2).setItalic(true); + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + currentParagraph.appendChild(text); + + new TEIFormatter(null, null) + .segmentIntoSentences(currentParagraph, currentParagraphTokens, config, "en"); + + assertThat(currentParagraph.toXML(), is("

One sentence.Second sentence.

")); + assertThat(currentParagraph.getChildElements().size(), is(2)); + } + + @Test + public void testSegmentIntoSentences_NoStyle_ShouldWork() throws Exception { + String text = "One sentence (Foppiano et al.). Second sentence (Lopez et al.). "; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + currentParagraphTokens.get(0).setBold(true); + currentParagraphTokens.get(2).setBold(true); + currentParagraphTokens.get(2).setItalic(true); + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + currentParagraph.appendChild("One sentence"); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Foppiano et al.)")); + currentParagraph.appendChild(". "); + currentParagraph.appendChild("Second sentence"); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); + currentParagraph.appendChild("."); + + new TEIFormatter(null, null) + .segmentIntoSentences(currentParagraph, currentParagraphTokens, config, "en"); + + assertThat(currentParagraph.toXML(), + is("

One sentence (Foppiano et al.).Second sentence (Lopez et al.).

")); + } + + + @Test + public void testSegmentIntoSentences_Style_ShouldWork() throws Exception { + String text1_0 = "One sentence "; + String text1_1 = ". "; + String text2_0 = "Second sentence "; + String text2_1 = "."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .withSentenceSegmentation(true) + .build(); + + List tokens = new ArrayList<>(); + List currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0); + tokens.addAll(currentParagraphTokens1_0); + List currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1); + tokens.addAll(currentParagraphTokens1_1); + List currentParagraphTokens2_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text2_0); + tokens.addAll(currentParagraphTokens2_0); + List currentParagraphTokens2_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text2_1); + tokens.addAll(currentParagraphTokens2_1); + + currentParagraphTokens1_0.get(0).setBold(true); + currentParagraphTokens1_0.get(2).setBold(true); + currentParagraphTokens1_0.get(2).setItalic(true); + + List> styles1_0 = extractStylesList(currentParagraphTokens1_0); + List> styles1_1 = extractStylesList(currentParagraphTokens1_1); + List> styles2_0 = extractStylesList(currentParagraphTokens2_0); + List> styles2_1 = extractStylesList(currentParagraphTokens2_1); + + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + + applyStyleList(currentParagraph, text1_0, styles1_0); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Foppiano et al.)")); + applyStyleList(currentParagraph, text1_1, styles1_1); + currentParagraph.appendChild(" "); + applyStyleList(currentParagraph, text2_0, styles2_0); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); + applyStyleList(currentParagraph, text2_1, styles2_1); + + //Assuming these are injected correctly + + new TEIFormatter(null, null).segmentIntoSentences(currentParagraph, tokens, config, "en"); + + assertThat(currentParagraph.toXML(), + is("

One sentence (Foppiano et al.).Second sentence (Lopez et al.).

")); + } + + @Test + public void testSegmentIntoSentences_StyleBetweenTwoSentences_ShouldWork() throws Exception { + String text1_0 = "One sentence"; + String text1_1 = ". "; + String text2_0 = "Second sentence"; + String text2_1 = "."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .withSentenceSegmentation(true) + .build(); + + List tokens = new ArrayList<>(); + List currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0); + tokens.addAll(currentParagraphTokens1_0); + List currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1); + tokens.addAll(currentParagraphTokens1_1); + List currentParagraphTokens2_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text2_0); + tokens.addAll(currentParagraphTokens2_0); + List currentParagraphTokens2_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text2_1); + tokens.addAll(currentParagraphTokens2_1); + + currentParagraphTokens1_0.get(0).setBold(true); //One + currentParagraphTokens1_0.get(2).setItalic(true); //sentence + currentParagraphTokens1_1.get(0).setItalic(true); //. + currentParagraphTokens2_0.get(0).setItalic(true); //Second + currentParagraphTokens2_0.get(2).setItalic(true); //sentence + + List> styles1_0 = extractStylesList(currentParagraphTokens1_0); + List> styles1_1 = extractStylesList(currentParagraphTokens1_1); + List> styles2_0 = extractStylesList(currentParagraphTokens2_0); + List> styles2_1 = extractStylesList(currentParagraphTokens2_1); + + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + + applyStyleList(currentParagraph, text1_0, styles1_0); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Foppiano et al.)")); + applyStyleList(currentParagraph, text1_1, styles1_1); + currentParagraph.appendChild(" "); + applyStyleList(currentParagraph, text2_0, styles2_0); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); + applyStyleList(currentParagraph, text2_1, styles2_1); + + //Assuming these are injected correctly + + new TEIFormatter(null, null).segmentIntoSentences(currentParagraph, tokens, config, "en"); + + assertThat(currentParagraph.toXML(), + is("

One sentence (Foppiano et al.).Second sentence (Lopez et al.).

")); + } + + @Test + public void testSegmentIntoSentences_StyleBetweenTwoSentences_oneRef_ShouldWork() throws Exception { + String text1_0 = "One sentence. Second sentence"; + String text1_1 = "."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .withSentenceSegmentation(true) + .build(); + + List tokens = new ArrayList<>(); + List currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0); + tokens.addAll(currentParagraphTokens1_0); + List currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1); + tokens.addAll(currentParagraphTokens1_1); + + currentParagraphTokens1_0.get(0).setBold(true); //One + currentParagraphTokens1_0.get(2).setItalic(true); //sentence + currentParagraphTokens1_0.get(3).setItalic(true); //. + currentParagraphTokens1_0.get(5).setItalic(true); //Second + + List> styles1_0 = extractStylesList(currentParagraphTokens1_0); + List> styles1_1 = extractStylesList(currentParagraphTokens1_1); + + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + + applyStyleList(currentParagraph, text1_0, styles1_0); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); + applyStyleList(currentParagraph, text1_1, styles1_1); + + new TEIFormatter(null, null).segmentIntoSentences(currentParagraph, tokens, config, "en"); + + assertThat(currentParagraph.toXML(), + is("

One sentence.Second sentence (Lopez et al.).

")); + } + + @Test + public void testSegmentIntoSentences_StyleBetweenTwoSentencesWithoutRefs_ShouldWork() throws Exception { + String text = "One sentence. Second sentence."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .withSentenceSegmentation(true) + .build(); + + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + tokens.get(0).setBold(true); //One + tokens.get(2).setItalic(true); //sentence + tokens.get(3).setItalic(true); //. + tokens.get(5).setItalic(true); //Second +// currentParagraphTokens.get(7).setItalic(true); //sentence + + List> styles = extractStylesList(tokens); + + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + + applyStyleList(currentParagraph, text, styles); + + //Assuming these are injected correctly + new TEIFormatter(null, null).segmentIntoSentences(currentParagraph, tokens, config, "en"); + + assertThat(currentParagraph.toXML(), + is("

One sentence.Second sentence.

")); + } + + @Test + public void testSplitMapNodesOverSentenceSplits_shouldAdjustNodes() { + TEIFormatter teiFormatter = new TEIFormatter(null, null); + + String text1_0 = "One sentence. Second sentence"; + String text1_1 = "."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .withSentenceSegmentation(true) + .build(); + + List tokens = new ArrayList<>(); + List currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0); + tokens.addAll(currentParagraphTokens1_0); + List currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1); + tokens.addAll(currentParagraphTokens1_1); + + currentParagraphTokens1_0.get(0).setBold(true); //One + currentParagraphTokens1_0.get(2).setItalic(true); //sentence + currentParagraphTokens1_0.get(3).setItalic(true); //. + currentParagraphTokens1_0.get(5).setItalic(true); //Second + + List> styles1_0 = extractStylesList(currentParagraphTokens1_0); + List> styles1_1 = extractStylesList(currentParagraphTokens1_1); + + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + + applyStyleList(currentParagraph, text1_0, styles1_0); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); + applyStyleList(currentParagraph, text1_1, styles1_1); + + String text = currentParagraph.getValue(); + + Map> nestedNodes = teiFormatter.identifyNestedNodes(currentParagraph); + List forbiddenPositions = nestedNodes.entrySet() + .stream() + .filter(entry -> ((Element) entry.getValue().getLeft()).getLocalName().equals("ref")) + .map(entry -> new OffsetPosition(entry.getKey(), entry.getValue().getRight().length() + entry.getKey())) + .collect(Collectors.toList()); + + List sentencesOffsetPosition = + SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, tokens, new Language("en")); + + Map> adjustedNestedNodes = teiFormatter.splitMapNodesOverSentenceSplits(nestedNodes, text, sentencesOffsetPosition); + + assertThat(adjustedNestedNodes.size(), is(4)); + + assertThat(new ArrayList<>(adjustedNestedNodes.keySet()), is(Arrays.asList(0, 4, 14, 30))); + + assertThat(adjustedNestedNodes.get(0).getRight(), is("One")); + assertThat(adjustedNestedNodes.get(4).getRight(), is("sentence.")); + assertThat(adjustedNestedNodes.get(14).getRight(), is("Second")); + assertThat(adjustedNestedNodes.get(30).getRight(), is("(Lopez et al.)")); + } + + @Test + public void testSplitMapNodesOverThreeSentenceSplits_shouldAdjustNodes() { + TEIFormatter teiFormatter = new TEIFormatter(null, null); + + String text1_0 = "One sentence. Second sentence. Third sentence"; + String text1_1 = "."; + + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .withSentenceSegmentation(true) + .build(); + + List tokens = new ArrayList<>(); + List currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0); + tokens.addAll(currentParagraphTokens1_0); + List currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1); + tokens.addAll(currentParagraphTokens1_1); + + currentParagraphTokens1_0.get(0).setBold(true); //One + currentParagraphTokens1_0.get(2).setItalic(true); //sentence + currentParagraphTokens1_0.get(3).setItalic(true); //. + currentParagraphTokens1_0.get(5).setItalic(true); //Second + currentParagraphTokens1_0.get(7).setItalic(true); //sentence + currentParagraphTokens1_0.get(8).setItalic(true); //. + currentParagraphTokens1_0.get(10).setItalic(true); //Third +// currentParagraphTokens1_0.get(12).setItalic(true); //sentence + + List> styles1_0 = extractStylesList(currentParagraphTokens1_0); + List> styles1_1 = extractStylesList(currentParagraphTokens1_1); + + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + + applyStyleList(currentParagraph, text1_0, styles1_0); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); + applyStyleList(currentParagraph, text1_1, styles1_1); + + String text = currentParagraph.getValue(); + + Map> nestedNodes = teiFormatter.identifyNestedNodes(currentParagraph); + List forbiddenPositions = nestedNodes.entrySet() + .stream() + .filter(entry -> ((Element) entry.getValue().getLeft()).getLocalName().equals("ref")) + .map(entry -> new OffsetPosition(entry.getKey(), entry.getValue().getRight().length() + entry.getKey())) + .collect(Collectors.toList()); + + List sentencesOffsetPosition = + SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, tokens, new Language("en")); + + Map> adjustedNestedNodes = teiFormatter.splitMapNodesOverSentenceSplits(nestedNodes, text, sentencesOffsetPosition); + + assertThat(adjustedNestedNodes.size(), is(5)); + + assertThat(new ArrayList<>(adjustedNestedNodes.keySet()), is(Arrays.asList(0, 4, 14, 31, 46))); + + assertThat(adjustedNestedNodes.get(0).getRight(), is("One")); + assertThat(adjustedNestedNodes.get(4).getRight(), is("sentence.")); + assertThat(adjustedNestedNodes.get(14).getRight(), is("Second sentence.")); + assertThat(adjustedNestedNodes.get(31).getRight(), is("Third")); + assertThat(adjustedNestedNodes.get(46).getRight(), is("(Lopez et al.)")); + } + + @Test + public void testIdentifyRefNotes() throws Exception { + Element currentParagraph = XmlBuilderUtils.teiElement("p"); + currentParagraph.appendChild("One sentence"); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Foppiano et al.)")); + currentParagraph.appendChild(". "); + currentParagraph.appendChild("Second sentence"); + currentParagraph.appendChild(" "); + currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)")); + currentParagraph.appendChild("."); + + Map> integerPairMap = new TEIFormatter(null, null).identifyNestedNodes(currentParagraph); + + assertThat(integerPairMap.keySet(), hasSize(2)); + assertThat(integerPairMap.keySet().stream().toArray()[1], is(13)); + assertThat(integerPairMap.get(13).getRight(), is("(Foppiano et al.)")); + + assertThat(integerPairMap.keySet().stream().toArray()[0], is(48)); + assertThat(integerPairMap.get(48).getRight(), is("(Lopez et al.)")); + } + + @Test + public void testExtractStylesList_single_shouldWork() throws Exception { + String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + currentParagraphTokens.get(26).setSubscript(true); + currentParagraphTokens.get(30).setSubscript(true); + + List> pairs = extractStylesList(currentParagraphTokens); + + assertThat(pairs, hasSize(2)); + assertThat(pairs.get(0).getLeft(), is("subscript")); + assertThat(pairs.get(0).getMiddle(), is("2")); + assertThat(pairs.get(0).getRight().start, is(79)); + assertThat(pairs.get(0).getRight().end, is(80)); + + assertThat(pairs.get(1).getLeft(), is("subscript")); + assertThat(pairs.get(1).getMiddle(), is("14")); + assertThat(pairs.get(1).getRight().start, is(84)); + assertThat(pairs.get(1).getRight().end, is(86)); + } + + @Test + public void applyStyleList_simpleStyles_shouldWork() throws Exception { + String text = "This is bold and italic."; + List> styles = new ArrayList<>(); + styles.add(Triple.of("bold", "bold", new OffsetPosition(8, 12))); + styles.add(Triple.of("italic", "italic", new OffsetPosition(17, 23))); + Element rootElement = XmlBuilderUtils.teiElement("p"); + TEIFormatter.applyStyleList(rootElement, text, styles); + + assertThat(rootElement.toXML(), is("

This is " + + "bold and italic.

")); + } + + @Test + public void applyStyleList_complexStyles_shouldWork() throws Exception { + String text = "This is bold and italic."; + List> styles = new ArrayList<>(); + styles.add(Triple.of("subscript", "is", new OffsetPosition(5, 7))); + styles.add(Triple.of("bold subscript", "bold", new OffsetPosition(8, 12))); + styles.add(Triple.of("italic superscript", "and", new OffsetPosition(13, 16))); + styles.add(Triple.of("italic", "italic", new OffsetPosition(17, 23))); + Element rootElement = XmlBuilderUtils.teiElement("p"); + TEIFormatter.applyStyleList(rootElement, text, styles); + + assertThat(rootElement.toXML(), is("

This " + + "is " + + "bold " + + "and " + + "italic.

")); + } + + @Test + public void testExtractStylesList_combined_shouldWork() throws Exception { + String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + currentParagraphTokens.get(26).setSubscript(true); + currentParagraphTokens.get(26).setBold(true); + currentParagraphTokens.get(26).setItalic(true); + currentParagraphTokens.get(30).setSubscript(true); + + List> pairs = extractStylesList(currentParagraphTokens); + + assertThat(pairs, hasSize(2)); + assertThat(pairs.get(0).getLeft(), is("bold italic subscript")); + assertThat(pairs.get(0).getMiddle(), is("2")); + assertThat(pairs.get(0).getRight().start, is(79)); + assertThat(pairs.get(0).getRight().end, is(80)); + + assertThat(pairs.get(1).getLeft(), is("subscript")); + assertThat(pairs.get(1).getMiddle(), is("14")); + assertThat(pairs.get(1).getRight().start, is(84)); + assertThat(pairs.get(1).getRight().end, is(86)); + } + + @Test + public void testExtractStylesList_continuousTokens_shouldWork() throws Exception { + String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + currentParagraphTokens.get(24).setBold(true); + currentParagraphTokens.get(26).setBold(true); + currentParagraphTokens.get(28).setBold(true); + currentParagraphTokens.get(30).setBold(true); + + List> pairs = extractStylesList(currentParagraphTokens); + + assertThat(pairs, hasSize(1)); + assertThat(pairs.get(0).getLeft(), is("bold")); + assertThat(pairs.get(0).getMiddle(), is("Nd 2 Fe 14")); + assertThat(pairs.get(0).getRight().start, is(76)); + assertThat(pairs.get(0).getRight().end, is(86)); + } + + @Test + public void testExtractStylesList_ignoreBold_shouldWork() throws Exception { + String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure "; + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + currentParagraphTokens.get(26).setSubscript(true); + currentParagraphTokens.get(26).setBold(true); + currentParagraphTokens.get(26).setItalic(true); + currentParagraphTokens.get(30).setSubscript(true); + + List> pairs = extractStylesList(currentParagraphTokens, Arrays.asList(TEI_STYLE_BOLD_NAME)); + + assertThat(pairs, hasSize(2)); + assertThat(pairs.get(0).getLeft(), is("italic subscript")); + assertThat(pairs.get(0).getMiddle(), is("2")); + assertThat(pairs.get(0).getRight().start, is(79)); + assertThat(pairs.get(0).getRight().end, is(80)); + + assertThat(pairs.get(1).getLeft(), is("subscript")); + assertThat(pairs.get(1).getMiddle(), is("14")); + assertThat(pairs.get(1).getRight().start, is(84)); + assertThat(pairs.get(1).getRight().end, is(86)); + } + + @Ignore("The middle is actually not used") + public void testExtractStylesList_checkProducedText_ShouldWork() throws Exception { + String text = "I. Introduction 1.1. Généralités et rappels "; + List textTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + textTokens.get(0).setBold(true); + textTokens.get(1).setBold(true); + textTokens.get(3).setBold(true); + + textTokens.get(6).setItalic(true); + textTokens.get(7).setItalic(true); + textTokens.get(8).setItalic(true); + textTokens.get(9).setItalic(true); + textTokens.get(11).setItalic(true); + textTokens.get(13).setItalic(true); + textTokens.get(15).setItalic(true); + + List> pairs = extractStylesList(textTokens); + + assertThat(pairs, hasSize(2)); + assertThat(pairs.get(0).getLeft(), is("bold")); + assertThat(pairs.get(0).getMiddle(), is("I. Introduction")); + assertThat(pairs.get(1).getLeft(), is("italic")); + assertThat(pairs.get(1).getMiddle(), is("1.1. Généralités et rappels")); + } + + @Test + public void testGetSectionNumber_simple_ShouldWork() throws Exception { + String text = "3 Supercon 2"; + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + currentParagraphTokens.get(4).setSubscript(true); + Pair, String> sectionNumber = new TEIFormatter(null, null) + .getSectionNumber(currentParagraphTokens); + + String output = LayoutTokensUtil.toText(sectionNumber.getLeft()); + assertThat(output, is("Supercon 2")); + assertThat(sectionNumber.getRight(), is("3")); + } + + @Test + public void testGetSectionNumber_doubleSpace_ShouldWork() throws Exception { + String text = "3 Supercon 2"; + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build(); + List currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + + currentParagraphTokens.get(6).setSubscript(true); + Pair, String> sectionNumber = new TEIFormatter(null, null) + .getSectionNumber(currentParagraphTokens); + + String output = LayoutTokensUtil.toText(sectionNumber.getLeft()); + assertThat(output, is("Supercon 2")); + assertThat(sectionNumber.getRight(), is("3")); + } + } \ No newline at end of file