From 140c7ec088a758a9a885e64b71580a3eb1a40a5c Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Fri, 22 Jul 2022 11:44:25 +0900
Subject: [PATCH 01/23] implement style for paragraphs

---
 .../grobid/core/document/TEIFormatter.java    | 213 ++++++++++++++++--
 .../core/document/TEIFormatterTest.java       | 105 ++++++++-
 2 files changed, 289 insertions(+), 29 deletions(-)
diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index ff5a3783be..5a273b4d50 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1,8 +1,11 @@
 package org.grobid.core.document;
 
 import com.google.common.base.Joiner;
+import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
+import org.apache.commons.collections4.CollectionUtils;
 import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.lang3.tuple.Triple;
 import org.apache.commons.lang3.StringUtils;
 
 import nu.xom.Attribute;
@@ -18,12 +21,11 @@
 import org.grobid.core.engines.FullTextParser;
 import org.grobid.core.engines.label.SegmentationLabels;
 import org.grobid.core.engines.config.GrobidAnalysisConfig;
-import org.grobid.core.engines.counters.ReferenceMarkerMatcherCounters;
 import org.grobid.core.engines.label.TaggingLabel;
 import org.grobid.core.engines.label.TaggingLabels;
 import org.grobid.core.exceptions.GrobidException;
 import org.grobid.core.lang.Language;
-import org.grobid.core.utilities.SentenceUtilities;
+import org.grobid.core.utilities.*;
 import org.grobid.core.layout.BoundingBox;
 import org.grobid.core.layout.GraphicObject;
 import org.grobid.core.layout.LayoutToken;
@@ -31,8 +33,6 @@
 import org.grobid.core.layout.Page;
 import org.grobid.core.tokenization.TaggingTokenCluster;
 import org.grobid.core.tokenization.TaggingTokenClusteror;
-import org.grobid.core.utilities.*;
-import org.grobid.core.utilities.counters.CntManager;
 import org.grobid.core.utilities.matching.EntityMatcherException;
 import org.grobid.core.utilities.matching.ReferenceMarkerMatcher;
 import org.grobid.core.engines.citations.CalloutAnalyzer.MarkerType;
@@ -45,7 +45,6 @@
 import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-import java.io.*;
 
 import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
 import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId;
@@ -1155,16 +1154,24 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
             TaggingLabel clusterLabel = cluster.getTaggingLabel();
             Engine.getCntManager().i(clusterLabel);
             if (clusterLabel.equals(TaggingLabels.SECTION)) {
-                String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
+                List<LayoutToken> dehyphenized = LayoutTokensUtil.dehyphenize(cluster.concatTokens());
+                String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " ");
+
                 curDiv = teiElement("div");
                 Element head = teiElement("head");
                 // section numbers
-                org.grobid.core.utilities.Pair<String, String> numb = getSectionNumber(clusterContent);
+                Pair<List<LayoutToken>, String> numb = getSectionNumber(dehyphenized);
                 if (numb != null) {
-                    head.addAttribute(new Attribute("n", numb.b));
-                    head.appendChild(numb.a);
+                    head.addAttribute(new Attribute("n", numb.getRight()));
+                    dehyphenized = numb.getLeft();
+                    text = LayoutTokensUtil.toText(dehyphenized);
+                }
+                List<Triple<String, String, OffsetPosition>> stylesList = extractStylesList(dehyphenized);
+
+                if (CollectionUtils.isNotEmpty(stylesList)) {
+                    applyStyleList(head, text, stylesList);
                 } else {
-                    head.appendChild(clusterContent);
+                    head.appendChild(text);
                 }
 
                 if (config.isGenerateTeiIds()) {
@@ -1173,10 +1180,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                 }
 
                 if (config.isGenerateTeiCoordinates("head") ) {
-                    String coords = LayoutTokensUtil.getCoordsString(cluster.concatTokens());
-                    if (coords != null) {
-                        head.addAttribute(new Attribute("coords", coords));
-                    }
+                    head.addAttribute(new Attribute("coords", LayoutTokensUtil.getCoordsString(cluster.concatTokens())));
                 }
 
                 curDiv.appendChild(head);
@@ -1185,7 +1189,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                     clusterLabel.equals(TaggingLabels.EQUATION_LABEL)) {
                 // get starting position of the cluster
                 int start = -1;
-                if ( (cluster.concatTokens() != null) && (cluster.concatTokens().size() > 0) ) {
+                if ( CollectionUtils.isNotEmpty(cluster.concatTokens()) ) {
                     start = cluster.concatTokens().get(0).getOffset();
                 }
                 // get the corresponding equation
@@ -1210,9 +1214,17 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                     }
                 }
             } else if (clusterLabel.equals(TaggingLabels.ITEM)) {
-                String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
-                //curDiv.appendChild(teiElement("item", clusterContent));
-                Element itemNode = teiElement("item", clusterContent);
+                String text = LayoutTokensUtil.toText(cluster.concatTokens()).replace("\n", " ");
+                Element itemNode = teiElement("item");
+
+                List<Triple<String, String, OffsetPosition>> stylesList = extractStylesList(cluster.concatTokens());
+
+                if (CollectionUtils.isNotEmpty(stylesList)) {
+                    applyStyleList(itemNode, text, stylesList);
+                } else {
+                    itemNode.appendChild(text);
+                }
+
                 if (!MARKER_LABELS.contains(lastClusterLabel) && (lastClusterLabel != TaggingLabels.ITEM)) {
                     curList = teiElement("list");
                     curDiv.appendChild(curList);
@@ -1230,7 +1242,9 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                 }
                 curDiv.appendChild(note);
             } else if (clusterLabel.equals(TaggingLabels.PARAGRAPH)) {
-                String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
+                List<LayoutToken> dehyphenized = LayoutTokensUtil.dehyphenize(cluster.concatTokens());
+                String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " ");
+
                 if (isNewParagraph(lastClusterLabel, curParagraph)) {
                     if (curParagraph != null && config.isWithSentenceSegmentation()) {
                         segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
@@ -1243,7 +1257,14 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                     curDiv.appendChild(curParagraph);
                     curParagraphTokens = new ArrayList<>();
                 }
-                curParagraph.appendChild(clusterContent);
+
+                List<Triple<String, String, OffsetPosition>> stylesList = extractStylesList(dehyphenized);
+
+                if (CollectionUtils.isEmpty(stylesList)) {
+
+                } else {
+                    applyStyleList(curParagraph, text, stylesList);
+                }
                 curParagraphTokens.addAll(cluster.concatTokens());
             } else if (MARKER_LABELS.contains(clusterLabel)) {
                 List<LayoutToken> refTokens = cluster.concatTokens();
@@ -1356,6 +1377,32 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
         return buffer;
     }
 
+    private Element applyStyleList(Element paragraphElem, String paragraphText, List<Triple<String, String, OffsetPosition>> stylesList) {
+//        if (CollectionUtils.isEmpty(stylesList)) {
+//            paragraphElem.appendChild(StringUtils.normalizeSpace(paragraphText));
+//            return paragraphElem;
+//        }
+
+        int lastPosition = 0;
+        for (Triple<String, String, OffsetPosition> style : stylesList) {
+            OffsetPosition offsetStyle = style.getRight();
+            String subString = paragraphText.substring(lastPosition, offsetStyle.start);
+            String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : "";
+            String suffixSpace = StringUtils.endsWith(subString, " ") ? " " : "";
+            paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString) + suffixSpace);
+            Element rend = teiElement("hi");
+            rend.addAttribute(new Attribute("rend", style.getLeft()));
+            rend.appendChild(StringUtils.normalizeSpace(paragraphText.substring(offsetStyle.start, offsetStyle.end)));
+            lastPosition = offsetStyle.end;
+            paragraphElem.appendChild(rend);
+        }
+        String subString = paragraphText.substring(lastPosition);
+        String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : "";
+        paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString));
+
+        return paragraphElem;
+    }
+
     public static boolean isNewParagraph(TaggingLabel lastClusterLabel, Element curParagraph) {
         return (!MARKER_LABELS.contains(lastClusterLabel) && lastClusterLabel != TaggingLabels.FIGURE
                 && lastClusterLabel != TaggingLabels.TABLE) || curParagraph == null;
@@ -1537,7 +1584,103 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
             }
         }
 
-    }   
+    }
+
+    private List<List<LayoutToken>> segmentLayoutTokenLists(List<LayoutToken> curParagraphTokens, String text, List<OffsetPosition> sentencesOffsetPosition) {
+        int pos;
+        List<List<LayoutToken>> segmentedParagraphTokens = new ArrayList<>();
+        List<LayoutToken> currentSentenceTokens = new ArrayList<>();
+        pos = 0;
+
+        int currentSentenceIndex = 0;
+//System.out.println(text);
+//System.out.println("theSentences.size(): " + theSentences.size());
+        String sentenceChunk = text.substring(sentencesOffsetPosition.get(currentSentenceIndex).start,
+            sentencesOffsetPosition.get(currentSentenceIndex).end);
+
+        for (LayoutToken token : curParagraphTokens) {
+            if (StringUtils.isEmpty(token.getText()))
+                continue;
+
+            int newPos = sentenceChunk.indexOf(token.getText(), pos);
+            if ((newPos != -1) || SentenceUtilities.toSkipToken(token.getText())) {
+                // just move on
+                currentSentenceTokens.add(token);
+                if (newPos != -1 && !SentenceUtilities.toSkipToken(token.getText()))
+                    pos = newPos;
+            } else {
+                if (currentSentenceTokens.size() > 0) {
+                    segmentedParagraphTokens.add(currentSentenceTokens);
+                    currentSentenceIndex++;
+                    if (currentSentenceIndex >= sentencesOffsetPosition.size()) {
+                        currentSentenceTokens = new ArrayList<>();
+                        break;
+                    }
+                    sentenceChunk = text.substring(sentencesOffsetPosition.get(currentSentenceIndex).start, sentencesOffsetPosition.get(currentSentenceIndex).end);
+                }
+                currentSentenceTokens = new ArrayList<>();
+                currentSentenceTokens.add(token);
+                pos = 0;
+            }
+
+            if (currentSentenceIndex >= sentencesOffsetPosition.size())
+                break;
+        }
+        // last sentence
+        if (currentSentenceTokens.size() > 0) {
+            // check sentence index too ?
+            segmentedParagraphTokens.add(currentSentenceTokens);
+        }
+        return segmentedParagraphTokens;
+    }
+
+    protected List<Triple<String, String, OffsetPosition>> extractStylesList(List<LayoutToken> tokenList) {
+        List<Triple<String, String, OffsetPosition>> styleList = new ArrayList<>();
+        String previousStyleName = "";
+        StringBuilder temporaryText = new StringBuilder();
+        StringBuilder value = new StringBuilder();
+
+        for (int index = 0; index < tokenList.size(); index++) {
+            LayoutToken token = tokenList.get(index);
+            int startOffset = temporaryText.toString().length();
+            temporaryText.append(token.getText());
+            int endOffset = temporaryText.toString().length();
+
+            StringBuilder styleName = new StringBuilder();
+            if (token.isBold()) {
+                styleName.append("bold").append(" ");
+            }
+
+            if (token.isItalic()) {
+                styleName.append("italic").append(" ");
+            }
+
+            if(token.isSuperscript()) {
+                styleName.append("superscript");
+            } else if(token.isSubscript()) {
+                styleName.append("subscript");
+            }
+
+            String styleNameTrimmed = StringUtils.trim(styleName.toString());
+            value.append(token.getText());
+
+            if (StringUtils.isEmpty(styleNameTrimmed)) {
+                previousStyleName = styleNameTrimmed;
+                value = new StringBuilder();
+                continue;
+            }
+
+            if (styleNameTrimmed.equals(previousStyleName)) {
+                Iterables.getLast(styleList).getRight().end = endOffset;
+            } else {
+                styleList.add(Triple.of(styleNameTrimmed, value.toString(), new OffsetPosition(startOffset, endOffset)));
+            }
+
+            previousStyleName = styleNameTrimmed;
+        }
+
+        return styleList;
+    }
 
     /**
      * Return the graphic objects in a given interval position in the document.
@@ -1555,26 +1698,46 @@ private List<GraphicObject> getGraphicObject(List<GraphicObject> graphicObjects,
         return result;
     }
 
-    private org.grobid.core.utilities.Pair<String, String> getSectionNumber(String text) {
+    protected Pair<List<LayoutToken>, String> getSectionNumber(List<LayoutToken> tokens) {
+
+        String text = LayoutTokensUtil.toText(tokens);
+
         Matcher m1 = BasicStructureBuilder.headerNumbering1.matcher(text);
         Matcher m2 = BasicStructureBuilder.headerNumbering2.matcher(text);
         Matcher m3 = BasicStructureBuilder.headerNumbering3.matcher(text);
         Matcher m = null;
+        OffsetPosition position = null;
         String numb = null;
         if (m1.find()) {
             numb = m1.group(0);
+            position = new OffsetPosition(m1.start(), m1.end());
             m = m1;
         } else if (m2.find()) {
             numb = m2.group(0);
+            position = new OffsetPosition(m2.start(), m2.end());
             m = m2;
         } else if (m3.find()) {
             numb = m3.group(0);
+            position = new OffsetPosition(m3.start(), m3.end());
             m = m3;
         }
         if (numb != null) {
-            text = text.replace(numb, "").trim();
+            int lastPosition = 0;
+            StringBuilder acc = new StringBuilder();
+            List<LayoutToken> tokensWithoutSectionNumbers = new ArrayList<>();
+            for (int idx=0; idx < tokens.size(); idx++) {
+                if (!(lastPosition >= position.start && lastPosition < position.end )) {
+                    if (!(tokensWithoutSectionNumbers.size() == 0 && tokens.get(idx).getText().equals(" "))) {
+                        //adding a space at the beginning of the accumulator should be ignored
+                        tokensWithoutSectionNumbers.add(tokens.get(idx));
+                    }
+                }
+                acc.append(tokens.get(idx).getText());
+                lastPosition = acc.toString().length();
+            }
+
             numb = numb.replace(" ", "");
-            return new org.grobid.core.utilities.Pair<>(text, numb);
+            return Pair.of(tokensWithoutSectionNumbers, numb);
         } else {
             return null;
         }
@@ -1640,7 +1803,7 @@ public List<Node> markReferencesTEILuceneBased(List<LayoutToken> refTokens,
         if ( (refTokens == null) || (refTokens.size() == 0) ) 
             return null;
         String text = LayoutTokensUtil.toText(refTokens);
-        if (text == null || text.trim().length() == 0 || text.endsWith("</ref>") || text.startsWith("<ref") || markerMatcher == null)
+        if (StringUtils.isBlank(text) || text.endsWith("</ref>") || text.startsWith("<ref") || markerMatcher == null)
             return Collections.<Node>singletonList(new Text(text));
 
         boolean spaceEnd = false;
diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
index bd4d3d118a..b7a1052538 100644
--- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
@@ -1,15 +1,22 @@
 package org.grobid.core.document;
 
+import nu.xom.Element;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.lang3.tuple.Triple;
+import org.grobid.core.analyzers.GrobidAnalyzer;
+import org.grobid.core.document.xml.XmlBuilderUtils;
+import org.grobid.core.engines.config.GrobidAnalysisConfig;
+import org.grobid.core.layout.LayoutToken;
 import org.grobid.core.utilities.GrobidProperties;
+import org.grobid.core.utilities.LayoutTokensUtil;
+import org.grobid.core.utilities.OffsetPosition;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
+import java.util.List;
 
 import static org.hamcrest.CoreMatchers.is;
-import static org.hamcrest.core.IsNull.notNullValue;
-import static org.hamcrest.core.IsNull.nullValue;
+import static org.hamcrest.Matchers.hasSize;
 import static org.junit.Assert.assertThat;
 
 public class TEIFormatterTest {
@@ -19,4 +26,94 @@ public static void setInitialContext() throws Exception {
         GrobidProperties.getInstance();
     }
 
+    @Test
+    public void testSegmentIntoSentences_simpleText_ShouldSplitIntoSentencesAndAddSTag() throws Exception {
+        String text = "One sentence. Second sentence.";
+
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
+        List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+        Element currentParagraph = XmlBuilderUtils.teiElement("p");
+        currentParagraph.appendChild(text);
+
+        new TEIFormatter(null, null)
+            .segmentIntoSentences(currentParagraph, currentParagraphTokens, config, "en");
+
+        assertThat(currentParagraph.toXML(), is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s>One sentence.</s><s>Second sentence.</s></p>"));
+        assertThat(currentParagraph.getChildElements().size(), is(2));
+    }
+
+    @Test
+    public void testSegmentIntoSentences_Bold_ShouldSplitIntoSentencesAndAddSTag() throws Exception {
+        String text = "One sentence. Second sentence.";
+
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
+        List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+        currentParagraphTokens.get(0).setBold(true);
+        currentParagraphTokens.get(2).setBold(true);
+        currentParagraphTokens.get(2).setItalic(true);
+        Element currentParagraph = XmlBuilderUtils.teiElement("p");
+        currentParagraph.appendChild(text);
+
+        new TEIFormatter(null, null)
+            .segmentIntoSentences(currentParagraph, currentParagraphTokens, config, "en");
+
+        assertThat(currentParagraph.toXML(), is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s>One sentence.</s><s>Second sentence.</s></p>"));
+        assertThat(currentParagraph.getChildElements().size(), is(2));
+    }
+
+    @Test
+    public void testExtractStylesList_1_shouldWork() throws Exception {
+        String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure ";
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
+        List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+
+        currentParagraphTokens.get(26).setSubscript(true);
+        currentParagraphTokens.get(30).setSubscript(true);
+
+        List<Triple<String, String, OffsetPosition>> pairs = new TEIFormatter(null, null).extractStylesList(currentParagraphTokens);
+
+        assertThat(pairs, hasSize(2));
+        assertThat(pairs.get(0).getLeft(), is("subscript"));
+        assertThat(pairs.get(0).getMiddle(), is("2"));
+        assertThat(pairs.get(0).getRight().start, is(79));
+        assertThat(pairs.get(0).getRight().end, is(80));
+
+        assertThat(pairs.get(1).getLeft(), is("subscript"));
+        assertThat(pairs.get(1).getMiddle(), is("14"));
+        assertThat(pairs.get(1).getRight().start, is(84));
+        assertThat(pairs.get(1).getRight().end, is(86));
+
+
+    }
+
+    @Test
+    public void testGetSectionNumber_simple_ShouldWork() throws Exception {
+        String text = "3 Supercon 2";
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
+        List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+
+        currentParagraphTokens.get(4).setSubscript(true);
+        Pair<List<LayoutToken>, String> sectionNumber = new TEIFormatter(null, null)
+            .getSectionNumber(currentParagraphTokens);
+
+        String output = LayoutTokensUtil.toText(sectionNumber.getLeft());
+        assertThat(output, is("Supercon 2"));
+        assertThat(sectionNumber.getRight(), is("3"));
+    }
+
+    @Test
+    public void testGetSectionNumber_doubleSpace_ShouldWork() throws Exception {
+        String text = "3   Supercon 2";
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
+        List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+
+        currentParagraphTokens.get(6).setSubscript(true);
+        Pair<List<LayoutToken>, String> sectionNumber = new TEIFormatter(null, null)
+            .getSectionNumber(currentParagraphTokens);
+
+        String output = LayoutTokensUtil.toText(sectionNumber.getLeft());
+        assertThat(output, is("Supercon 2"));
+        assertThat(sectionNumber.getRight(), is("3"));
+    }
+
 }
\ No newline at end of file

From e3986861b88c584267a8a149b4092546ed0324ff Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Fri, 22 Jul 2022 13:59:54 +0900
Subject: [PATCH 02/23] correct missing paragraphs

---
 .../org/grobid/core/document/TEIFormatter.java   | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 5a273b4d50..197828c3d4 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1171,7 +1171,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                 if (CollectionUtils.isNotEmpty(stylesList)) {
                     applyStyleList(head, text, stylesList);
                 } else {
-                    head.appendChild(text);
+                    head.appendChild(StringUtils.normalizeSpace(text.replace("\n", " ")));
                 }
 
                 if (config.isGenerateTeiIds()) {
@@ -1222,7 +1222,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                 if (CollectionUtils.isNotEmpty(stylesList)) {
                     applyStyleList(itemNode, text, stylesList);
                 } else {
-                    itemNode.appendChild(text);
+                    itemNode.appendChild(StringUtils.normalizeSpace(text));
                 }
 
                 if (!MARKER_LABELS.contains(lastClusterLabel) && (lastClusterLabel != TaggingLabels.ITEM)) {
@@ -1260,10 +1260,10 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
 
                 List<Triple<String, String, OffsetPosition>> stylesList = extractStylesList(dehyphenized);
 
-                if (CollectionUtils.isEmpty(stylesList)) {
-
-                } else {
+                if (CollectionUtils.isNotEmpty(stylesList)) {
                     applyStyleList(curParagraph, text, stylesList);
+                } else {
+                    curParagraph.appendChild(StringUtils.normalizeSpace(text));
                 }
                 curParagraphTokens.addAll(cluster.concatTokens());
             } else if (MARKER_LABELS.contains(clusterLabel)) {
@@ -1389,16 +1389,16 @@ private Element applyStyleList(Element paragraphElem, String paragraphText, List
             String subString = paragraphText.substring(lastPosition, offsetStyle.start);
             String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : "";
             String suffixSpace = StringUtils.endsWith(subString, " ") ? " " : "";
-            paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString) + suffixSpace);
+            paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " ")) + suffixSpace);
             Element rend = teiElement("hi");
             rend.addAttribute(new Attribute("rend", style.getLeft()));
-            rend.appendChild(StringUtils.normalizeSpace(paragraphText.substring(offsetStyle.start, offsetStyle.end)));
+            rend.appendChild(StringUtils.normalizeSpace(paragraphText.substring(offsetStyle.start, offsetStyle.end).replace("\n", " ")));
             lastPosition = offsetStyle.end;
             paragraphElem.appendChild(rend);
         }
         String subString = paragraphText.substring(lastPosition);
         String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : "";
-        paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString));
+        paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " ")));
 
         return paragraphElem;
     }

From e6ba12be4d28e699de745c1013189a8ca356face Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Fri, 22 Jul 2022 15:21:50 +0900
Subject: [PATCH 03/23] add style to figure and table caption

---
 .../java/org/grobid/core/data/Figure.java     | 22 +++++++++++++------
 .../main/java/org/grobid/core/data/Table.java | 17 ++++++++++++--
 .../grobid/core/document/TEIFormatter.java    |  5 ++---
 .../core/document/xml/XmlBuilderUtils.java    |  2 +-
 4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java
index e9417e9217..b4784a8e70 100644
--- a/grobid-core/src/main/java/org/grobid/core/data/Figure.java
+++ b/grobid-core/src/main/java/org/grobid/core/data/Figure.java
@@ -5,6 +5,7 @@
 import com.google.common.collect.Lists;
 import com.google.common.base.Joiner;
 
+import org.apache.commons.lang3.tuple.Triple;
 import org.grobid.core.GrobidModels;
 import org.apache.commons.collections4.CollectionUtils;
 import org.apache.commons.lang3.StringUtils;
@@ -17,12 +18,9 @@
 import org.grobid.core.layout.GraphicObjectType;
 import org.grobid.core.layout.LayoutToken;
 import org.grobid.core.layout.VectorGraphicBoxCalculator;
-import org.grobid.core.utilities.BoundingBoxCalculator;
-import org.grobid.core.utilities.LayoutTokensUtil;
-import org.grobid.core.utilities.TextUtilities;
+import org.grobid.core.utilities.*;
 import org.grobid.core.tokenization.TaggingTokenCluster;
 import org.grobid.core.tokenization.TaggingTokenClusteror;
-import org.grobid.core.utilities.KeyGen;
 import org.grobid.core.engines.label.TaggingLabels;
 import org.grobid.core.engines.label.TaggingLabel;
 import org.grobid.core.engines.citations.CalloutAnalyzer.MarkerType;
@@ -41,6 +39,8 @@
 import java.util.SortedSet;
 import java.util.Collections;
 
+import static org.grobid.core.document.TEIFormatter.applyStyleList;
+import static org.grobid.core.document.TEIFormatter.extractStylesList;
 import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
 import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId;
 import static org.grobid.core.document.xml.XmlBuilderUtils.textNode;
@@ -388,7 +388,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
 
             // if the segment has been parsed with the full text model we further extract the clusters
             // to get the bibliographical references
-            if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) {
+            if (StringUtils.isNotEmpty(labeledCaption))  {
                 TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
                 List<TaggingTokenCluster> clusters = clusteror.cluster();
                 
@@ -404,7 +404,9 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
 
                     TaggingLabel clusterLabel = cluster.getTaggingLabel();
                     //String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
-                    String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
+                    List<LayoutToken> dehyphenized = LayoutTokensUtil.dehyphenize(cluster.concatTokens());
+                    String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " ");
+
                     if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) {
                         try {
                             List<Node> refNodes = formatter.markReferencesTEILuceneBased(
@@ -422,7 +424,13 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
                             LOGGER.warn("Problem when serializing TEI fragment for figure caption", e);
                         }
                     } else {
-                        desc.appendChild(textNode(clusterContent));
+                        List<Triple<String, String, OffsetPosition>> stylesList = extractStylesList(dehyphenized);
+
+                        if (CollectionUtils.isNotEmpty(stylesList)) {
+                            applyStyleList(desc, text, stylesList);
+                        } else {
+                            desc.appendChild(StringUtils.normalizeSpace(text));
+                        }
                     }
                 }
             } else {
diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java
index 95b0bf8704..9a9aa6cf3e 100644
--- a/grobid-core/src/main/java/org/grobid/core/data/Table.java
+++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java
@@ -1,5 +1,7 @@
 package org.grobid.core.data;
 
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.commons.lang3.tuple.Triple;
 import org.grobid.core.GrobidModels;
 import org.apache.commons.lang3.StringUtils;
 import org.grobid.core.data.table.Cell;
@@ -15,6 +17,7 @@
 import org.grobid.core.layout.LayoutToken;
 import org.grobid.core.utilities.BoundingBoxCalculator;
 import org.grobid.core.utilities.LayoutTokensUtil;
+import org.grobid.core.utilities.OffsetPosition;
 import org.grobid.core.utilities.counters.CntManager;
 import org.grobid.core.engines.counters.TableRejectionCounters;
 import org.grobid.core.tokenization.TaggingTokenCluster;
@@ -32,6 +35,8 @@
 import nu.xom.Node;
 import nu.xom.Text;
 
+import static org.grobid.core.document.TEIFormatter.applyStyleList;
+import static org.grobid.core.document.TEIFormatter.extractStylesList;
 import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
 import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId;
 import static org.grobid.core.document.xml.XmlBuilderUtils.textNode;
@@ -119,7 +124,9 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
 
                     TaggingLabel clusterLabel = cluster.getTaggingLabel();
                     //String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
-                    String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
+                    List<LayoutToken> dehyphenized = LayoutTokensUtil.dehyphenize(cluster.concatTokens());
+                    String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " ");
+
                     if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) {
                         try {
                             List<Node> refNodes = formatter.markReferencesTEILuceneBased(
@@ -137,7 +144,13 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
                             LOGGER.warn("Problem when serializing TEI fragment for table caption", e);
                         }
                     } else {
-                        desc.appendChild(textNode(clusterContent));
+                        List<Triple<String, String, OffsetPosition>> stylesList = extractStylesList(dehyphenized);
+
+                        if (CollectionUtils.isNotEmpty(stylesList)) {
+                            applyStyleList(desc, text, stylesList);
+                        } else {
+                            desc.appendChild(StringUtils.normalizeSpace(text));
+                        }
                     }
 
                     if (desc != null && config.isWithSentenceSegmentation()) {
diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 197828c3d4..1b7c12dacd 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1377,7 +1377,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
         return buffer;
     }
 
-    private Element applyStyleList(Element paragraphElem, String paragraphText, List<Triple<String, String, OffsetPosition>> stylesList) {
+    public static Element applyStyleList(Element paragraphElem, String paragraphText, List<Triple<String, String, OffsetPosition>> stylesList) {
 //        if (CollectionUtils.isEmpty(stylesList)) {
 //            paragraphElem.appendChild(StringUtils.normalizeSpace(paragraphText));
 //            return paragraphElem;
@@ -1515,7 +1515,6 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
     k++;
 }
 }*/
-        }
 
         // update the xml paragraph element
         int currenChildIndex = 0;
@@ -1634,7 +1633,7 @@ private List<List<LayoutToken>> segmentLayoutTokenLists(List<LayoutToken> curPar
         return segmentedParagraphTokens;
     }
 
-    protected List<Triple<String, String, OffsetPosition>> extractStylesList(List<LayoutToken> tokenList) {
+    public static List<Triple<String, String, OffsetPosition>> extractStylesList(List<LayoutToken> tokenList) {
         List<Triple<String, String, OffsetPosition>> styleList = new ArrayList<>();
         String previousStyleName = "";
         StringBuilder temporaryText = new StringBuilder();
diff --git a/grobid-core/src/main/java/org/grobid/core/document/xml/XmlBuilderUtils.java b/grobid-core/src/main/java/org/grobid/core/document/xml/XmlBuilderUtils.java
index 5d4850f94e..0c549078df 100644
--- a/grobid-core/src/main/java/org/grobid/core/document/xml/XmlBuilderUtils.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/xml/XmlBuilderUtils.java
@@ -102,7 +102,7 @@ public static void main(String[] args) throws ParsingException, IOException {
     }
 
     public static String stripNonValidXMLCharacters(String in) {
-        StringBuffer out = new StringBuffer(); // Used to hold the output.
+        StringBuilder out = new StringBuilder(); // Used to hold the output.
         char current; // Used to reference the current character.
 
         if (in == null || ("".equals(in))) 

From 17a914666bf24732454ae6f8d58eb279d5a54b2e Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Fri, 22 Jul 2022 15:38:34 +0900
Subject: [PATCH 04/23] add style to title

---
 .../grobid/core/document/TEIFormatter.java    | 28 ++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 1b7c12dacd..1b30dac357 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -155,7 +155,33 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
         }
 
         if (biblio.getTitle() != null) {
-            tei.append(TextUtilities.HTMLEncode(biblio.getTitle()));
+            // Rebuild the title from its layout tokens so that styled runs are wrapped in <hi rend="...">.
+            List<LayoutToken> layoutTokens = biblio.getLayoutTokens(TaggingLabels.HEADER_TITLE);
+            String text = LayoutTokensUtil.toText(layoutTokens).replace("\n", " ");
+            List<Triple<String, String, OffsetPosition>> stylesList = extractStylesList(layoutTokens);
+            // NOTE(review): assumes getLayoutTokens(HEADER_TITLE) is non-null whenever the title is set - confirm
+            if (CollectionUtils.isNotEmpty(stylesList)) {
+                // tei is a raw XML string buffer: each text segment must be escaped, as in the unstyled branch
+                int lastPosition = 0;
+                for (Triple<String, String, OffsetPosition> style : stylesList) {
+                    OffsetPosition offsetStyle = style.getRight();
+                    String subString = text.substring(lastPosition, offsetStyle.start);
+                    String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : "";
+                    String suffixSpace = StringUtils.endsWith(subString, " ") ? " " : "";
+                    tei.append(prefixSpace + TextUtilities.HTMLEncode(StringUtils.normalizeSpace(subString.replace("\n", " "))) + suffixSpace);
+                    tei.append("<hi rend=\"").append(style.getLeft()).append("\"").append(">")
+                        .append(TextUtilities.HTMLEncode(StringUtils.normalizeSpace(text.substring(offsetStyle.start, offsetStyle.end).replace("\n", " "))))
+                        .append("</hi>");
+                    lastPosition = offsetStyle.end;
+                }
+                // plain text remaining after the last styled run
+                String subString = text.substring(lastPosition);
+                String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : "";
+                tei.append(prefixSpace + TextUtilities.HTMLEncode(StringUtils.normalizeSpace(subString.replace("\n", " "))));
+            } else {
+                String title = biblio.getTitle();
+                tei.append(TextUtilities.HTMLEncode(title));
+            }
         }
 
         tei.append("</title>\n\t\t\t</titleStmt>\n");

From cc3a0e5494ace00b71ef1fe2ee6a5cda5c4c5d53 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Mon, 25 Jul 2022 08:48:10 +0900
Subject: [PATCH 05/23] wrongly inverted if

---
 .../src/main/java/org/grobid/core/document/TEIFormatter.java    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 1b30dac357..dd7f1959f0 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1215,7 +1215,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                     clusterLabel.equals(TaggingLabels.EQUATION_LABEL)) {
                 // get starting position of the cluster
                 int start = -1;
-                if ( CollectionUtils.isEmpty(cluster.concatTokens()) ) {
+                if ( CollectionUtils.isNotEmpty(cluster.concatTokens()) ) {
                     start = cluster.concatTokens().get(0).getOffset();
                 }
                 // get the corresponding equation

From d5ae544febd66739b48a847e1f9c64677467986f Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Mon, 25 Jul 2022 08:57:34 +0900
Subject: [PATCH 06/23] missing parenthesis

---
 .../java/org/grobid/core/document/TEIFormatter.java  | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index dd7f1959f0..8c6f37fd5c 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1486,7 +1486,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
         pos = 0;
         
         if (config.isGenerateTeiCoordinates("s")) {
-            
+
             int currentSentenceIndex = 0;
 //System.out.println(text);            
 //System.out.println("theSentences.size(): " + theSentences.size());
@@ -1494,7 +1494,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
 
             for(int i=0; i<curParagraphTokens.size(); i++) {
                 LayoutToken token = curParagraphTokens.get(i);
-                if (token.getText() == null || token.getText().length() == 0) 
+                if (token.getText() == null || token.getText().length() == 0)
                     continue;
                 int newPos = sentenceChunk.indexOf(token.getText(), pos);
                 if ((newPos != -1) || SentenceUtilities.toSkipToken(token.getText())) {
@@ -1516,7 +1516,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
                     currentSentenceTokens.add(token);
                     pos = 0;
                 }
-                
+
                 if (currentSentenceIndex >= theSentences.size())
                     break;
             }
@@ -1541,7 +1541,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
     k++;
 }
 }*/
-
+        }
         // update the xml paragraph element
         int currenChildIndex = 0;
         pos = 0;
@@ -1564,12 +1564,12 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
                     }
                 }
             }
-            
+
             int sentenceLength = theSentences.get(i).end - pos;
             // check if we have a ref between pos and pos+sentenceLength
             for(int j=refIndex; j<refPositions.size(); j++) {
                 int refPos = refPositions.get(j).intValue();
-                if (refPos < pos+posInSentence) 
+                if (refPos < pos+posInSentence)
                     continue;
 
                 if (refPos >= pos+posInSentence && refPos <= pos+sentenceLength) {

From 064c2f6bcee3978e9b7ba96cf824c8225f0a2ad8 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Mon, 25 Jul 2022 10:04:27 +0900
Subject: [PATCH 07/23] add decoration in equation/formula

---
 .../java/org/grobid/core/data/Equation.java   | 32 +++++++++--
 .../grobid/core/document/TEIFormatter.java    | 25 +++++----
 .../grobid/core/engines/FullTextParser.java   |  7 +--
 .../core/document/TEIFormatterTest.java       | 54 ++++++++++++++++++-
 4 files changed, 99 insertions(+), 19 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/data/Equation.java b/grobid-core/src/main/java/org/grobid/core/data/Equation.java
index 141660d848..753e3d5908 100644
--- a/grobid-core/src/main/java/org/grobid/core/data/Equation.java
+++ b/grobid-core/src/main/java/org/grobid/core/data/Equation.java
@@ -2,7 +2,9 @@
 
 import nu.xom.Attribute;
 import nu.xom.Element;
+import org.apache.commons.collections4.CollectionUtils;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.lang3.tuple.Triple;
 import org.grobid.core.document.xml.XmlBuilderUtils;
 import org.grobid.core.engines.Engine;
 import org.grobid.core.engines.config.GrobidAnalysisConfig;
@@ -10,13 +12,17 @@
 import org.grobid.core.layout.LayoutToken;
 import org.grobid.core.utilities.BoundingBoxCalculator;
 import org.grobid.core.utilities.LayoutTokensUtil;
+import org.grobid.core.utilities.OffsetPosition;
 import org.grobid.core.utilities.counters.CntManager;
 import org.grobid.core.utilities.TextUtilities;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.SortedSet;
 
+import static org.grobid.core.document.TEIFormatter.*;
+
 /**
  * Class for representing an equation.
  *
@@ -56,9 +62,15 @@ public Element toTEIElement(GrobidAnalysisConfig config) {
 			XmlBuilderUtils.addCoords(formulaElement, LayoutTokensUtil.getCoordsStringForOneBox(getLayoutTokens()));
 		}
 
-		formulaElement.appendChild(LayoutTokensUtil.normalizeText(content.toString()).trim());
+        List<Triple<String, String, OffsetPosition>> stylesList = extractStylesList(getContentTokens(), Arrays.asList(TEI_STYLE_BOLD_NAME, TEI_STYLE_ITALIC_NAME))   ;
+
+        if (CollectionUtils.isNotEmpty(stylesList)) {
+            applyStyleList(formulaElement, getContent(), stylesList);
+        } else {
+            formulaElement.appendChild(LayoutTokensUtil.normalizeText(content.toString()).trim());
+        }
 
-		if ( (label != null) && (label.length()>0) ) {
+		if ( StringUtils.isNotEmpty(label) ) {
 			Element labelEl = XmlBuilderUtils.teiElement("label",
     	    		LayoutTokensUtil.normalizeText(label.toString()));
 			formulaElement.appendChild(labelEl);
@@ -79,6 +91,16 @@ public List<LayoutToken> getContentTokens() {
 		return contentTokens;
 	}
 
+    public void addContentTokens(List<LayoutToken> tokens) {
+        if (tokens == null)
+            return;
+        // contentTokens may still be null at this point: create the list on first use
+        if (contentTokens == null)
+            contentTokens = new ArrayList<>();
+        // keep the tokens collected by earlier calls and append the new ones after them
+        contentTokens.addAll(tokens);
+    }
+
 	public List<LayoutToken> getLabelTokens() {
 		return labelTokens;
 	}
@@ -181,9 +203,9 @@ public void addLayoutTokens(List<LayoutToken> tokens) {
     	if (tokens == null)
     		return;
     	if (layoutTokens == null)
-    		layoutTokens = new ArrayList<LayoutToken>();
-    	for(LayoutToken token : tokens)
-	    	layoutTokens.add(token);
+    		layoutTokens = new ArrayList<>();
+
+        layoutTokens.addAll(tokens);
     }
 
     public List<BoundingBox> getCoordinates() {
diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 8c6f37fd5c..6845dccd96 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -57,6 +57,10 @@
 @SuppressWarnings("StringConcatenationInsideStringBuilderAppend")
 public class TEIFormatter {
     private static final Logger LOGGER = LoggerFactory.getLogger(TEIFormatter.class);
+    public static final String TEI_STYLE_ITALIC_NAME = "italic"; // style names are emitted as <hi rend="..."> values
+    public static final String TEI_STYLE_BOLD_NAME = "bold"; // final: shared constants must not be reassignable
+    public static final String TEI_STYLE_SUPERSCRIPT_NAME = "superscript";
+    public static final String TEI_STYLE_SUBSCRIPT_NAME = "subscript";
 
     private Document doc = null;
     private FullTextParser fullTextParser = null;
@@ -1221,7 +1225,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                 // get the corresponding equation
                 if (start != -1) {
                     Equation theEquation = null;
-                    if (equations != null) {
+                    if (CollectionUtils.isNotEmpty(equations)) {
                         for(int i=0; i<equations.size(); i++) {
                             if (i < equationIndex) 
                                 continue;
@@ -1660,6 +1664,9 @@ private List<List<LayoutToken>> segmentLayoutTokenLists(List<LayoutToken> curPar
     }
 
     public static List<Triple<String, String, OffsetPosition>> extractStylesList(List<LayoutToken> tokenList) {
+        return extractStylesList(tokenList, new ArrayList<>());
+    }
+    public static List<Triple<String, String, OffsetPosition>> extractStylesList(List<LayoutToken> tokenList, List<String> ignoreStyles) {
         List<Triple<String, String, OffsetPosition>> styleList = new ArrayList<>();
         String previousStyleName = "";
         StringBuilder temporaryText = new StringBuilder();
@@ -1672,18 +1679,18 @@ public static List<Triple<String, String, OffsetPosition>> extractStylesList(Lis
             int endOffset = temporaryText.toString().length();
 
             StringBuilder styleName = new StringBuilder();
-            if (token.isBold()) {
-                styleName.append("bold").append(" ");
+            if (token.isBold() && !ignoreStyles.contains(TEI_STYLE_BOLD_NAME)) {
+                styleName.append(TEI_STYLE_BOLD_NAME).append(" ");
             }
 
-            if (token.isItalic()) {
-                styleName.append("italic").append(" ");
+            if (token.isItalic() && !ignoreStyles.contains(TEI_STYLE_ITALIC_NAME)) {
+                styleName.append(TEI_STYLE_ITALIC_NAME).append(" ");
             }
 
-            if(token.isSuperscript()) {
-                styleName.append("superscript");
-            } else if(token.isSubscript()) {
-                styleName.append("subscript");
+            if(token.isSuperscript() && !ignoreStyles.contains(TEI_STYLE_SUPERSCRIPT_NAME)) {
+                styleName.append(TEI_STYLE_SUPERSCRIPT_NAME);
+            } else if(token.isSubscript() && !ignoreStyles.contains(TEI_STYLE_SUBSCRIPT_NAME)) {
+                styleName.append(TEI_STYLE_SUBSCRIPT_NAME);
             }
 
             String styleNameTrimmed = StringUtils.trim(styleName.toString());
diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
index 5febe8ea14..980c17a14c 100755
--- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
+++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
@@ -2272,7 +2272,7 @@ protected List<Equation> processEquations(String rese,
 			}
 
 			List<LayoutToken> tokenizationEquation = cluster.concatTokens();
-			String clusterContent = LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(cluster.concatTokens()));
+			String clusterContent = LayoutTokensUtil.toText(cluster.concatTokens());
 
 			if (currentResult == null)
 				currentResult = new Equation();
@@ -2288,10 +2288,11 @@ protected List<Equation> processEquations(String rese,
 					currentResult = new Equation();
 				}
 	            currentResult.appendContent(clusterContent);
-            	currentResult.addLayoutTokens(cluster.concatTokens());
+            	currentResult.addLayoutTokens(tokenizationEquation);
+            	currentResult.addContentTokens(tokenizationEquation);
             } else if (clusterLabel.equals(TaggingLabels.EQUATION_LABEL)) {
                 currentResult.appendLabel(clusterContent);
-	            currentResult.addLayoutTokens(cluster.concatTokens());
+	            currentResult.addLayoutTokens(tokenizationEquation);
             }
 
 			lastLabel = clusterLabel;
diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
index b7a1052538..e652846343 100644
--- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
@@ -13,8 +13,10 @@
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import java.util.Arrays;
 import java.util.List;
 
+import static org.grobid.core.document.TEIFormatter.TEI_STYLE_BOLD_NAME;
 import static org.hamcrest.CoreMatchers.is;
 import static org.hamcrest.Matchers.hasSize;
 import static org.junit.Assert.assertThat;
@@ -62,7 +64,7 @@ public void testSegmentIntoSentences_Bold_ShouldSplitIntoSentencesAndAddSTag() t
     }
 
     @Test
-    public void testExtractStylesList_1_shouldWork() throws Exception {
+    public void testExtractStylesList_single_shouldWork() throws Exception {
         String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure ";
         GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
         List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
@@ -70,7 +72,7 @@ public void testExtractStylesList_1_shouldWork() throws Exception {
         currentParagraphTokens.get(26).setSubscript(true);
         currentParagraphTokens.get(30).setSubscript(true);
 
-        List<Triple<String, String, OffsetPosition>> pairs = new TEIFormatter(null, null).extractStylesList(currentParagraphTokens);
+        List<Triple<String, String, OffsetPosition>> pairs = TEIFormatter.extractStylesList(currentParagraphTokens);
 
         assertThat(pairs, hasSize(2));
         assertThat(pairs.get(0).getLeft(), is("subscript"));
@@ -82,8 +84,56 @@ public void testExtractStylesList_1_shouldWork() throws Exception {
         assertThat(pairs.get(1).getMiddle(), is("14"));
         assertThat(pairs.get(1).getRight().start, is(84));
         assertThat(pairs.get(1).getRight().end, is(86));
+    }
+
+    @Test
+    public void testExtractStylesList_combined_shouldWork() throws Exception {
+        String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure ";
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
+        List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+
+        currentParagraphTokens.get(26).setSubscript(true);
+        currentParagraphTokens.get(26).setBold(true);
+        currentParagraphTokens.get(26).setItalic(true);
+        currentParagraphTokens.get(30).setSubscript(true);
+
+        List<Triple<String, String, OffsetPosition>> pairs = TEIFormatter.extractStylesList(currentParagraphTokens);
+
+        assertThat(pairs, hasSize(2));
+        assertThat(pairs.get(0).getLeft(), is("bold italic subscript"));
+        assertThat(pairs.get(0).getMiddle(), is("2"));
+        assertThat(pairs.get(0).getRight().start, is(79));
+        assertThat(pairs.get(0).getRight().end, is(80));
+
+        assertThat(pairs.get(1).getLeft(), is("subscript"));
+        assertThat(pairs.get(1).getMiddle(), is("14"));
+        assertThat(pairs.get(1).getRight().start, is(84));
+        assertThat(pairs.get(1).getRight().end, is(86));
+    }
+
+    @Test
+    public void testExtractStylesList_ignoreBold_shouldWork() throws Exception {
+        String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure ";
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
+        List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+
+        currentParagraphTokens.get(26).setSubscript(true);
+        currentParagraphTokens.get(26).setBold(true);
+        currentParagraphTokens.get(26).setItalic(true);
+        currentParagraphTokens.get(30).setSubscript(true);
 
+        List<Triple<String, String, OffsetPosition>> pairs = TEIFormatter.extractStylesList(currentParagraphTokens, Arrays.asList(TEI_STYLE_BOLD_NAME));
 
+        assertThat(pairs, hasSize(2));
+        assertThat(pairs.get(0).getLeft(), is("italic subscript"));
+        assertThat(pairs.get(0).getMiddle(), is("2"));
+        assertThat(pairs.get(0).getRight().start, is(79));
+        assertThat(pairs.get(0).getRight().end, is(80));
+
+        assertThat(pairs.get(1).getLeft(), is("subscript"));
+        assertThat(pairs.get(1).getMiddle(), is("14"));
+        assertThat(pairs.get(1).getRight().start, is(84));
+        assertThat(pairs.get(1).getRight().end, is(86));
     }
 
     @Test

From c5f607b33c64b2f5bff71fa862f146f01180f63b Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Mon, 25 Jul 2022 11:42:55 +0900
Subject: [PATCH 08/23] Fix spaces

---
 .../grobid/core/document/TEIFormatter.java    | 13 ++++--
 .../grobid/core/engines/FullTextParser.java   |  2 +
 .../core/document/TEIFormatterTest.java       | 46 ++++++++++++++++---
 3 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 6845dccd96..1a59321547 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1407,7 +1407,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
         return buffer;
     }
 
-    public static Element applyStyleList(Element paragraphElem, String paragraphText, List<Triple<String, String, OffsetPosition>> stylesList) {
+    public static Element applyStyleList(Element paragraphElem, String text, List<Triple<String, String, OffsetPosition>> stylesList) {
 //        if (CollectionUtils.isEmpty(stylesList)) {
 //            paragraphElem.appendChild(StringUtils.normalizeSpace(paragraphText));
 //            return paragraphElem;
@@ -1416,17 +1416,20 @@ public static Element applyStyleList(Element paragraphElem, String paragraphText
         int lastPosition = 0;
         for (Triple<String, String, OffsetPosition> style : stylesList) {
             OffsetPosition offsetStyle = style.getRight();
-            String subString = paragraphText.substring(lastPosition, offsetStyle.start);
+            String subString = text.substring(lastPosition, offsetStyle.start);
             String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : "";
-            String suffixSpace = StringUtils.endsWith(subString, " ") ? " " : "";
+            String suffixSpace = "";
+            if (subString.length() > 1) {
+                suffixSpace = StringUtils.endsWith(subString, " ") ? " " : "";
+            }
             paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " ")) + suffixSpace);
             Element rend = teiElement("hi");
             rend.addAttribute(new Attribute("rend", style.getLeft()));
-            rend.appendChild(StringUtils.normalizeSpace(paragraphText.substring(offsetStyle.start, offsetStyle.end).replace("\n", " ")));
+            rend.appendChild(StringUtils.normalizeSpace(text.substring(offsetStyle.start, offsetStyle.end).replace("\n", " ")));
             lastPosition = offsetStyle.end;
             paragraphElem.appendChild(rend);
         }
-        String subString = paragraphText.substring(lastPosition);
+        String subString = text.substring(lastPosition);
         String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : "";
         paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " ")));
 
diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
index 980c17a14c..4aaa818803 100755
--- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
+++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
@@ -2272,6 +2272,8 @@ protected List<Equation> processEquations(String rese,
 			}
 
 			List<LayoutToken> tokenizationEquation = cluster.concatTokens();
+            //LF: the normalisation was removed here to keep the content in sync with contentTokens.
+            //      Spaces are normalised anyway (StringUtils.normalizeSpace()) when building the XML
 			String clusterContent = LayoutTokensUtil.toText(cluster.concatTokens());
 
 			if (currentResult == null)
diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
index e652846343..bed87b58da 100644
--- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
@@ -13,6 +13,7 @@
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
@@ -33,7 +34,7 @@ public void testSegmentIntoSentences_simpleText_ShouldSplitIntoSentencesAndAddST
         String text = "One sentence. Second sentence.";
 
         GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
-        List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+        List<LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
         Element currentParagraph = XmlBuilderUtils.teiElement("p");
         currentParagraph.appendChild(text);
 
@@ -49,7 +50,7 @@ public void testSegmentIntoSentences_Bold_ShouldSplitIntoSentencesAndAddSTag() t
         String text = "One sentence. Second sentence.";
 
         GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
-        List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+        List<LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
         currentParagraphTokens.get(0).setBold(true);
         currentParagraphTokens.get(2).setBold(true);
         currentParagraphTokens.get(2).setItalic(true);
@@ -67,7 +68,7 @@ public void testSegmentIntoSentences_Bold_ShouldSplitIntoSentencesAndAddSTag() t
     public void testExtractStylesList_single_shouldWork() throws Exception {
         String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure ";
         GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
-        List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+        List<LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
 
         currentParagraphTokens.get(26).setSubscript(true);
         currentParagraphTokens.get(30).setSubscript(true);
@@ -86,11 +87,42 @@ public void testExtractStylesList_single_shouldWork() throws Exception {
         assertThat(pairs.get(1).getRight().end, is(86));
     }
 
+    @Test
+    public void applyStyleList_simpleStyles_shouldWork() throws Exception {
+        String text = "This is bold and italic.";
+        List<Triple<String, String, OffsetPosition>> styles = new ArrayList<>();
+        styles.add(Triple.of("bold", "bold", new OffsetPosition(8, 12)));
+        styles.add(Triple.of("italic", "italic", new OffsetPosition(17, 23)));
+        Element rootElement = XmlBuilderUtils.teiElement("p");
+        TEIFormatter.applyStyleList(rootElement, text, styles);
+
+        assertThat(rootElement.toXML(), is("<p xmlns=\"http://www.tei-c.org/ns/1.0\">This is " +
+            "<hi rend=\"bold\">bold</hi> and <hi rend=\"italic\">italic</hi>.</p>"));
+    }
+
+    @Test
+    public void applyStyleList_complexStyles_shouldWork() throws Exception {
+        String text = "This is bold and italic.";
+        List<Triple<String, String, OffsetPosition>> styles = new ArrayList<>();
+        styles.add(Triple.of("subscript", "is", new OffsetPosition(5, 7)));
+        styles.add(Triple.of("bold subscript", "bold", new OffsetPosition(8, 12)));
+        styles.add(Triple.of("italic superscript", "and", new OffsetPosition(13, 16)));
+        styles.add(Triple.of("italic", "italic", new OffsetPosition(17, 23)));
+        Element rootElement = XmlBuilderUtils.teiElement("p");
+        TEIFormatter.applyStyleList(rootElement, text, styles);
+
+        assertThat(rootElement.toXML(), is("<p xmlns=\"http://www.tei-c.org/ns/1.0\">This " +
+            "<hi rend=\"subscript\">is</hi> " +
+            "<hi rend=\"bold subscript\">bold</hi> " +
+            "<hi rend=\"italic superscript\">and</hi> " +
+            "<hi rend=\"italic\">italic</hi>.</p>"));
+    }
+
     @Test
     public void testExtractStylesList_combined_shouldWork() throws Exception {
         String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure ";
         GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
-        List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+        List<LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
 
         currentParagraphTokens.get(26).setSubscript(true);
         currentParagraphTokens.get(26).setBold(true);
@@ -115,7 +147,7 @@ public void testExtractStylesList_combined_shouldWork() throws Exception {
     public void testExtractStylesList_ignoreBold_shouldWork() throws Exception {
         String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure ";
         GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
-        List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+        List<LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
 
         currentParagraphTokens.get(26).setSubscript(true);
         currentParagraphTokens.get(26).setBold(true);
@@ -140,7 +172,7 @@ public void testExtractStylesList_ignoreBold_shouldWork() throws Exception {
     public void testGetSectionNumber_simple_ShouldWork() throws Exception {
         String text = "3 Supercon 2";
         GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
-        List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+        List<LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
 
         currentParagraphTokens.get(4).setSubscript(true);
         Pair<List<LayoutToken>, String> sectionNumber = new TEIFormatter(null, null)
@@ -155,7 +187,7 @@ public void testGetSectionNumber_simple_ShouldWork() throws Exception {
     public void testGetSectionNumber_doubleSpace_ShouldWork() throws Exception {
         String text = "3   Supercon 2";
         GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
-        List< LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+        List<LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
 
         currentParagraphTokens.get(6).setSubscript(true);
         Pair<List<LayoutToken>, String> sectionNumber = new TEIFormatter(null, null)

From 386e6b4e48ec8effaba43188870da16866c0afa7 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Mon, 25 Jul 2022 11:48:23 +0900
Subject: [PATCH 09/23] add comments

---
 .../java/org/grobid/core/document/TEIFormatter.java    | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 1a59321547..579ccb722c 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1407,6 +1407,10 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
         return buffer;
     }
 
+    /**
+     * Apply the styles as described in the stylesList.
+     * This method modifies the input paragraphElem.
+     */
     public static Element applyStyleList(Element paragraphElem, String text, List<Triple<String, String, OffsetPosition>> stylesList) {
 //        if (CollectionUtils.isEmpty(stylesList)) {
 //            paragraphElem.appendChild(StringUtils.normalizeSpace(paragraphText));
@@ -1669,6 +1673,12 @@ private List<List<LayoutToken>> segmentLayoutTokenLists(List<LayoutToken> curPar
     public static List<Triple<String, String, OffsetPosition>> extractStylesList(List<LayoutToken> tokenList) {
         return extractStylesList(tokenList, new ArrayList<>());
     }
+
+
+    /**
+     * Extracts the styles from the list of tokens. The additional parameter can ignore certain styles
+     * (e.g. to restrict only superscript/subscript when decorating formulas)
+     */
     public static List<Triple<String, String, OffsetPosition>> extractStylesList(List<LayoutToken> tokenList, List<String> ignoreStyles) {
         List<Triple<String, String, OffsetPosition>> styleList = new ArrayList<>();
         String previousStyleName = "";

From 44e70f383c721361e69de65b6f24e85bb39c817d Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Mon, 25 Jul 2022 15:06:46 +0900
Subject: [PATCH 10/23] add some more tests

---
 .../core/document/TEIFormatterTest.java       | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
index bed87b58da..9a64b8141f 100644
--- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
@@ -64,6 +64,34 @@ public void testSegmentIntoSentences_Bold_ShouldSplitIntoSentencesAndAddSTag() t
         assertThat(currentParagraph.getChildElements().size(), is(2));
     }
 
+    @Test
+    public void testSegmentIntoSentences_Bold_ShouldWork() throws Exception {
+        String text = "One sentence (Foppiano et al.). Second sentence (Lopez et al.). ";
+
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
+        List<LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+        currentParagraphTokens.get(0).setBold(true);
+        currentParagraphTokens.get(2).setBold(true);
+        currentParagraphTokens.get(2).setItalic(true);
+        Element currentParagraph = XmlBuilderUtils.teiElement("p");
+        currentParagraph.appendChild("One sentence");
+        currentParagraph.appendChild(" ");
+        currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Foppiano et al.)"));
+        currentParagraph.appendChild(". ");
+        currentParagraph.appendChild("Second sentence");
+        currentParagraph.appendChild(" ");
+        currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)"));
+        currentParagraph.appendChild(".");
+
+        System.out.println(currentParagraph.toXML());
+
+        new TEIFormatter(null, null)
+            .segmentIntoSentences(currentParagraph, currentParagraphTokens, config, "en");
+
+        assertThat(currentParagraph.toXML(),
+            is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s>One sentence <ref>(Foppiano et al.)</ref>.</s><s>Second sentence <ref>(Lopez et al.)</ref>.</s></p>"));
+    }
+
     @Test
     public void testExtractStylesList_single_shouldWork() throws Exception {
         String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure ";

From a090297f761ce35181cb8ad398e0c16872d28362 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Mon, 25 Jul 2022 15:22:10 +0900
Subject: [PATCH 11/23] some refactoring

---
 .../grobid/core/document/TEIFormatter.java    | 65 ++++---------------
 1 file changed, 13 insertions(+), 52 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 579ccb722c..aa98bd1019 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1454,7 +1454,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
 
         // in xom, the following gives all the text under the element, for the whole subtree
         String text = curParagraph.getValue();
-        if (text == null || text.length() == 0)
+        if (StringUtils.isEmpty(text))
             return;
 
         // identify ref nodes, ref spans and ref positions
@@ -1481,7 +1481,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
             }
         }
 
-        List<OffsetPosition> theSentences = 
+        List<OffsetPosition> sentencesOffsetPosition =
             SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));
     
         /*if (theSentences.size() == 0) {
@@ -1492,50 +1492,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
         }*/
 
         // segment the list of layout tokens according to the sentence segmentation if the coordinates are needed
-        List<List<LayoutToken>> segmentedParagraphTokens = new ArrayList<>();
-        List<LayoutToken> currentSentenceTokens = new ArrayList<>();
-        pos = 0;
-        
-        if (config.isGenerateTeiCoordinates("s")) {
-
-            int currentSentenceIndex = 0;
-//System.out.println(text);            
-//System.out.println("theSentences.size(): " + theSentences.size());
-            String sentenceChunk = text.substring(theSentences.get(currentSentenceIndex).start, theSentences.get(currentSentenceIndex).end);
-
-            for(int i=0; i<curParagraphTokens.size(); i++) {
-                LayoutToken token = curParagraphTokens.get(i);
-                if (token.getText() == null || token.getText().length() == 0)
-                    continue;
-                int newPos = sentenceChunk.indexOf(token.getText(), pos);
-                if ((newPos != -1) || SentenceUtilities.toSkipToken(token.getText())) {
-                    // just move on
-                    currentSentenceTokens.add(token);
-                    if (newPos != -1 && !SentenceUtilities.toSkipToken(token.getText()))
-                        pos = newPos;
-                } else {
-                    if (currentSentenceTokens.size() > 0) {
-                        segmentedParagraphTokens.add(currentSentenceTokens);
-                        currentSentenceIndex++;
-                        if (currentSentenceIndex >= theSentences.size()) {
-                            currentSentenceTokens = new ArrayList<>();
-                            break;
-                        }
-                        sentenceChunk = text.substring(theSentences.get(currentSentenceIndex).start, theSentences.get(currentSentenceIndex).end);
-                    }
-                    currentSentenceTokens = new ArrayList<>();
-                    currentSentenceTokens.add(token);
-                    pos = 0;
-                }
-
-                if (currentSentenceIndex >= theSentences.size())
-                    break;
-            }
-            // last sentence
-            if (currentSentenceTokens.size() > 0) {
-                // check sentence index too ?
-                segmentedParagraphTokens.add(currentSentenceTokens);
-            }
+        List<List<LayoutToken>> segmentedParagraphTokens = segmentLayoutTokenLists(curParagraphTokens, text, sentencesOffsetPosition);
 
 /*if (segmentedParagraphTokens.size() != theSentences.size()) {
 System.out.println("ERROR, segmentedParagraphTokens size:" + segmentedParagraphTokens.size() + " vs theSentences size: " + theSentences.size());
@@ -1552,16 +1509,20 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
     k++;
 }
 }*/
-        }
+
+
         // update the xml paragraph element
         int currenChildIndex = 0;
         pos = 0;
         int posInSentence = 0;
         int refIndex = 0;
-        for(int i=0; i<theSentences.size(); i++) {
-            pos = theSentences.get(i).start;
+        for(int i=0; i<sentencesOffsetPosition.size(); i++) {
+            pos = sentencesOffsetPosition.get(i).start;
             posInSentence = 0;
             Element sentenceElement = teiElement("s");
+
+            List<LayoutToken> currentSentenceTokens = segmentedParagraphTokens.get(i);
+
             if (config.isGenerateTeiIds()) {
                 String sID = KeyGen.getKey().substring(0, 7);
                 addXmlId(sentenceElement, "_" + sID);
@@ -1576,7 +1537,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
                 }
             }
 
-            int sentenceLength = theSentences.get(i).end - pos;
+            int sentenceLength = sentencesOffsetPosition.get(i).end - pos;
             // check if we have a ref between pos and pos+sentenceLength
             for(int j=refIndex; j<refPositions.size(); j++) {
                 int refPos = refPositions.get(j).intValue();
@@ -1600,8 +1561,8 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
                 }
             }
 
-            if (pos+posInSentence <= theSentences.get(i).end) {
-                String local_text_chunk = text.substring(pos+posInSentence, theSentences.get(i).end);
+            if (pos + posInSentence <= sentencesOffsetPosition.get(i).end) {
+                String local_text_chunk = text.substring(pos + posInSentence, sentencesOffsetPosition.get(i).end);
                 local_text_chunk = XmlBuilderUtils.stripNonValidXMLCharacters(local_text_chunk);
                 sentenceElement.appendChild(local_text_chunk);
                 curParagraph.appendChild(sentenceElement);

From 599559e6f4dfa14d089d31a9bccaa9021b63b96b Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Mon, 25 Jul 2022 15:40:19 +0900
Subject: [PATCH 12/23] minor changes

---
 .../java/org/grobid/core/document/TEIFormatter.java | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index aa98bd1019..06b4360347 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1522,6 +1522,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
             Element sentenceElement = teiElement("s");
 
             List<LayoutToken> currentSentenceTokens = segmentedParagraphTokens.get(i);
+//            List<Triple<String, String, OffsetPosition>> styleList = extractStylesList(currentSentenceTokens);
 
             if (config.isGenerateTeiIds()) {
                 String sID = KeyGen.getKey().substring(0, 7);
@@ -1547,9 +1548,9 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
                 if (refPos >= pos+posInSentence && refPos <= pos+sentenceLength) {
                     Node valueNode = mapRefNodes.get(new Integer(refPos));
                     if (pos+posInSentence < refPos) {
-                        String local_text_chunk = text.substring(pos+posInSentence, refPos);
-                        local_text_chunk = XmlBuilderUtils.stripNonValidXMLCharacters(local_text_chunk);
-                        sentenceElement.appendChild(local_text_chunk);
+                        String localTextChunk = text.substring(pos+posInSentence, refPos);
+                        localTextChunk = XmlBuilderUtils.stripNonValidXMLCharacters(localTextChunk);
+                        sentenceElement.appendChild(localTextChunk);
                     }
                     valueNode.detach();
                     sentenceElement.appendChild(valueNode);
@@ -1562,9 +1563,9 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
             }
 
             if (pos + posInSentence <= sentencesOffsetPosition.get(i).end) {
-                String local_text_chunk = text.substring(pos + posInSentence, sentencesOffsetPosition.get(i).end);
-                local_text_chunk = XmlBuilderUtils.stripNonValidXMLCharacters(local_text_chunk);
-                sentenceElement.appendChild(local_text_chunk);
+                String localTextChunk = text.substring(pos + posInSentence, sentencesOffsetPosition.get(i).end);
+                localTextChunk = XmlBuilderUtils.stripNonValidXMLCharacters(localTextChunk);
+                sentenceElement.appendChild(localTextChunk);
                 curParagraph.appendChild(sentenceElement);
             }
         }

From eee28ab2e0bdd3ada837a418ae39c7985bcd3faa Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Mon, 25 Jul 2022 16:06:41 +0900
Subject: [PATCH 13/23] implement change when segmenting paragraphs in
 sentences

---
 .../grobid/core/document/TEIFormatter.java    | 74 +++++++++++--------
 .../core/document/TEIFormatterTest.java       | 24 ++++++
 2 files changed, 68 insertions(+), 30 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 06b4360347..00a54e28dd 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -45,6 +45,7 @@
 import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;
 
 import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
 import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId;
@@ -1457,29 +1458,14 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
         if (StringUtils.isEmpty(text))
             return;
 
-        // identify ref nodes, ref spans and ref positions
-        Map<Integer,Node> mapRefNodes = new HashMap<>();
-        List<Integer> refPositions = new ArrayList<>();
-        List<OffsetPosition> forbiddenPositions = new ArrayList<>();
-        int pos = 0;
-        for(int i=0; i<curParagraph.getChildCount(); i++) {
-            Node theNode = curParagraph.getChild(i);
-            if (theNode instanceof Text) {
-                String chunk = theNode.getValue();
-                pos += chunk.length();
-            } else if (theNode instanceof Element) {
-                // for readability in another conditional
-                if (((Element) theNode).getLocalName().equals("ref")) {
-                    // map character offset of the node
-                    mapRefNodes.put(new Integer(pos), theNode);
-                    refPositions.add(new Integer(pos));
+        Map<Integer, Pair<Node, String>> mapRefNodes = identifyNestedNodes(curParagraph);
 
-                    String chunk = theNode.getValue();
-                    forbiddenPositions.add(new OffsetPosition(pos, pos+chunk.length()));
-                    pos += chunk.length();                    
-                }
-            }
-        }
+        List<OffsetPosition> forbiddenPositions = mapRefNodes.entrySet()
+            .stream()
+            .map(entry -> new OffsetPosition(entry.getKey(), entry.getValue().getRight().length() + entry.getKey()))
+            .collect(Collectors.toList());
+
+        List<Integer> refPositions = mapRefNodes.keySet().stream().sorted().collect(Collectors.toList());
 
         List<OffsetPosition> sentencesOffsetPosition =
             SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));
@@ -1513,7 +1499,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
 
         // update the xml paragraph element
         int currenChildIndex = 0;
-        pos = 0;
+        int pos = 0;
         int posInSentence = 0;
         int refIndex = 0;
         for(int i=0; i<sentencesOffsetPosition.size(); i++) {
@@ -1522,7 +1508,6 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
             Element sentenceElement = teiElement("s");
 
             List<LayoutToken> currentSentenceTokens = segmentedParagraphTokens.get(i);
-//            List<Triple<String, String, OffsetPosition>> styleList = extractStylesList(currentSentenceTokens);
 
             if (config.isGenerateTeiIds()) {
                 String sID = KeyGen.getKey().substring(0, 7);
@@ -1531,22 +1516,21 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
             if (config.isGenerateTeiCoordinates("s")) {
                 if (segmentedParagraphTokens.size()>=i+1) {
                     currentSentenceTokens = segmentedParagraphTokens.get(i);
-                    String coords = LayoutTokensUtil.getCoordsString(currentSentenceTokens);
-                    if (coords != null) {
-                        sentenceElement.addAttribute(new Attribute("coords", coords));
-                    }
+                    sentenceElement.addAttribute(new Attribute("coords", LayoutTokensUtil.getCoordsString(currentSentenceTokens)));
                 }
             }
 
+            List<Triple<String, String, OffsetPosition>> styleList = extractStylesList(currentSentenceTokens);
+
             int sentenceLength = sentencesOffsetPosition.get(i).end - pos;
             // check if we have a ref between pos and pos+sentenceLength
             for(int j=refIndex; j<refPositions.size(); j++) {
-                int refPos = refPositions.get(j).intValue();
+                int refPos = refPositions.get(j);
                 if (refPos < pos+posInSentence)
                     continue;
 
                 if (refPos >= pos+posInSentence && refPos <= pos+sentenceLength) {
-                    Node valueNode = mapRefNodes.get(new Integer(refPos));
+                    Node valueNode = mapRefNodes.get(refPos).getLeft();
                     if (pos+posInSentence < refPos) {
                         String localTextChunk = text.substring(pos+posInSentence, refPos);
                         localTextChunk = XmlBuilderUtils.stripNonValidXMLCharacters(localTextChunk);
@@ -1584,6 +1568,36 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
 
     }
 
+    protected Map<Integer, Pair<Node, String>> identifyNestedNodes(Element curParagraph) {
+        // identify ref nodes, ref spans and ref positions
+        Map<Integer,Pair<Node, String>> mapRefNodes = new HashMap<>();
+
+        int pos = 0;
+        for(int i = 0; i< curParagraph.getChildCount(); i++) {
+            Node theNode = curParagraph.getChild(i);
+            if (theNode instanceof Text) {
+                String chunk = theNode.getValue();
+                pos += chunk.length();
+            } else if (theNode instanceof Element) {
+                // for readability in another conditional
+                if (((Element) theNode).getLocalName().equals("ref")) {
+                    String chunk = theNode.getValue();
+                    // map character offset of the node and the chunk text
+                    mapRefNodes.put(pos, Pair.of(theNode, chunk));
+
+                    pos += chunk.length();
+                } else if (((Element) theNode).getLocalName().equals("hi")) {
+                    String chunk = theNode.getValue();
+                    mapRefNodes.put(pos, Pair.of(theNode, chunk));
+
+                    pos += chunk.length();
+                }
+            }
+        }
+
+        return mapRefNodes;
+    }
+
     private List<List<LayoutToken>> segmentLayoutTokenLists(List<LayoutToken> curParagraphTokens, String text, List<OffsetPosition> sentencesOffsetPosition) {
         int pos;
         List<List<LayoutToken>> segmentedParagraphTokens = new ArrayList<>();
diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
index 9a64b8141f..56b2d26afa 100644
--- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
@@ -1,6 +1,7 @@
 package org.grobid.core.document;
 
 import nu.xom.Element;
+import nu.xom.Node;
 import org.apache.commons.lang3.tuple.Pair;
 import org.apache.commons.lang3.tuple.Triple;
 import org.grobid.core.analyzers.GrobidAnalyzer;
@@ -16,6 +17,7 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Map;
 
 import static org.grobid.core.document.TEIFormatter.TEI_STYLE_BOLD_NAME;
 import static org.hamcrest.CoreMatchers.is;
@@ -92,6 +94,28 @@ public void testSegmentIntoSentences_Bold_ShouldWork() throws Exception {
             is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s>One sentence <ref>(Foppiano et al.)</ref>.</s><s>Second sentence <ref>(Lopez et al.)</ref>.</s></p>"));
     }
 
+    @Test
+    public void testIdentifyRefNotes() throws Exception {
+        Element currentParagraph = XmlBuilderUtils.teiElement("p");
+        currentParagraph.appendChild("One sentence");
+        currentParagraph.appendChild(" ");
+        currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Foppiano et al.)"));
+        currentParagraph.appendChild(". ");
+        currentParagraph.appendChild("Second sentence");
+        currentParagraph.appendChild(" ");
+        currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)"));
+        currentParagraph.appendChild(".");
+
+        Map<Integer, Pair<Node, String>> integerPairMap = new TEIFormatter(null, null).identifyNestedNodes(currentParagraph);
+
+        assertThat(integerPairMap.keySet(), hasSize(2));
+        assertThat(integerPairMap.keySet().stream().toArray()[1], is(13));
+        assertThat(integerPairMap.get(13).getRight(), is("(Foppiano et al.)"));
+
+        assertThat(integerPairMap.keySet().stream().toArray()[0], is(48));
+        assertThat(integerPairMap.get(48).getRight(), is("(Lopez et al.)"));
+    }
+
     @Test
     public void testExtractStylesList_single_shouldWork() throws Exception {
         String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure ";

From af9442846041b92afafbec9b88f5c02a8f44a860 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Mon, 25 Jul 2022 17:32:11 +0900
Subject: [PATCH 14/23] Test sentence segmentation with decoration and
 references

---
 .../core/document/TEIFormatterTest.java       | 57 +++++++++++++++++--
 1 file changed, 53 insertions(+), 4 deletions(-)

diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
index 56b2d26afa..eda76d2cd7 100644
--- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
@@ -19,7 +19,7 @@
 import java.util.List;
 import java.util.Map;
 
-import static org.grobid.core.document.TEIFormatter.TEI_STYLE_BOLD_NAME;
+import static org.grobid.core.document.TEIFormatter.*;
 import static org.hamcrest.CoreMatchers.is;
 import static org.hamcrest.Matchers.hasSize;
 import static org.junit.Assert.assertThat;
@@ -67,7 +67,7 @@ public void testSegmentIntoSentences_Bold_ShouldSplitIntoSentencesAndAddSTag() t
     }
 
     @Test
-    public void testSegmentIntoSentences_Bold_ShouldWork() throws Exception {
+    public void testSegmentIntoSentences_NoStyle_ShouldWork() throws Exception {
         String text = "One sentence (Foppiano et al.). Second sentence (Lopez et al.). ";
 
         GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
@@ -85,8 +85,6 @@ public void testSegmentIntoSentences_Bold_ShouldWork() throws Exception {
         currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)"));
         currentParagraph.appendChild(".");
 
-        System.out.println(currentParagraph.toXML());
-
         new TEIFormatter(null, null)
             .segmentIntoSentences(currentParagraph, currentParagraphTokens, config, "en");
 
@@ -94,6 +92,57 @@ public void testSegmentIntoSentences_Bold_ShouldWork() throws Exception {
             is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s>One sentence <ref>(Foppiano et al.)</ref>.</s><s>Second sentence <ref>(Lopez et al.)</ref>.</s></p>"));
     }
 
+
+    @Test
+    public void testSegmentIntoSentences_Style_ShouldWork() throws Exception {
+        String text1_0 = "One sentence ";
+        String text1_1 = ". ";
+        String text2_0 = "Second sentence ";
+        String text2_1 = ".";
+
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder()
+            .withSentenceSegmentation(true)
+            .build();
+
+        List<LayoutToken> tokens = new ArrayList<>();
+        List<LayoutToken> currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0);
+        tokens.addAll(currentParagraphTokens1_0);
+        List<LayoutToken> currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1);
+        tokens.addAll(currentParagraphTokens1_1);
+        List<LayoutToken> currentParagraphTokens2_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text2_0);
+        tokens.addAll(currentParagraphTokens2_0);
+        List<LayoutToken> currentParagraphTokens2_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text2_1);
+        tokens.addAll(currentParagraphTokens2_1);
+
+        currentParagraphTokens1_0.get(0).setBold(true);
+        currentParagraphTokens1_0.get(2).setBold(true);
+        currentParagraphTokens1_0.get(2).setItalic(true);
+
+        List<Triple<String, String, OffsetPosition>> styles1_0 = extractStylesList(currentParagraphTokens1_0);
+        List<Triple<String, String, OffsetPosition>> styles1_1 = extractStylesList(currentParagraphTokens1_1);
+        List<Triple<String, String, OffsetPosition>> styles2_0 = extractStylesList(currentParagraphTokens2_0);
+        List<Triple<String, String, OffsetPosition>> styles2_1 = extractStylesList(currentParagraphTokens2_1);
+
+        Element currentParagraph = XmlBuilderUtils.teiElement("p");
+
+        applyStyleList(currentParagraph, text1_0, styles1_0);
+        currentParagraph.appendChild(" ");
+        currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Foppiano et al.)"));
+        applyStyleList(currentParagraph, text1_1, styles1_1);
+        currentParagraph.appendChild(" ");
+        applyStyleList(currentParagraph, text2_0, styles2_0);
+        currentParagraph.appendChild(" ");
+        currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)"));
+        applyStyleList(currentParagraph, text2_1, styles2_1);
+
+        //Assuming these are injected correctly
+
+        new TEIFormatter(null, null).segmentIntoSentences(currentParagraph, tokens, config, "en");
+
+        assertThat(currentParagraph.toXML(),
+            is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s><hi rend=\"bold\">One</hi> <hi rend=\"bold italic\">sentence</hi>  <ref>(Foppiano et al.)</ref>.</s><s>Second sentence <ref>(Lopez et al.)</ref>.</s></p>"));
+    }
+
     @Test
     public void testIdentifyRefNotes() throws Exception {
         Element currentParagraph = XmlBuilderUtils.teiElement("p");

From e8a00fea993c2be4ce65342f4071ae7f7d5adeba Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Tue, 26 Jul 2022 14:14:32 +0900
Subject: [PATCH 15/23] Fix style extraction + adding more tests

---
 .../grobid/core/document/TEIFormatter.java    | 10 ++-
 .../core/document/TEIFormatterTest.java       | 72 +++++++++++++++++++
 2 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 00a54e28dd..e57edb4cdf 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1667,6 +1667,13 @@ public static List<Triple<String, String, OffsetPosition>> extractStylesList(Lis
             temporaryText.append(token.getText());
             int endOffset = temporaryText.toString().length();
 
+            if (token.getText().equals(" ")) {
+                if (value.length() > 0) {
+                    value.append(token.getText());
+                }
+                continue;
+            }
+
             StringBuilder styleName = new StringBuilder();
             if (token.isBold() && !ignoreStyles.contains(TEI_STYLE_BOLD_NAME)) {
                 styleName.append(TEI_STYLE_BOLD_NAME).append(" ");
@@ -1692,7 +1699,8 @@ public static List<Triple<String, String, OffsetPosition>> extractStylesList(Lis
             }
 
             if (styleNameTrimmed.equals(previousStyleName)) {
-                Iterables.getLast(styleList).getRight().end = endOffset;
+                Triple<String, String, OffsetPosition> last = Iterables.getLast(styleList);
+                styleList.set(styleList.size()-1, Triple.of(last.getLeft(), value.toString(), new OffsetPosition(last.getRight().start, endOffset)));
             } else {
                 styleList.add(Triple.of(styleNameTrimmed, value.toString(), new OffsetPosition(startOffset, endOffset)));
             }
diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
index eda76d2cd7..41ea80e4d5 100644
--- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
@@ -143,6 +143,58 @@ public void testSegmentIntoSentences_Style_ShouldWork() throws Exception {
             is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s><hi rend=\"bold\">One</hi> <hi rend=\"bold italic\">sentence</hi>  <ref>(Foppiano et al.)</ref>.</s><s>Second sentence <ref>(Lopez et al.)</ref>.</s></p>"));
     }
 
+    @Test
+    public void testSegmentIntoSentences_StyleBetweenTwoSentences_ShouldWork() throws Exception {
+        String text1_0 = "One sentence";
+        String text1_1 = ". ";
+        String text2_0 = "Second sentence";
+        String text2_1 = ".";
+
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder()
+            .withSentenceSegmentation(true)
+            .build();
+
+        List<LayoutToken> tokens = new ArrayList<>();
+        List<LayoutToken> currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0);
+        tokens.addAll(currentParagraphTokens1_0);
+        List<LayoutToken> currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1);
+        tokens.addAll(currentParagraphTokens1_1);
+        List<LayoutToken> currentParagraphTokens2_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text2_0);
+        tokens.addAll(currentParagraphTokens2_0);
+        List<LayoutToken> currentParagraphTokens2_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text2_1);
+        tokens.addAll(currentParagraphTokens2_1);
+
+        currentParagraphTokens1_0.get(0).setBold(true); //One
+        currentParagraphTokens1_0.get(2).setItalic(true); //sentence
+        currentParagraphTokens1_1.get(0).setItalic(true); //.
+        currentParagraphTokens2_0.get(0).setItalic(true); //Second
+        currentParagraphTokens2_0.get(2).setItalic(true); //sentence
+
+        List<Triple<String, String, OffsetPosition>> styles1_0 = extractStylesList(currentParagraphTokens1_0);
+        List<Triple<String, String, OffsetPosition>> styles1_1 = extractStylesList(currentParagraphTokens1_1);
+        List<Triple<String, String, OffsetPosition>> styles2_0 = extractStylesList(currentParagraphTokens2_0);
+        List<Triple<String, String, OffsetPosition>> styles2_1 = extractStylesList(currentParagraphTokens2_1);
+
+        Element currentParagraph = XmlBuilderUtils.teiElement("p");
+
+        applyStyleList(currentParagraph, text1_0, styles1_0);
+        currentParagraph.appendChild(" ");
+        currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Foppiano et al.)"));
+        applyStyleList(currentParagraph, text1_1, styles1_1);
+        currentParagraph.appendChild(" ");
+        applyStyleList(currentParagraph, text2_0, styles2_0);
+        currentParagraph.appendChild(" ");
+        currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)"));
+        applyStyleList(currentParagraph, text2_1, styles2_1);
+
+        //Assuming these are injected correctly
+
+        new TEIFormatter(null, null).segmentIntoSentences(currentParagraph, tokens, config, "en");
+
+        assertThat(currentParagraph.toXML(),
+            is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s><hi rend=\"bold\">One</hi> <hi rend=\"italic\">sentence</hi> <ref>(Foppiano et al.)</ref><hi rend=\"italic\">.</hi></s><s><hi rend=\"italic\">Second sentence</hi> <ref>(Lopez et al.)</ref>.</s></p>"));
+    }
+
     @Test
     public void testIdentifyRefNotes() throws Exception {
         Element currentParagraph = XmlBuilderUtils.teiElement("p");
@@ -244,6 +296,26 @@ public void testExtractStylesList_combined_shouldWork() throws Exception {
         assertThat(pairs.get(1).getRight().end, is(86));
     }
 
+    @Test
+    public void testExtractStylesList_continuousTokens_shouldWork() throws Exception {
+        String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure ";
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().build();
+        List<LayoutToken> currentParagraphTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+
+        currentParagraphTokens.get(24).setBold(true);
+        currentParagraphTokens.get(26).setBold(true);
+        currentParagraphTokens.get(28).setBold(true);
+        currentParagraphTokens.get(30).setBold(true);
+
+        List<Triple<String, String, OffsetPosition>> pairs = TEIFormatter.extractStylesList(currentParagraphTokens);
+
+        assertThat(pairs, hasSize(1));
+        assertThat(pairs.get(0).getLeft(), is("bold"));
+        assertThat(pairs.get(0).getMiddle(), is("Nd 2 Fe 14"));
+        assertThat(pairs.get(0).getRight().start, is(76));
+        assertThat(pairs.get(0).getRight().end, is(86));
+    }
+
     @Test
     public void testExtractStylesList_ignoreBold_shouldWork() throws Exception {
         String text = "The room temperature magnetic hysteresis loop for melt-spun ribbons of pure Nd 2 Fe 14 B is shown in Figure ";

From ccce04939a2708215a17f6124b12820e9dad9976 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Wed, 27 Jul 2022 15:15:05 +0900
Subject: [PATCH 16/23] Split decoration between sentences if needed

---
 .../grobid/core/document/TEIFormatter.java    | 132 +++++++++++--
 .../core/document/TEIFormatterTest.java       | 186 ++++++++++++++++++
 2 files changed, 306 insertions(+), 12 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index e57edb4cdf..4e9c05513f 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1462,14 +1462,17 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
 
         List<OffsetPosition> forbiddenPositions = mapRefNodes.entrySet()
             .stream()
+            .filter(entry -> ((Element) entry.getValue().getLeft()).getLocalName().equals("ref"))
             .map(entry -> new OffsetPosition(entry.getKey(), entry.getValue().getRight().length() + entry.getKey()))
             .collect(Collectors.toList());
 
-        List<Integer> refPositions = mapRefNodes.keySet().stream().sorted().collect(Collectors.toList());
-
         List<OffsetPosition> sentencesOffsetPosition =
             SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));
-    
+
+        mapRefNodes = splitMapNodesOverSentenceSplits(mapRefNodes, text, sentencesOffsetPosition);
+
+        List<Integer> refPositions = mapRefNodes.keySet().stream().sorted().collect(Collectors.toList());
+
         /*if (theSentences.size() == 0) {
             // this should normally not happen, but it happens (depending on sentence splitter, usually the text 
             // is just a punctuation)
@@ -1507,21 +1510,17 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
             posInSentence = 0;
             Element sentenceElement = teiElement("s");
 
-            List<LayoutToken> currentSentenceTokens = segmentedParagraphTokens.get(i);
-
             if (config.isGenerateTeiIds()) {
                 String sID = KeyGen.getKey().substring(0, 7);
                 addXmlId(sentenceElement, "_" + sID);
             }
             if (config.isGenerateTeiCoordinates("s")) {
                 if (segmentedParagraphTokens.size()>=i+1) {
-                    currentSentenceTokens = segmentedParagraphTokens.get(i);
+                    List<LayoutToken> currentSentenceTokens = segmentedParagraphTokens.get(i);
                     sentenceElement.addAttribute(new Attribute("coords", LayoutTokensUtil.getCoordsString(currentSentenceTokens)));
                 }
             }
 
-            List<Triple<String, String, OffsetPosition>> styleList = extractStylesList(currentSentenceTokens);
-
             int sentenceLength = sentencesOffsetPosition.get(i).end - pos;
             // check if we have a ref between pos and pos+sentenceLength
             for(int j=refIndex; j<refPositions.size(); j++) {
@@ -1565,12 +1564,121 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
                 }
             }
         }
+    }
+
+    /**
+     * Adjust the nodes that could be over a sentence split.
+     * We know that refs cannot be split over sentences, so we can ignore them happily
+     **/
+    protected Map<Integer, Pair<Node, String>> splitMapNodesOverSentenceSplits(Map<Integer, Pair<Node, String>> mapRefNodes, String text, List<OffsetPosition> sentencesOffsetPosition) {
+        Map<Integer, Pair<Node, String>> adjustedMap = new TreeMap<>();
+
+        StringBuilder textAccumulator = new StringBuilder();
+        List<Integer> refPositions = mapRefNodes.keySet().stream().sorted().collect(Collectors.toList());
+
+        int currentNodeIdx = 0;
+        for(int i=0; i<sentencesOffsetPosition.size(); i++) {
+            OffsetPosition offsetPosition = sentencesOffsetPosition.get(i);
+            int posInSentence = 0;
+            int sentenceOffsetStart = offsetPosition.start;
+            int sentenceOffsetEnd = offsetPosition.end;
+            StringBuilder sentenceAccumulator = new StringBuilder();
+
+            for(int j=currentNodeIdx; j<refPositions.size(); j++) {
+                int refPos = refPositions.get(j);
+                Node currentNode = mapRefNodes.get(refPos).getLeft();
+                if (((Element) currentNode).getLocalName().equals("ref")) {
+                    adjustedMap.put(refPos, mapRefNodes.get(refPos));
+                    textAccumulator.append(mapRefNodes.get(refPos).getRight());
+                    sentenceAccumulator.append(mapRefNodes.get(refPos).getRight());
+                    continue;
+                }
+                int currentNodeLength = currentNode.getValue().length();
+
+                //The ref position is falling between sentence start and end
+                if (refPos >= sentenceOffsetStart+posInSentence && refPos < sentenceOffsetEnd) {
+
+                    //adding what's before the refPos to the accumulator
+                    if (refPos > sentenceOffsetStart + posInSentence) {
+                        textAccumulator.append(text, sentenceOffsetStart + posInSentence, refPos);
+                        sentenceAccumulator.append(text, sentenceOffsetStart + posInSentence, refPos);
+                    }
+
+                    //the node finishes before sentence ends - all good here :-)
+                    if (sentenceOffsetStart + posInSentence + currentNodeLength < sentenceOffsetEnd) {
+                        adjustedMap.put(refPos, mapRefNodes.get(refPos));
+                        textAccumulator.append(mapRefNodes.get(refPos).getRight());
+                        sentenceAccumulator.append(mapRefNodes.get(refPos).getRight());
+                        posInSentence = refPos + currentNodeLength - sentenceOffsetStart;
+                        continue;
+                    } else {
+                        //The node exceed the sentence, we are in trouble! Cut it!
+                        int splitElementSize = sentenceOffsetEnd - refPos;
+
+                        String substringPrefix = currentNode.getValue().substring(0, splitElementSize);
+                        Element newElementPrefix = generateNewElement((Element) currentNode, substringPrefix);
+                        adjustedMap.put(refPos, Pair.of(newElementPrefix, substringPrefix));
+                        textAccumulator.append(substringPrefix);
+                        posInSentence = refPos + newElementPrefix.getValue().length() - sentenceOffsetStart;
+                        currentNodeIdx = j;
+                        break;
+                    }
+                } else if (refPos > sentenceOffsetEnd) {
+                    // add to accumulator the rest of the sentence and moving on to the next sentence
+                    textAccumulator.append(text, sentenceOffsetStart + posInSentence, sentenceOffsetEnd);
+                    sentenceAccumulator.append(text, sentenceOffsetStart + posInSentence, sentenceOffsetEnd);
+                    break;
+                } else if (refPos < sentenceOffsetStart && textAccumulator.length() > refPos
+                    && textAccumulator.length() < refPos + currentNodeLength) {
+                    //The node is between this sentence and the previous one - trouble again dude
+
+                    String exceeded = textAccumulator.substring(0, refPos) + mapRefNodes.get(refPos).getLeft().getValue();
+
+                    if (exceeded.length() > sentenceOffsetEnd) {
+                        String previousNodeSuffix = exceeded.substring(sentenceOffsetStart, sentenceOffsetEnd);
+                        Element newElementSuffix = generateNewElement((Element) currentNode, previousNodeSuffix);
+                        adjustedMap.put(sentenceOffsetStart, Pair.of(newElementSuffix, previousNodeSuffix));
+                        if (textAccumulator.length() < sentenceOffsetStart) {
+                            textAccumulator.append(exceeded, textAccumulator.length(), sentenceOffsetStart);
+                        }
+                        textAccumulator.append(previousNodeSuffix);
+
+                        posInSentence = sentenceOffsetStart + previousNodeSuffix.length();
+                        currentNodeIdx = j;
+                        break;
+                    } else {
+                        String previousNodeSuffix = exceeded.substring(sentenceOffsetStart);
+                        Element newElementSuffix = generateNewElement((Element) currentNode, previousNodeSuffix);
+                        adjustedMap.put(sentenceOffsetStart, Pair.of(newElementSuffix, previousNodeSuffix));
+                        if (textAccumulator.length() < sentenceOffsetStart) {
+                            textAccumulator.append(exceeded, textAccumulator.length(), sentenceOffsetStart);
+                        }
+                        textAccumulator.append(previousNodeSuffix);
+                        posInSentence = sentenceOffsetStart + previousNodeSuffix.length();
+                    }
+                }
+            }
+
+            if (sentenceOffsetStart + posInSentence <= sentenceOffsetEnd) {
+                textAccumulator.append(text, sentenceOffsetStart + posInSentence, sentencesOffsetPosition.get(i).end);
+            }
+        }
 
+        return adjustedMap;
+    }
+
+    private Element generateNewElement(Element currentNode, String value) {
+        Element newElement = teiElement(currentNode.getLocalName(), value);
+        for (int i=0; i < currentNode.getAttributeCount(); i++) {
+            Attribute a = new Attribute(currentNode.getAttribute(i));
+            newElement.addAttribute(a);
+        }
+        return newElement;
     }
 
     protected Map<Integer, Pair<Node, String>> identifyNestedNodes(Element curParagraph) {
         // identify ref nodes, ref spans and ref positions
-        Map<Integer,Pair<Node, String>> mapRefNodes = new HashMap<>();
+        Map<Integer,Pair<Node, String>> mapNodes = new HashMap<>();
 
         int pos = 0;
         for(int i = 0; i< curParagraph.getChildCount(); i++) {
@@ -1583,19 +1691,19 @@ protected Map<Integer, Pair<Node, String>> identifyNestedNodes(Element curParagr
                 if (((Element) theNode).getLocalName().equals("ref")) {
                     String chunk = theNode.getValue();
                     // map character offset of the node and the chunk text
-                    mapRefNodes.put(pos, Pair.of(theNode, chunk));
+                    mapNodes.put(pos, Pair.of(theNode, chunk));
 
                     pos += chunk.length();
                 } else if (((Element) theNode).getLocalName().equals("hi")) {
                     String chunk = theNode.getValue();
-                    mapRefNodes.put(pos, Pair.of(theNode, chunk));
+                    mapNodes.put(pos, Pair.of(theNode, chunk));
 
                     pos += chunk.length();
                 }
             }
         }
 
-        return mapRefNodes;
+        return mapNodes;
     }
 
     private List<List<LayoutToken>> segmentLayoutTokenLists(List<LayoutToken> curParagraphTokens, String text, List<OffsetPosition> sentencesOffsetPosition) {
diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
index 41ea80e4d5..dd3575fa67 100644
--- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
@@ -7,10 +7,12 @@
 import org.grobid.core.analyzers.GrobidAnalyzer;
 import org.grobid.core.document.xml.XmlBuilderUtils;
 import org.grobid.core.engines.config.GrobidAnalysisConfig;
+import org.grobid.core.lang.Language;
 import org.grobid.core.layout.LayoutToken;
 import org.grobid.core.utilities.GrobidProperties;
 import org.grobid.core.utilities.LayoutTokensUtil;
 import org.grobid.core.utilities.OffsetPosition;
+import org.grobid.core.utilities.SentenceUtilities;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -18,8 +20,10 @@
 import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.Collectors;
 
 import static org.grobid.core.document.TEIFormatter.*;
+import static org.hamcrest.CoreMatchers.any;
 import static org.hamcrest.CoreMatchers.is;
 import static org.hamcrest.Matchers.hasSize;
 import static org.junit.Assert.assertThat;
@@ -195,6 +199,188 @@ public void testSegmentIntoSentences_StyleBetweenTwoSentences_ShouldWork() throw
             is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s><hi rend=\"bold\">One</hi> <hi rend=\"italic\">sentence</hi> <ref>(Foppiano et al.)</ref><hi rend=\"italic\">.</hi></s><s><hi rend=\"italic\">Second sentence</hi> <ref>(Lopez et al.)</ref>.</s></p>"));
     }
 
+    @Test
+    public void testSegmentIntoSentences_StyleBetweenTwoSentences_oneRef_ShouldWork() throws Exception {
+        String text1_0 = "One sentence. Second sentence";
+        String text1_1 = ".";
+
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder()
+            .withSentenceSegmentation(true)
+            .build();
+
+        List<LayoutToken> tokens = new ArrayList<>();
+        List<LayoutToken> currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0);
+        tokens.addAll(currentParagraphTokens1_0);
+        List<LayoutToken> currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1);
+        tokens.addAll(currentParagraphTokens1_1);
+
+        currentParagraphTokens1_0.get(0).setBold(true); //One
+        currentParagraphTokens1_0.get(2).setItalic(true); //sentence
+        currentParagraphTokens1_0.get(3).setItalic(true); //.
+        currentParagraphTokens1_0.get(5).setItalic(true); //Second
+
+        List<Triple<String, String, OffsetPosition>> styles1_0 = extractStylesList(currentParagraphTokens1_0);
+        List<Triple<String, String, OffsetPosition>> styles1_1 = extractStylesList(currentParagraphTokens1_1);
+
+        Element currentParagraph = XmlBuilderUtils.teiElement("p");
+
+        applyStyleList(currentParagraph, text1_0, styles1_0);
+        currentParagraph.appendChild(" ");
+        currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)"));
+        applyStyleList(currentParagraph, text1_1, styles1_1);
+
+        new TEIFormatter(null, null).segmentIntoSentences(currentParagraph, tokens, config, "en");
+
+        assertThat(currentParagraph.toXML(),
+            is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s><hi rend=\"bold\">One</hi> <hi rend=\"italic\">sentence.</hi></s><s><hi rend=\"italic\">Second</hi> sentence <ref>(Lopez et al.)</ref>.</s></p>"));
+    }
+
+    @Test
+    public void testSegmentIntoSentences_StyleBetweenTwoSentencesWithoutRefs_ShouldWork() throws Exception {
+        String text = "One sentence. Second sentence.";
+
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder()
+            .withSentenceSegmentation(true)
+            .build();
+
+        List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+
+        tokens.get(0).setBold(true); //One
+        tokens.get(2).setItalic(true); //sentence
+        tokens.get(3).setItalic(true); //.
+        tokens.get(5).setItalic(true); //Second
+//        currentParagraphTokens.get(7).setItalic(true); //sentence
+
+        List<Triple<String, String, OffsetPosition>> styles = extractStylesList(tokens);
+
+        Element currentParagraph = XmlBuilderUtils.teiElement("p");
+
+        applyStyleList(currentParagraph, text, styles);
+
+        //Assuming these are injected correctly
+        new TEIFormatter(null, null).segmentIntoSentences(currentParagraph, tokens, config, "en");
+
+        assertThat(currentParagraph.toXML(),
+            is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s><hi rend=\"bold\">One</hi> <hi rend=\"italic\">sentence.</hi></s><s><hi rend=\"italic\">Second</hi> sentence.</s></p>"));
+    }
+
+    @Test
+    public void testSplitMapNodesOverSentenceSplits_shouldAdjustNodes() {
+        TEIFormatter teiFormatter = new TEIFormatter(null, null);
+
+        String text1_0 = "One sentence. Second sentence";
+        String text1_1 = ".";
+
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder()
+            .withSentenceSegmentation(true)
+            .build();
+
+        List<LayoutToken> tokens = new ArrayList<>();
+        List<LayoutToken> currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0);
+        tokens.addAll(currentParagraphTokens1_0);
+        List<LayoutToken> currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1);
+        tokens.addAll(currentParagraphTokens1_1);
+
+        currentParagraphTokens1_0.get(0).setBold(true); //One
+        currentParagraphTokens1_0.get(2).setItalic(true); //sentence
+        currentParagraphTokens1_0.get(3).setItalic(true); //.
+        currentParagraphTokens1_0.get(5).setItalic(true); //Second
+
+        List<Triple<String, String, OffsetPosition>> styles1_0 = extractStylesList(currentParagraphTokens1_0);
+        List<Triple<String, String, OffsetPosition>> styles1_1 = extractStylesList(currentParagraphTokens1_1);
+
+        Element currentParagraph = XmlBuilderUtils.teiElement("p");
+
+        applyStyleList(currentParagraph, text1_0, styles1_0);
+        currentParagraph.appendChild(" ");
+        currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)"));
+        applyStyleList(currentParagraph, text1_1, styles1_1);
+
+        String text = currentParagraph.getValue();
+
+        Map<Integer, Pair<Node, String>> nestedNodes = teiFormatter.identifyNestedNodes(currentParagraph);
+        List<OffsetPosition> forbiddenPositions = nestedNodes.entrySet()
+            .stream()
+            .filter(entry -> ((Element) entry.getValue().getLeft()).getLocalName().equals("ref"))
+            .map(entry -> new OffsetPosition(entry.getKey(), entry.getValue().getRight().length() + entry.getKey()))
+            .collect(Collectors.toList());
+
+        List<OffsetPosition> sentencesOffsetPosition =
+            SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, tokens, new Language("en"));
+
+        Map<Integer, Pair<Node, String>> adjustedNestedNodes = teiFormatter.splitMapNodesOverSentenceSplits(nestedNodes, text, sentencesOffsetPosition);
+
+        assertThat(adjustedNestedNodes.size(), is(4));
+
+        assertThat(new ArrayList<>(adjustedNestedNodes.keySet()), is(Arrays.asList(0, 4, 14, 30)));
+
+        assertThat(adjustedNestedNodes.get(0).getRight(), is("One"));
+        assertThat(adjustedNestedNodes.get(4).getRight(), is("sentence."));
+        assertThat(adjustedNestedNodes.get(14).getRight(), is("Second"));
+        assertThat(adjustedNestedNodes.get(30).getRight(), is("(Lopez et al.)"));
+    }
+
+    @Test
+    public void testSplitMapNodesOverThreeSentenceSplits_shouldAdjustNodes() {
+        TEIFormatter teiFormatter = new TEIFormatter(null, null);
+
+        String text1_0 = "One sentence. Second sentence. Third sentence";
+        String text1_1 = ".";
+
+        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder()
+            .withSentenceSegmentation(true)
+            .build();
+
+        List<LayoutToken> tokens = new ArrayList<>();
+        List<LayoutToken> currentParagraphTokens1_0 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_0);
+        tokens.addAll(currentParagraphTokens1_0);
+        List<LayoutToken> currentParagraphTokens1_1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text1_1);
+        tokens.addAll(currentParagraphTokens1_1);
+
+        currentParagraphTokens1_0.get(0).setBold(true); //One
+        currentParagraphTokens1_0.get(2).setItalic(true); //sentence
+        currentParagraphTokens1_0.get(3).setItalic(true); //.
+        currentParagraphTokens1_0.get(5).setItalic(true); //Second
+        currentParagraphTokens1_0.get(7).setItalic(true); //sentence
+        currentParagraphTokens1_0.get(8).setItalic(true); //.
+        currentParagraphTokens1_0.get(10).setItalic(true); //Third
+//        currentParagraphTokens1_0.get(12).setItalic(true); //sentence
+
+        List<Triple<String, String, OffsetPosition>> styles1_0 = extractStylesList(currentParagraphTokens1_0);
+        List<Triple<String, String, OffsetPosition>> styles1_1 = extractStylesList(currentParagraphTokens1_1);
+
+        Element currentParagraph = XmlBuilderUtils.teiElement("p");
+
+        applyStyleList(currentParagraph, text1_0, styles1_0);
+        currentParagraph.appendChild(" ");
+        currentParagraph.appendChild(XmlBuilderUtils.teiElement("ref", "(Lopez et al.)"));
+        applyStyleList(currentParagraph, text1_1, styles1_1);
+
+        String text = currentParagraph.getValue();
+
+        Map<Integer, Pair<Node, String>> nestedNodes = teiFormatter.identifyNestedNodes(currentParagraph);
+        List<OffsetPosition> forbiddenPositions = nestedNodes.entrySet()
+            .stream()
+            .filter(entry -> ((Element) entry.getValue().getLeft()).getLocalName().equals("ref"))
+            .map(entry -> new OffsetPosition(entry.getKey(), entry.getValue().getRight().length() + entry.getKey()))
+            .collect(Collectors.toList());
+
+        List<OffsetPosition> sentencesOffsetPosition =
+            SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, tokens, new Language("en"));
+
+        Map<Integer, Pair<Node, String>> adjustedNestedNodes = teiFormatter.splitMapNodesOverSentenceSplits(nestedNodes, text, sentencesOffsetPosition);
+
+        assertThat(adjustedNestedNodes.size(), is(5));
+
+        assertThat(new ArrayList<>(adjustedNestedNodes.keySet()), is(Arrays.asList(0, 4, 14, 31, 46)));
+
+        assertThat(adjustedNestedNodes.get(0).getRight(), is("One"));
+        assertThat(adjustedNestedNodes.get(4).getRight(), is("sentence."));
+        assertThat(adjustedNestedNodes.get(14).getRight(), is("Second sentence."));
+        assertThat(adjustedNestedNodes.get(31).getRight(), is("Third"));
+        assertThat(adjustedNestedNodes.get(46).getRight(), is("(Lopez et al.)"));
+    }
+
     @Test
     public void testIdentifyRefNotes() throws Exception {
         Element currentParagraph = XmlBuilderUtils.teiElement("p");

From aaf211d69d9528a4508b4b2f61ce85a61a85f36f Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Wed, 27 Jul 2022 16:10:56 +0900
Subject: [PATCH 17/23] Fix bugs with the text accumulator

---
 .../grobid/core/document/TEIFormatter.java    | 35 +++++++++++--------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 4e9c05513f..442f77c81f 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1458,9 +1458,9 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
         if (StringUtils.isEmpty(text))
             return;
 
-        Map<Integer, Pair<Node, String>> mapRefNodes = identifyNestedNodes(curParagraph);
+        Map<Integer, Pair<Node, String>> rawMapRefNodes = identifyNestedNodes(curParagraph);
 
-        List<OffsetPosition> forbiddenPositions = mapRefNodes.entrySet()
+        List<OffsetPosition> forbiddenPositions = rawMapRefNodes.entrySet()
             .stream()
             .filter(entry -> ((Element) entry.getValue().getLeft()).getLocalName().equals("ref"))
             .map(entry -> new OffsetPosition(entry.getKey(), entry.getValue().getRight().length() + entry.getKey()))
@@ -1469,7 +1469,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
         List<OffsetPosition> sentencesOffsetPosition =
             SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));
 
-        mapRefNodes = splitMapNodesOverSentenceSplits(mapRefNodes, text, sentencesOffsetPosition);
+        Map<Integer, Pair<Node, String>> mapRefNodes = splitMapNodesOverSentenceSplits(rawMapRefNodes, text, sentencesOffsetPosition);
 
         List<Integer> refPositions = mapRefNodes.keySet().stream().sorted().collect(Collectors.toList());
 
@@ -1582,18 +1582,25 @@ protected Map<Integer, Pair<Node, String>> splitMapNodesOverSentenceSplits(Map<I
             int posInSentence = 0;
             int sentenceOffsetStart = offsetPosition.start;
             int sentenceOffsetEnd = offsetPosition.end;
-            StringBuilder sentenceAccumulator = new StringBuilder();
 
             for(int j=currentNodeIdx; j<refPositions.size(); j++) {
                 int refPos = refPositions.get(j);
                 Node currentNode = mapRefNodes.get(refPos).getLeft();
+                int currentNodeLength = currentNode.getValue().length();
+
                 if (((Element) currentNode).getLocalName().equals("ref")) {
+                    if (refPos > sentenceOffsetEnd) {
+                        currentNodeIdx = j;
+                        break;
+                    }
                     adjustedMap.put(refPos, mapRefNodes.get(refPos));
+                    if (textAccumulator.length() < refPos) {
+                        textAccumulator.append(text, textAccumulator.length(), refPos);
+                    }
                     textAccumulator.append(mapRefNodes.get(refPos).getRight());
-                    sentenceAccumulator.append(mapRefNodes.get(refPos).getRight());
+                    posInSentence = refPos + currentNodeLength - sentenceOffsetStart;
                     continue;
                 }
-                int currentNodeLength = currentNode.getValue().length();
 
                 //The ref position is falling between sentence start and end
                 if (refPos >= sentenceOffsetStart+posInSentence && refPos < sentenceOffsetEnd) {
@@ -1601,16 +1608,14 @@ protected Map<Integer, Pair<Node, String>> splitMapNodesOverSentenceSplits(Map<I
                     //adding what's before the refPos to the accumulator
                     if (refPos > sentenceOffsetStart + posInSentence) {
                         textAccumulator.append(text, sentenceOffsetStart + posInSentence, refPos);
-                        sentenceAccumulator.append(text, sentenceOffsetStart + posInSentence, refPos);
+                        posInSentence = refPos - sentenceOffsetStart;
                     }
 
                     //the node finishes before sentence ends - all good here :-)
                     if (sentenceOffsetStart + posInSentence + currentNodeLength < sentenceOffsetEnd) {
                         adjustedMap.put(refPos, mapRefNodes.get(refPos));
                         textAccumulator.append(mapRefNodes.get(refPos).getRight());
-                        sentenceAccumulator.append(mapRefNodes.get(refPos).getRight());
                         posInSentence = refPos + currentNodeLength - sentenceOffsetStart;
-                        continue;
                     } else {
                         //The node exceed the sentence, we are in trouble! Cut it!
                         int splitElementSize = sentenceOffsetEnd - refPos;
@@ -1621,12 +1626,14 @@ protected Map<Integer, Pair<Node, String>> splitMapNodesOverSentenceSplits(Map<I
                         textAccumulator.append(substringPrefix);
                         posInSentence = refPos + newElementPrefix.getValue().length() - sentenceOffsetStart;
                         currentNodeIdx = j;
-                        break;
+//                        break;
                     }
                 } else if (refPos > sentenceOffsetEnd) {
                     // add to accumulator the rest of the sentence and moving on to the next sentence
-                    textAccumulator.append(text, sentenceOffsetStart + posInSentence, sentenceOffsetEnd);
-                    sentenceAccumulator.append(text, sentenceOffsetStart + posInSentence, sentenceOffsetEnd);
+                    String textChunk = text.substring(sentenceOffsetStart + posInSentence, sentenceOffsetEnd);
+                    textAccumulator.append(textChunk);
+                    posInSentence += textChunk.length();
+                    currentNodeIdx = j;
                     break;
                 } else if (refPos < sentenceOffsetStart && textAccumulator.length() > refPos
                     && textAccumulator.length() < refPos + currentNodeLength) {
@@ -1659,7 +1666,7 @@ protected Map<Integer, Pair<Node, String>> splitMapNodesOverSentenceSplits(Map<I
                 }
             }
 
-            if (sentenceOffsetStart + posInSentence <= sentenceOffsetEnd) {
+            if (sentenceOffsetStart + posInSentence < sentenceOffsetEnd) {
                 textAccumulator.append(text, sentenceOffsetStart + posInSentence, sentencesOffsetPosition.get(i).end);
             }
         }
@@ -1775,7 +1782,7 @@ public static List<Triple<String, String, OffsetPosition>> extractStylesList(Lis
             temporaryText.append(token.getText());
             int endOffset = temporaryText.toString().length();
 
-            if (token.getText().equals(" ")) {
+            if (token.getText().equals(" ") || token.getText().equals("\n")) {
                 if (value.length() > 0) {
                     value.append(token.getText());
                 }

From b82cc4322f37eecd273cc14523ad7d48e1eaeb1f Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Thu, 28 Jul 2022 11:26:07 +0900
Subject: [PATCH 18/23] Fix incorrect split and position in sentence markers

---
 .../main/java/org/grobid/core/document/TEIFormatter.java   | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 442f77c81f..2498ddb6e7 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1626,7 +1626,7 @@ protected Map<Integer, Pair<Node, String>> splitMapNodesOverSentenceSplits(Map<I
                         textAccumulator.append(substringPrefix);
                         posInSentence = refPos + newElementPrefix.getValue().length() - sentenceOffsetStart;
                         currentNodeIdx = j;
-//                        break;
+                        break;
                     }
                 } else if (refPos > sentenceOffsetEnd) {
                     // add to accumulator the rest of the sentence and moving on to the next sentence
@@ -1650,10 +1650,11 @@ protected Map<Integer, Pair<Node, String>> splitMapNodesOverSentenceSplits(Map<I
                         }
                         textAccumulator.append(previousNodeSuffix);
 
-                        posInSentence = sentenceOffsetStart + previousNodeSuffix.length();
+                        posInSentence = textAccumulator.length() - sentenceOffsetStart;
                         currentNodeIdx = j;
                         break;
                     } else {
+                        //The item is within this sentence. Cool stuff.
                         String previousNodeSuffix = exceeded.substring(sentenceOffsetStart);
                         Element newElementSuffix = generateNewElement((Element) currentNode, previousNodeSuffix);
                         adjustedMap.put(sentenceOffsetStart, Pair.of(newElementSuffix, previousNodeSuffix));
@@ -1661,7 +1662,7 @@ protected Map<Integer, Pair<Node, String>> splitMapNodesOverSentenceSplits(Map<I
                             textAccumulator.append(exceeded, textAccumulator.length(), sentenceOffsetStart);
                         }
                         textAccumulator.append(previousNodeSuffix);
-                        posInSentence = sentenceOffsetStart + previousNodeSuffix.length();
+                        posInSentence = textAccumulator.length() - sentenceOffsetStart;
                     }
                 }
             }

From 053b235564e48a35a2690da886c67534197023ca Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Mon, 12 Sep 2022 12:44:07 +0900
Subject: [PATCH 19/23] remove suffix space when there is no more text

---
 .../main/java/org/grobid/core/data/Table.java |  2 --
 .../grobid/core/document/TEIFormatter.java    | 10 +++++--
 .../core/document/TEIFormatterTest.java       | 30 +++++++++++++++++--
 3 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java
index 9a9aa6cf3e..a2adc428a2 100644
--- a/grobid-core/src/main/java/org/grobid/core/data/Table.java
+++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java
@@ -33,11 +33,9 @@
 import nu.xom.Attribute;
 import nu.xom.Element;
 import nu.xom.Node;
-import nu.xom.Text;
 
 import static org.grobid.core.document.TEIFormatter.applyStyleList;
 import static org.grobid.core.document.TEIFormatter.extractStylesList;
-import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
 import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId;
 import static org.grobid.core.document.xml.XmlBuilderUtils.textNode;
 
diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 2498ddb6e7..671b887834 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1424,7 +1424,7 @@ public static Element applyStyleList(Element paragraphElem, String text, List<Tr
             String subString = text.substring(lastPosition, offsetStyle.start);
             String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : "";
             String suffixSpace = "";
-            if (subString.length() > 1) {
+            if (subString.length() > prefixSpace.length()) {
                 suffixSpace = StringUtils.endsWith(subString, " ") ? " " : "";
             }
             paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " ")) + suffixSpace);
@@ -1435,7 +1435,11 @@ public static Element applyStyleList(Element paragraphElem, String text, List<Tr
             paragraphElem.appendChild(rend);
         }
         String subString = text.substring(lastPosition);
-        String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : "";
+        String subStringNormalized = StringUtils.normalizeSpace(subString);
+        String prefixSpace = "";
+        if (subStringNormalized.length() > 0) {
+            prefixSpace = StringUtils.startsWith(subString, " ") ? " " : "";
+        }
         paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " ")));
 
         return paragraphElem;
@@ -1819,10 +1823,12 @@ public static List<Triple<String, String, OffsetPosition>> extractStylesList(Lis
                 styleList.set(styleList.size()-1, Triple.of(last.getLeft(), value.toString(), new OffsetPosition(last.getRight().start, endOffset)));
             } else {
                 styleList.add(Triple.of(styleNameTrimmed, value.toString(), new OffsetPosition(startOffset, endOffset)));
+//                value = new StringBuilder();
             }
 
             previousStyleName = styleNameTrimmed;
         }
+//        List<Triple<String, String, OffsetPosition>> postProcessedStyleList = styleList.stream().map(s -> Triple.of(s.getLeft(), s.getMiddle().substring(s.getRight().start, s.getRight().end), s.getRight())).collect(Collectors.toList());
 
         return styleList;
     }
diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
index dd3575fa67..9d2dc3d2cd 100644
--- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java
@@ -14,6 +14,7 @@
 import org.grobid.core.utilities.OffsetPosition;
 import org.grobid.core.utilities.SentenceUtilities;
 import org.junit.BeforeClass;
+import org.junit.Ignore;
 import org.junit.Test;
 
 import java.util.ArrayList;
@@ -23,7 +24,6 @@
 import java.util.stream.Collectors;
 
 import static org.grobid.core.document.TEIFormatter.*;
-import static org.hamcrest.CoreMatchers.any;
 import static org.hamcrest.CoreMatchers.is;
 import static org.hamcrest.Matchers.hasSize;
 import static org.junit.Assert.assertThat;
@@ -144,7 +144,7 @@ public void testSegmentIntoSentences_Style_ShouldWork() throws Exception {
         new TEIFormatter(null, null).segmentIntoSentences(currentParagraph, tokens, config, "en");
 
         assertThat(currentParagraph.toXML(),
-            is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s><hi rend=\"bold\">One</hi> <hi rend=\"bold italic\">sentence</hi>  <ref>(Foppiano et al.)</ref>.</s><s>Second sentence <ref>(Lopez et al.)</ref>.</s></p>"));
+            is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s><hi rend=\"bold\">One</hi> <hi rend=\"bold italic\">sentence</hi> <ref>(Foppiano et al.)</ref>.</s><s>Second sentence <ref>(Lopez et al.)</ref>.</s></p>"));
     }
 
     @Test
@@ -527,6 +527,47 @@ public void testExtractStylesList_ignoreBold_shouldWork() throws Exception {
         assertThat(pairs.get(1).getRight().end, is(86));
     }
 
+    /**
+     * Verifies the text stored in the middle element of every triple returned by
+     * {@code TEIFormatter.extractStylesList}: one bold run followed by one italic run
+     * must produce exactly two entries, labelled "bold" and "italic", whose middle
+     * values are the styled strings.
+     *
+     * Deliberately disabled, see the {@code @Ignore} reason. {@code @Test} is declared
+     * as well so that JUnit 4 discovers the method and reports it as ignored; with
+     * {@code @Ignore} alone the method is not recognised as a test at all.
+     */
+    @Test
+    @Ignore("The middle is actually not used")
+    public void testExtractStylesList_checkProducedText_ShouldWork() throws Exception {
+        String text = "I. Introduction  1.1. Généralités et rappels  ";
+        List<LayoutToken> textTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
+
+        // Bold run. NOTE(review): the indices presumably map to "I", "." and "Introduction",
+        // assuming punctuation and each space are emitted as separate tokens - confirm.
+        textTokens.get(0).setBold(true);
+        textTokens.get(1).setBold(true);
+        textTokens.get(3).setBold(true);
+
+        // Italic run. NOTE(review): presumably "1", ".", "1", ".", "Généralités", "et", "rappels"
+        // under the same tokenization assumption - confirm.
+        textTokens.get(6).setItalic(true);
+        textTokens.get(7).setItalic(true);
+        textTokens.get(8).setItalic(true);
+        textTokens.get(9).setItalic(true);
+        textTokens.get(11).setItalic(true);
+        textTokens.get(13).setItalic(true);
+        textTokens.get(15).setItalic(true);
+
+        List<Triple<String, String, OffsetPosition>> pairs = TEIFormatter.extractStylesList(textTokens);
+
+        assertThat(pairs, hasSize(2));
+        assertThat(pairs.get(0).getLeft(), is("bold"));
+        assertThat(pairs.get(0).getMiddle(), is("I. Introduction"));
+        assertThat(pairs.get(1).getLeft(), is("italic"));
+        assertThat(pairs.get(1).getMiddle(), is("1.1. Généralités et rappels"));
+    }
+
     @Test
     public void testGetSectionNumber_simple_ShouldWork() throws Exception {
         String text = "3 Supercon 2";

From 80b98c498da27139f6b5067d3638dc5ac9185021 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Tue, 13 Sep 2022 12:11:09 +0900
Subject: [PATCH 20/23] fix OOBE when applying sentence splitting

---
 .../java/org/grobid/core/document/TEIFormatter.java    | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 671b887834..e3925f7eab 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1581,12 +1581,17 @@ protected Map<Integer, Pair<Node, String>> splitMapNodesOverSentenceSplits(Map<I
         List<Integer> refPositions = mapRefNodes.keySet().stream().sorted().collect(Collectors.toList());
 
         int currentNodeIdx = 0;
+        int previousSentenceOffsetStart = 0;
+        int previousPosInSentence = 0;
         for(int i=0; i<sentencesOffsetPosition.size(); i++) {
             OffsetPosition offsetPosition = sentencesOffsetPosition.get(i);
             int posInSentence = 0;
             int sentenceOffsetStart = offsetPosition.start;
             int sentenceOffsetEnd = offsetPosition.end;
 
+            if (previousSentenceOffsetStart + previousPosInSentence < sentenceOffsetStart) {
+                textAccumulator.append(text, previousSentenceOffsetStart + previousPosInSentence, sentenceOffsetStart);
+            }
             for(int j=currentNodeIdx; j<refPositions.size(); j++) {
                 int refPos = refPositions.get(j);
                 Node currentNode = mapRefNodes.get(refPos).getLeft();
@@ -1639,7 +1644,8 @@ protected Map<Integer, Pair<Node, String>> splitMapNodesOverSentenceSplits(Map<I
                     posInSentence += textChunk.length();
                     currentNodeIdx = j;
                     break;
-                } else if (refPos < sentenceOffsetStart && textAccumulator.length() > refPos
+                } else if (refPos < sentenceOffsetStart
+                    && textAccumulator.length() > refPos
                     && textAccumulator.length() < refPos + currentNodeLength) {
                     //The node is between this sentence and the previous one - trouble again dude
 
@@ -1670,6 +1676,8 @@ protected Map<Integer, Pair<Node, String>> splitMapNodesOverSentenceSplits(Map<I
                     }
                 }
             }
+            previousSentenceOffsetStart = sentenceOffsetStart;
+            previousPosInSentence = posInSentence;
 
             if (sentenceOffsetStart + posInSentence < sentenceOffsetEnd) {
                 textAccumulator.append(text, sentenceOffsetStart + posInSentence, sentencesOffsetPosition.get(i).end);

From d57c82c4ec5d87c84f4f4d67c2ef1e29a2445f20 Mon Sep 17 00:00:00 2001
From: lfoppiano <luca@foppiano.org>
Date: Wed, 17 May 2023 14:47:04 +0900
Subject: [PATCH 21/23] avoid adding styles in head sections

---
 .../java/org/grobid/core/document/TEIFormatter.java    | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 7872eefcb4..8ef3740304 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1321,13 +1321,13 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                     dehyphenized = numb.getLeft();
                     text = LayoutTokensUtil.toText(dehyphenized);
                 }
-                List<Triple<String, String, OffsetPosition>> stylesList = extractStylesList(dehyphenized);
+//                List<Triple<String, String, OffsetPosition>> stylesList = extractStylesList(dehyphenized);
 
-                if (CollectionUtils.isNotEmpty(stylesList)) {
-                    applyStyleList(head, text, stylesList);
-                } else {
+//                if (CollectionUtils.isNotEmpty(stylesList)) {
+//                    applyStyleList(head, text, stylesList);
+//                } else {
                     head.appendChild(StringUtils.normalizeSpace(text.replace("\n", "")));
-                }
+//                }
 
                 if (config.isGenerateTeiIds()) {
                     String divID = KeyGen.getKey().substring(0, 7);

From 9adb8d864198d0724a7ebda57d23f4e2c999ae6e Mon Sep 17 00:00:00 2001
From: lfoppiano <luca@foppiano.org>
Date: Wed, 17 May 2023 15:56:00 +0900
Subject: [PATCH 22/23] fix inconsistency when having notes in the same page

---
 .../grobid/core/document/TEIFormatter.java    | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 8ef3740304..c22c4dff8a 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1398,7 +1398,8 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                 curDiv.appendChild(note);
             } else if (clusterLabel.equals(TaggingLabels.PARAGRAPH)) {
                 List<LayoutToken> clusterTokens = cluster.concatTokens();
-                int clusterPage = Iterables.getLast(clusterTokens).getPage();
+                List<LayoutToken> dehyphenized = LayoutTokensUtil.dehyphenize(clusterTokens);
+                int clusterPage = Iterables.getLast(dehyphenized).getPage();
 
                 List<Note> notesSamePage = null;
                 if (notes != null && notes.size() > 0) {
@@ -1408,7 +1409,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                 }
 
                 if (notesSamePage == null) {
-                    List<LayoutToken> dehyphenized = LayoutTokensUtil.dehyphenize(clusterTokens);
+                    
                     String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " ");
 
                     if (isNewParagraph(lastClusterLabel, curParagraph)) {
@@ -1460,13 +1461,13 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                     List<Pair<String,OffsetPosition>> matchedLabelPosition = new ArrayList<>();
 
                     for (Note note : notesSamePage) {
-                        Optional<LayoutToken> matching = clusterTokens
+                        Optional<LayoutToken> matching = dehyphenized
                             .stream()
                             .filter(t -> t.getText().equals(note.getLabel()) && t.isSuperscript())
                             .findFirst();
 
                         if (matching.isPresent()) {
-                            int idx = clusterTokens.indexOf(matching.get());
+                            int idx = dehyphenized.indexOf(matching.get());
                             note.setIgnored(true);
                             OffsetPosition matchingPosition = new OffsetPosition();
                             matchingPosition.start = idx;
@@ -1490,8 +1491,8 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                         Note note = labels2Notes.get(matching.getLeft());
                         OffsetPosition matchingPosition = matching.getRight();
 
-                        List<LayoutToken> before = clusterTokens.subList(pos, matchingPosition.start);
-                        String clusterContentBefore = LayoutTokensUtil.normalizeDehyphenizeText(before);
+                        List<LayoutToken> before = dehyphenized.subList(pos, matchingPosition.start);
+                        String clusterContentBefore = LayoutTokensUtil.toText(before);
 
                         if (CollectionUtils.isNotEmpty(before) && before.get(0).getText().equals(" ")) {
                             curParagraph.appendChild(new Text(" "));
@@ -1506,7 +1507,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                         }
                         curParagraphTokens.addAll(cluster.concatTokens());
 
-                        List<LayoutToken> calloutTokens = clusterTokens.subList(matchingPosition.start, matchingPosition.end);
+                        List<LayoutToken> calloutTokens = dehyphenized.subList(matchingPosition.start, matchingPosition.end);
 
                         Element ref = teiElement("ref");
                         ref.addAttribute(new Attribute("type", "foot"));
@@ -1526,8 +1527,8 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                     }
 
                     // add last chunk of paragraph stuff (or whole paragraph if no note callout matching)
-                    List<LayoutToken> remaining = clusterTokens.subList(pos, clusterTokens.size());
-                    String remainingClusterContent = LayoutTokensUtil.normalizeDehyphenizeText(remaining);
+                    List<LayoutToken> remaining = dehyphenized.subList(pos, dehyphenized.size());
+                    String remainingClusterContent = LayoutTokensUtil.toText(remaining);
 
                     if (CollectionUtils.isNotEmpty(remaining) && remaining.get(0).getText().equals(" ")) {
                         curParagraph.appendChild(new Text(" "));

From 188cda5841047f9f1307a3cad51bf97418ec9227 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <Foppiano.Luca@nims.go.jp>
Date: Mon, 15 Apr 2024 15:00:40 +0900
Subject: [PATCH 23/23] Merge master into features/add-styles-xml

---
 .../src/main/java/org/grobid/core/document/TEIFormatter.java    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
index 393c6f9531..a8df9f310b 100755
--- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
+++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1206,7 +1206,7 @@ protected List<Note> makeNotes(List<LayoutToken> noteTokens, String footText, No
 
         // add possible subsequent notes concatenated in the same note sequence (this is a common error,
         // which is addressed here by heuristics, it may not be necessary in the future with a better
-        // segmentation model using more foot notes training data)
+        // segmentation model using more footnotes training data)
         if (currentNumber != -1) {
             String nextLabel = " " + (currentNumber+1);
             // sugar characters after note number must be consistent with the previous ones to avoid false match