Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -507,17 +507,18 @@ project(":grobid-trainer") {
]

def complexTrainerTasks = [
"train_header" : ["org.grobid.trainer.HeaderTrainer", ""],
"train_header_article_light" : ["org.grobid.trainer.HeaderTrainer", "article/light"],
"train_header_article_light_ref" : ["org.grobid.trainer.HeaderTrainer", "article/light-ref"],
"train_header_ietf" : ["org.grobid.trainer.HeaderTrainer", "sdo/ietf"],
"train_segmentation" : ["org.grobid.trainer.SegmentationTrainer", ""],
"train_segmentation_article_light" : ["org.grobid.trainer.SegmentationTrainer", "article/light"],
"train_segmentation_article_light_ref": ["org.grobid.trainer.SegmentationTrainer", "article/light-ref"],
"train_segmentation_ietf" : ["org.grobid.trainer.SegmentationTrainer", "sdo/ietf"],
"train_fulltext" : ["org.grobid.trainer.FulltextTrainer", ""],
"train_fulltext_article_light" : ["org.grobid.trainer.FulltextTrainer", "article/light"],
"train_fulltext_article_light_ref" : ["org.grobid.trainer.FulltextTrainer", "article/light-ref"],
"train_header" : ["org.grobid.trainer.HeaderTrainer", ""],
"train_header_article_light" : ["org.grobid.trainer.HeaderTrainer", "article/light"],
"train_header_article_light_ref" : ["org.grobid.trainer.HeaderTrainer", "article/light-ref"],
"train_header_ietf" : ["org.grobid.trainer.HeaderTrainer", "sdo/ietf"],
"train_segmentation" : ["org.grobid.trainer.SegmentationTrainer", ""],
"train_segmentation_article_light" : ["org.grobid.trainer.SegmentationTrainer", "article/light"],
"train_segmentation_article_light_ref" : ["org.grobid.trainer.SegmentationTrainer", "article/light-ref"],
"train_segmentation_article_dh_law_footnotes" : ["org.grobid.trainer.SegmentationTrainer", "article/dh-law-footnotes"],
"train_segmentation_ietf" : ["org.grobid.trainer.SegmentationTrainer", "sdo/ietf"],
"train_fulltext" : ["org.grobid.trainer.FulltextTrainer", ""],
"train_fulltext_article_light" : ["org.grobid.trainer.FulltextTrainer", "article/light"],
"train_fulltext_article_light_ref" : ["org.grobid.trainer.FulltextTrainer", "article/light-ref"],
]

def libraries = ""
Expand Down
66 changes: 66 additions & 0 deletions doc/benchmarks/flavors/article_dh_law/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Benchmarks

## Iteration batches 1, 2, 3
Date: 2025-10-25

### List of files used for training:

- `10.1093$1$gerhis$1$ghaf028.training.segmentation.tei.xml`
- `10.3249$1$1868-1581-2-2-clark.training.segmentation.tei.xml`
- `10.5354$1$rti.v1i1.21339.training.segmentation.tei.xml`
- `10.5354$1$rti.v1i2.25648.training.segmentation.tei.xml`
- `10.5771$1$2699-1284-2024-3-149.training.segmentation.tei.xml`
- `10.5771__2699-1284-2020-1-83.training.segmentation.tei.xml`
- `10.6092__issn.2531-6133__10402.training.segmentation.tei.xml`
- `10.12759$1$hsr.6.1981.3.3-17.training.segmentation.tei.xml`
- `10.12775$1$clr.2013.004.training.segmentation.tei.xml`
- `10.12775$1$clr.2013.008.training.segmentation.tei.xml`
- `10.14276$1$2384-8901$1$448.training.segmentation.tei.xml`
- `10.16995$1$lefou.9.training.segmentation.tei.xml`
- `10.19164__ijple.v6i1.1295.training.segmentation.tei.xml`
- `10.19195$1$0524-4544.337.9.training.segmentation.tei.xml`
- `10.25364%2F01.10%3A2023.2.1.training.segmentation.tei.xml`
- `10.25364%2F01.11%3A2024.1.5.training.segmentation.tei.xml`
- `10.26321$1$a.facchinetti.01.2025.05.training.segmentation.tei.xml`
- `10.26321$1$s.marino.01.2025.06.training.segmentation.tei.xml`
- `10.32361$1$2025170222092.training.segmentation.tei.xml`
- `10.34767$1$dp.2024.02.02.training.segmentation.tei.xml`
- `1296-Article Text-4476-1-10-20221017.training.segmentation.tei.xml`

### List of files used for evaluation:

- `10.1093$1$gerhis$1$ghae045.training.segmentation.tei.xml`
- `10.3249$1$1868-1581-1-2-gutbrod.training.segmentation.tei.xml`
- `10.5771__2699-1284-2020-1-16.training.segmentation.tei.xml`
- `10.6092__issn.2531-6133__6356.training.segmentation.tei.xml`
- `10.12759$1$hsr.3.1978.1.3-10.training.segmentation.tei.xml`
- `10.12775$1$clr.2013.002.training.segmentation.tei.xml`
- `10.14276$1$2384-8901$1$443.training.segmentation.tei.xml`
- `10.19164__ijple.v6i1.1293.training.segmentation.tei.xml`
- `10.19195$1$0524-4544.337.2.training.segmentation.tei.xml`
- `10.32361$1$201810011903.training.segmentation.tei.xml`
- `1294-Article Text-4464-1-10-20221017.training.segmentation.tei.xml`

### Results

```
===== Field-level results =====

label                accuracy     precision    recall       f1           support

<body>               91.57        79.45        82.86        81.12        210
<cover>              99.79        0            0            0            0
<footnote>           98.54        64.29        50           56.25        18
<header>             96.98        36.84        29.17        32.56        24
<headnote>           91.16        59.86        75.22        66.67        113
<page>               96.15        85.71        96.88        90.95        192
<references>         91.05        76.06        82.23        79.02        197
<toc>                99.79        0            0            0            2

all (micro avg.)     95.03        75.61        82.41        78.86        756
all (macro avg.)     95.03        57.46        59.48        58.08        756

===== Instance-level results =====

Total expected instances:   11
Correct instances:          0
Instance-level recall:      0
```
4 changes: 3 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/GrobidModels.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ public enum Flavor {
BLANK("blank"),
ARTICLE_LIGHT("article/light"),
ARTICLE_LIGHT_WITH_REFERENCES("article/light-ref"),
ARTICLE_DH_LAW_FOOTNOTES("article/dh-law-footnotes"),
_3GPP("sdo/3gpp"),
IETF("sdo/ietf");

Expand Down Expand Up @@ -171,7 +172,8 @@ public static GrobidModel getModelFlavor(GrobidModel model, Flavor flavor) {
} else {
GrobidModel grobidModel = modelFor(model.toString() + "/" + flavor.getLabel().toLowerCase());
if (!Files.exists(Paths.get(grobidModel.getModelPath()))) {
LOGGER.info("The requested model flavor " + flavor.getLabel() + " model is not available. Defaulting to the standard model. ");
LOGGER.info("The requested model flavor " + flavor.getLabel() + " model is not available. " +
"Defaulting to the standard model. ");
return model;
} else {
return grobidModel;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,9 @@ else if (tokenizations.get(tokPtr2).t().equals("\n") ||
else {
output = writeField(label, lastTag, tok, "<other>", "", addSpace, addEOL, 2);
if (output != null) {
if (refOpen) {
sb.append("</bibl>");
}
sb.append(output);
refOpen = false;
}
Expand Down
8 changes: 8 additions & 0 deletions grobid-home/config/grobid.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,14 @@ grobid:
max_sequence_length: 3000
batch_size: 10

- name: "segmentation-article-dh-law-footnotes"
engine: "wapiti"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.0000001
window: 50
nbMaxIterations: 2000

- name: "segmentation-article-light"
engine: "wapiti"
wapiti:
Expand Down
Loading
Loading