diff --git a/doc/benchmarks/Benchmarking-biorxiv.md b/doc/benchmarks/Benchmarking-biorxiv.md index 8e60478dba..4c9bd02d9e 100644 --- a/doc/benchmarks/Benchmarking-biorxiv.md +++ b/doc/benchmarks/Benchmarking-biorxiv.md @@ -40,46 +40,46 @@ Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). **Field-level results** -| label | precision | recall | f1 | support | -|-----------------------------|-----------|-----------|----------|---------| -| abstract | 2.42 | 2.36 | 2.39 | 1990 | -| authors | 85.12 | 84.39 | 84.75 | 1999 | -| first_author | 96.92 | 96.19 | 96.56 | 1997 | -| keywords | 58.13 | 59.24 | 58.68 | 839 | -| title | 77.33 | 76.6 | 76.97 | 2000 | -| | | | | | -| **all fields (micro avg.)** | **65** | **64.41** | **64.7** | 8825 | -| all fields (macro avg.) | 63.98 | 63.76 | 63.87 | 8825 | - -#### Soft Matching (ignoring punctuation, case and space characters mismatches) - -**Field-level results** - | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| abstract | 60.57 | 59.2 | 59.87 | 1990 | -| authors | 85.52 | 84.79 | 85.15 | 1999 | -| first_author | 97.12 | 96.39 | 96.76 | 1997 | -| keywords | 63.27 | 64.48 | 63.87 | 839 | -| title | 79.45 | 78.7 | 79.08 | 2000 | +| abstract | 2.42 | 2.36 | 2.39 | 1989 | +| authors | 85.36 | 84.63 | 85 | 1998 | +| first_author | 97.02 | 96.29 | 96.66 | 1996 | +| keywords | 58.08 | 59.19 | 58.63 | 838 | +| title | 77.47 | 76.74 | 77.1 | 1999 | | | | | | | -| **all fields (micro avg.)** | **79.05** | **78.33** | **78.69** | 8825 | -| all fields (macro avg.) | 77.19 | 76.71 | 76.95 | 8825 | +| **all fields (micro avg.)** | **65.1** | **64.51** | **64.81** | 8820 | +| all fields (macro avg.) 
| 64.07 | 63.84 | 63.96 | 8820 | -#### Levenshtein Matching (Minimum Levenshtein distance at 0.8) +#### Soft Matching (ignoring punctuation, case and space characters mismatches) **Field-level results** | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|----------|---------| -| abstract | 80.72 | 78.89 | 79.8 | 1990 | -| authors | 92.63 | 91.85 | 92.24 | 1999 | -| first_author | 97.38 | 96.64 | 97.01 | 1997 | -| keywords | 79.3 | 80.81 | 80.05 | 839 | -| title | 91.97 | 91.1 | 91.53 | 2000 | +| abstract | 60.6 | 59.23 | 59.9 | 1989 | +| authors | 85.76 | 85.04 | 85.4 | 1998 | +| first_author | 97.22 | 96.49 | 96.86 | 1996 | +| keywords | 63.23 | 64.44 | 63.83 | 838 | +| title | 79.6 | 78.84 | 79.22 | 1999 | | | | | | | -| **all fields (micro avg.)** | **89.61** | **88.79** | **89.2** | 8825 | -| all fields (macro avg.) | 88.4 | 87.86 | 88.13 | 8825 | +| **all fields (micro avg.)** | **79.16** | **78.45** | **78.8** | 8820 | +| all fields (macro avg.) | 77.28 | 76.81 | 77.04 | 8820 | + +#### Levenshtein Matching (Minimum Levenshtein distance at 0.8) + +**Field-level results** + +| label | precision | recall | f1 | support | +|-----------------------------|-----------|-----------|-----------|---------| +| abstract | 80.71 | 78.88 | 79.79 | 1989 | +| authors | 92.88 | 92.09 | 92.49 | 1998 | +| first_author | 97.53 | 96.79 | 97.16 | 1996 | +| keywords | 79.27 | 80.79 | 80.02 | 838 | +| title | 92.12 | 91.25 | 91.68 | 1999 | +| | | | | | +| **all fields (micro avg.)** | **89.73** | **88.91** | **89.32** | 8820 | +| all fields (macro avg.) | 88.5 | 87.96 | 88.23 | 8820 | #### Ratcliff/Obershelp Matching (Minimum Ratcliff/Obershelp similarity at 0.95) @@ -87,28 +87,28 @@ Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| abstract | 77.22 | 75.48 | 76.34 | 1990 | -| authors | 88.7 | 87.94 | 88.32 | 1999 | -| first_author | 96.92 | 96.19 | 96.56 | 1997 | -| keywords | 70.99 | 72.35 | 71.66 | 839 | -| title | 87.63 | 86.8 | 87.21 | 2000 | +| abstract | 77.21 | 75.47 | 76.33 | 1989 | +| authors | 88.89 | 88.14 | 88.51 | 1998 | +| first_author | 97.02 | 96.29 | 96.66 | 1996 | +| keywords | 70.96 | 72.32 | 71.63 | 838 | +| title | 87.78 | 86.94 | 87.36 | 1999 | | | | | | | -| **all fields (micro avg.)** | **86.04** | **85.26** | **85.65** | 8825 | -| all fields (macro avg.) | 84.29 | 83.75 | 84.02 | 8825 | +| **all fields (micro avg.)** | **86.13** | **85.35** | **85.74** | 8820 | +| all fields (macro avg.) | 84.37 | 83.83 | 84.1 | 8820 | #### Instance-level results ``` -Total expected instances: 2000 -Total correct instances: 40 (strict) -Total correct instances: 728 (soft) -Total correct instances: 1237 (Levenshtein) -Total correct instances: 1066 (ObservedRatcliffObershelp) - -Instance-level recall: 2 (strict) -Instance-level recall: 36.4 (soft) -Instance-level recall: 61.85 (Levenshtein) -Instance-level recall: 53.3 (RatcliffObershelp) +Total expected instances: 1999 +Total correct instances: 40 (strict) +Total correct instances: 728 (soft) +Total correct instances: 1238 (Levenshtein) +Total correct instances: 1066 (ObservedRatcliffObershelp) + +Instance-level recall: 2 (strict) +Instance-level recall: 36.42 (soft) +Instance-level recall: 61.93 (Levenshtein) +Instance-level recall: 53.33 (RatcliffObershelp) ``` ## Citation metadata @@ -121,20 +121,20 @@ Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| authors | 88.2 | 83.12 | 85.58 | 97183 | -| date | 91.71 | 86.15 | 88.84 | 97630 | +| authors | 88.19 | 83.11 | 85.58 | 97132 | +| date | 91.7 | 86.15 | 88.84 | 97579 | | doi | 70.86 | 83.85 | 76.81 | 16894 | -| first_author | 95.08 | 89.53 | 92.22 | 97183 | -| inTitle | 82.9 | 79.31 | 81.06 | 96430 | +| first_author | 95.08 | 89.53 | 92.22 | 97132 | +| inTitle | 82.88 | 79.3 | 81.05 | 96379 | | issue | 94.35 | 91.93 | 93.13 | 30312 | -| page | 94.99 | 78.22 | 85.79 | 88597 | +| page | 94.99 | 78.22 | 85.8 | 88551 | | pmcid | 66.44 | 86.12 | 75.01 | 807 | | pmid | 69.99 | 84.57 | 76.59 | 2093 | -| title | 84.9 | 83.42 | 84.16 | 92463 | -| volume | 96.27 | 95.07 | 95.66 | 87709 | +| title | 84.89 | 83.41 | 84.14 | 92415 | +| volume | 96.27 | 95.06 | 95.66 | 87661 | | | | | | | -| **all fields (micro avg.)** | **89.87** | **85.21** | **87.48** | 707301 | -| all fields (macro avg.) | 85.06 | 85.57 | 84.99 | 707301 | +| **all fields (micro avg.)** | **89.87** | **85.21** | **87.48** | 706955 | +| all fields (macro avg.) | 85.06 | 85.57 | 84.98 | 706955 | #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -142,20 +142,20 @@ Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| authors | 89.35 | 84.21 | 86.71 | 97183 | -| date | 91.71 | 86.15 | 88.84 | 97630 | +| authors | 89.35 | 84.2 | 86.7 | 97132 | +| date | 91.7 | 86.15 | 88.84 | 97579 | | doi | 75.34 | 89.16 | 81.67 | 16894 | -| first_author | 95.51 | 89.93 | 92.64 | 97183 | -| inTitle | 92.37 | 88.38 | 90.33 | 96430 | +| first_author | 95.51 | 89.93 | 92.64 | 97132 | +| inTitle | 92.37 | 88.37 | 90.33 | 96379 | | issue | 94.35 | 91.93 | 93.13 | 30312 | -| page | 94.99 | 78.22 | 85.79 | 88597 | +| page | 94.99 | 78.22 | 85.8 | 88551 | | pmcid | 75.72 | 98.14 | 85.48 | 807 | | pmid | 74.42 | 89.92 | 81.44 | 2093 | -| title | 93.25 | 91.63 | 92.43 | 92463 | -| volume | 96.27 | 95.07 | 95.66 | 87709 | +| title | 93.24 | 91.61 | 92.42 | 92415 | +| volume | 96.27 | 95.06 | 95.66 | 87661 | | | | | | | -| **all fields (micro avg.)** | **92.69** | **87.88** | **90.22** | 707301 | -| all fields (macro avg.) | 88.48 | 89.34 | 88.56 | 707301 | +| **all fields (micro avg.)** | **92.69** | **87.88** | **90.22** | 706955 | +| all fields (macro avg.) | 88.48 | 89.34 | 88.55 | 706955 | #### Levenshtein Matching (Minimum Levenshtein distance at 0.8) @@ -163,20 +163,20 @@ Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| authors | 94.61 | 89.16 | 91.81 | 97183 | -| date | 91.71 | 86.15 | 88.84 | 97630 | +| authors | 94.61 | 89.16 | 91.8 | 97132 | +| date | 91.7 | 86.15 | 88.84 | 97579 | | doi | 77.58 | 91.81 | 84.1 | 16894 | -| first_author | 95.66 | 90.08 | 92.78 | 97183 | -| inTitle | 93.36 | 89.32 | 91.29 | 96430 | +| first_author | 95.66 | 90.07 | 92.78 | 97132 | +| inTitle | 93.35 | 89.31 | 91.29 | 96379 | | issue | 94.35 | 91.93 | 93.13 | 30312 | -| page | 94.99 | 78.22 | 85.79 | 88597 | +| page | 94.99 | 78.22 | 85.8 | 88551 | | pmcid | 75.72 | 98.14 | 85.48 | 807 | | pmid | 74.42 | 89.92 | 81.44 | 2093 | -| title | 96.08 | 94.41 | 95.24 | 92463 | -| volume | 96.27 | 95.07 | 95.66 | 87709 | +| title | 96.08 | 94.41 | 95.24 | 92415 | +| volume | 96.27 | 95.06 | 95.66 | 87661 | | | | | | | -| **all fields (micro avg.)** | **94.01** | **89.14** | **91.51** | 707301 | -| all fields (macro avg.) | 89.52 | 90.38 | 89.6 | 707301 | +| **all fields (micro avg.)** | **94.01** | **89.13** | **91.51** | 706955 | +| all fields (macro avg.) | 89.52 | 90.38 | 89.6 | 706955 | #### Ratcliff/Obershelp Matching (Minimum Ratcliff/Obershelp similarity at 0.95) @@ -184,73 +184,73 @@ Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| authors | 91.57 | 86.3 | 88.85 | 97183 | -| date | 91.71 | 86.15 | 88.84 | 97630 | +| authors | 91.56 | 86.29 | 88.85 | 97132 | +| date | 91.7 | 86.15 | 88.84 | 97579 | | doi | 76.04 | 89.98 | 82.42 | 16894 | -| first_author | 95.13 | 89.58 | 92.27 | 97183 | -| inTitle | 91.13 | 87.19 | 89.11 | 96430 | +| first_author | 95.13 | 89.58 | 92.27 | 97132 | +| inTitle | 91.12 | 87.18 | 89.1 | 96379 | | issue | 94.35 | 91.93 | 93.13 | 30312 | -| page | 94.99 | 78.22 | 85.79 | 88597 | +| page | 94.99 | 78.22 | 85.8 | 88551 | | pmcid | 66.44 | 86.12 | 75.01 | 807 | | pmid | 69.99 | 84.57 | 76.59 | 2093 | -| title | 95.41 | 93.75 | 94.57 | 92463 | -| volume | 96.27 | 95.07 | 95.66 | 87709 | +| title | 95.41 | 93.74 | 94.57 | 92415 | +| volume | 96.27 | 95.06 | 95.66 | 87661 | | | | | | | -| **all fields (micro avg.)** | **93.05** | **88.22** | **90.57** | 707301 | -| all fields (macro avg.) | 87.55 | 88.08 | 87.48 | 707301 | +| **all fields (micro avg.)** | **93.05** | **88.22** | **90.57** | 706955 | +| all fields (macro avg.) 
| 87.55 | 88.07 | 87.48 | 706955 | #### Instance-level results ``` -Total expected instances: 98799 -Total extracted instances: 97808 -Total correct instances: 43695 (strict) -Total correct instances: 54689 (soft) -Total correct instances: 58863 (Levenshtein) -Total correct instances: 55597 (RatcliffObershelp) +Total expected instances: 98748 +Total extracted instances: 97758 +Total correct instances: 43658 (strict) +Total correct instances: 54646 (soft) +Total correct instances: 58827 (Levenshtein) +Total correct instances: 55558 (RatcliffObershelp) -Instance-level precision: 44.67 (strict) -Instance-level precision: 55.91 (soft) -Instance-level precision: 60.18 (Levenshtein) -Instance-level precision: 56.84 (RatcliffObershelp) +Instance-level precision: 44.66 (strict) +Instance-level precision: 55.9 (soft) +Instance-level precision: 60.18 (Levenshtein) +Instance-level precision: 56.83 (RatcliffObershelp) -Instance-level recall: 44.23 (strict) -Instance-level recall: 55.35 (soft) -Instance-level recall: 59.58 (Levenshtein) -Instance-level recall: 56.27 (RatcliffObershelp) +Instance-level recall: 44.21 (strict) +Instance-level recall: 55.34 (soft) +Instance-level recall: 59.57 (Levenshtein) +Instance-level recall: 56.26 (RatcliffObershelp) -Instance-level f-score: 44.45 (strict) -Instance-level f-score: 55.63 (soft) -Instance-level f-score: 59.88 (Levenshtein) -Instance-level f-score: 56.56 (RatcliffObershelp) +Instance-level f-score: 44.43 (strict) +Instance-level f-score: 55.62 (soft) +Instance-level f-score: 59.87 (Levenshtein) +Instance-level f-score: 56.55 (RatcliffObershelp) -Matching 1 : 79152 +Matching 1 : 79095 -Matching 2 : 4442 +Matching 2 : 4449 -Matching 3 : 4360 +Matching 3 : 4361 -Matching 4 : 2101 +Matching 4 : 2101 -Total matches : 90055 +Total matches : 90006 ``` #### Citation context resolution ``` -Total expected references: 98797 - 49.4 references per article -Total predicted references: 97808 - 48.9 references per article +Total expected 
references: 98746 - 49.37 references per article +Total predicted references: 97758 - 48.88 references per article -Total expected citation contexts: 142862 - 71.43 citation contexts per article -Total predicted citation contexts: 134498 - 67.25 citation contexts per article +Total expected citation contexts: 142776 - 71.39 citation contexts per article +Total predicted citation contexts: 134412 - 67.21 citation contexts per article -Total correct predicted citation contexts: 115971 - 57.99 citation contexts per article -Total wrong predicted citation contexts: 18527 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) +Total correct predicted citation contexts: 115887 - 57.94 citation contexts per article +Total wrong predicted citation contexts: 18525 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) -Precision citation contexts: 86.23 -Recall citation contexts: 81.18 -fscore citation contexts: 83.62 +Precision citation contexts: 86.22 +Recall citation contexts: 81.17 +fscore citation contexts: 83.62 ``` ## Fulltext structures @@ -268,17 +268,17 @@ Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| availability_stmt | 29.61 | 25.56 | 27.44 | 446 | -| figure_title | 4.29 | 2.34 | 3.03 | 22978 | -| funding_stmt | 3.46 | 22.95 | 6.01 | 745 | -| reference_citation | 72.02 | 70.94 | 71.48 | 147470 | -| reference_figure | 70.41 | 77.14 | 73.62 | 47984 | +| availability_stmt | 28.91 | 24.89 | 26.75 | 446 | +| figure_title | 4.3 | 2.34 | 3.03 | 22967 | +| funding_stmt | 3.48 | 23.03 | 6.05 | 747 | +| reference_citation | 72.03 | 70.95 | 71.49 | 147384 | +| reference_figure | 70.38 | 77.12 | 73.6 | 47896 | | reference_table | 45.65 | 86.74 | 59.82 | 5957 | -| section_title | 71.35 | 69.91 | 70.62 | 32398 | +| section_title | 71.34 | 69.92 | 70.62 | 32368 | | table_title | 7.41 | 2.7 | 3.96 | 3925 | | | | | | | -| **all fields (micro avg.)** | **65.48** | **65.06** | **65.27** | 261903 | -| all fields (macro avg.) | 38.02 | 44.79 | 39.5 | 261903 | +| **all fields (micro avg.)** | **65.48** | **65.05** | **65.26** | 261690 | +| all fields (macro avg.) | 37.94 | 44.71 | 39.41 | 261690 | #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -286,26 +286,27 @@ Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| availability_stmt | 50.65 | 43.72 | 46.93 | 446 | -| figure_title | 68.34 | 37.29 | 48.25 | 22978 | -| funding_stmt | 3.68 | 24.43 | 6.39 | 745 | -| reference_citation | 84.34 | 83.08 | 83.7 | 147470 | -| reference_figure | 71.05 | 77.84 | 74.29 | 47984 | +| availability_stmt | 50.78 | 43.72 | 46.99 | 446 | +| figure_title | 68.34 | 37.27 | 48.23 | 22967 | +| funding_stmt | 3.71 | 24.5 | 6.44 | 747 | +| reference_citation | 84.33 | 83.07 | 83.7 | 147384 | +| reference_figure | 71.02 | 77.81 | 74.26 | 47896 | | reference_table | 46.07 | 87.53 | 60.36 | 5957 | -| section_title | 76.91 | 75.37 | 76.13 | 32398 | +| section_title | 76.9 | 75.37 | 76.13 | 32368 | | table_title | 82.8 | 30.17 | 44.22 | 3925 | | | | | | | -| **all fields (micro avg.)** | **76.72** | **76.22** | **76.47** | 261903 | -| all fields (macro avg.) | 60.48 | 57.43 | 55.04 | 261903 | +| **all fields (micro avg.)** | **76.71** | **76.21** | **76.46** | 261690 | +| all fields (macro avg.) | 60.49 | 57.43 | 55.04 | 261690 | **Document-level ratio results** -| label | precision | recall | f1 | support | -|-----------------------------|-----------|-----------|-----------|---------| -| availability_stmt | 84.8 | 86.32 | 85.56 | 446 | -| | | | | | -| **all fields (micro avg.)** | **84.8** | **86.32** | **85.56** | 446 | -| all fields (macro avg.) | 84.8 | 86.32 | 85.56 | 446 | +| label | precision | recall | f1 | support | +|-----------------------------|-----------|----------|-----------|---------| +| availability_stmt | 84.96 | 86.1 | 85.52 | 446 | +| | | | | | +| **all fields (micro avg.)** | **84.96** | **86.1** | **85.52** | 446 | +| all fields (macro avg.) 
| 84.96 | 86.1 | 85.52 | 446 | + +Evaluation metrics produced in 1599.065 seconds -Evaluation metrics produced in 1607.353 seconds diff --git a/doc/benchmarks/Benchmarking-elife.md b/doc/benchmarks/Benchmarking-elife.md index da17249502..b6b4c95a5f 100644 --- a/doc/benchmarks/Benchmarking-elife.md +++ b/doc/benchmarks/Benchmarking-elife.md @@ -42,12 +42,12 @@ Evaluation on 984 random PDF files out of 982 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| | abstract | 9.53 | 9.25 | 9.39 | 984 | -| authors | 74.79 | 73.96 | 74.37 | 983 | -| first_author | 92.59 | 91.65 | 92.12 | 982 | -| title | 86.93 | 85.16 | 86.04 | 984 | +| authors | 57.16 | 56.46 | 56.81 | 983 | +| first_author | 89.39 | 88.39 | 88.89 | 982 | +| title | 83.33 | 81.3 | 82.3 | 984 | | | | | | | -| **all fields (micro avg.)** | **66.17** | **64.99** | **65.57** | 3933 | -| all fields (macro avg.) | 65.96 | 65 | 65.48 | 3933 | +| **all fields (micro avg.)** | **59.99** | **58.84** | **59.41** | 3933 | +| all fields (macro avg.) | 59.85 | 58.85 | 59.35 | 3933 | #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -56,12 +56,12 @@ Evaluation on 984 random PDF files out of 982 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| | abstract | 22.3 | 21.65 | 21.97 | 984 | -| authors | 75.1 | 74.26 | 74.68 | 983 | -| first_author | 92.59 | 91.65 | 92.12 | 982 | -| title | 94.92 | 92.99 | 93.94 | 984 | +| authors | 57.47 | 56.77 | 57.11 | 983 | +| first_author | 89.39 | 88.39 | 88.89 | 982 | +| title | 93.65 | 91.36 | 92.49 | 984 | | | | | | | -| **all fields (micro avg.)** | **71.4** | **70.12** | **70.75** | 3933 | -| all fields (macro avg.) | 71.23 | 70.14 | 70.68 | 3933 | +| **all fields (micro avg.)** | **65.8** | **64.53** | **65.16** | 3933 | +| all fields (macro avg.) 
| 65.7 | 64.54 | 65.12 | 3933 | #### Levenshtein Matching (Minimum Levenshtein distance at 0.8) @@ -70,12 +70,12 @@ Evaluation on 984 random PDF files out of 982 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| | abstract | 47.43 | 46.04 | 46.73 | 984 | -| authors | 88.68 | 87.69 | 88.18 | 983 | -| first_author | 92.9 | 91.96 | 92.43 | 982 | -| title | 96.37 | 94.41 | 95.38 | 984 | +| authors | 83.32 | 82.3 | 82.8 | 983 | +| first_author | 90.01 | 89 | 89.5 | 982 | +| title | 96.15 | 93.8 | 94.96 | 984 | | | | | | | -| **all fields (micro avg.)** | **81.47** | **80.02** | **80.73** | 3933 | -| all fields (macro avg.) | 81.35 | 80.02 | 80.68 | 3933 | +| **all fields (micro avg.)** | **79.31** | **77.78** | **78.54** | 3933 | +| all fields (macro avg.) | 79.23 | 77.78 | 78.5 | 3933 | #### Ratcliff/Obershelp Matching (Minimum Ratcliff/Obershelp similarity at 0.95) @@ -84,26 +84,26 @@ Evaluation on 984 random PDF files out of 982 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| | abstract | 44.5 | 43.19 | 43.84 | 984 | -| authors | 80.35 | 79.45 | 79.9 | 983 | -| first_author | 92.59 | 91.65 | 92.12 | 982 | -| title | 96.37 | 94.41 | 95.38 | 984 | +| authors | 67.97 | 67.14 | 67.55 | 983 | +| first_author | 89.39 | 88.39 | 88.89 | 982 | +| title | 95.73 | 93.39 | 94.55 | 984 | | | | | | | -| **all fields (micro avg.)** | **78.57** | **77.17** | **77.86** | 3933 | -| all fields (macro avg.) | 78.45 | 77.18 | 77.81 | 3933 | +| **all fields (micro avg.)** | **74.46** | **73.02** | **73.74** | 3933 | +| all fields (macro avg.) 
| 74.4 | 73.03 | 73.71 | 3933 | #### Instance-level results ``` -Total expected instances: 984 -Total correct instances: 74 (strict) -Total correct instances: 196 (soft) -Total correct instances: 381 (Levenshtein) -Total correct instances: 338 (ObservedRatcliffObershelp) - -Instance-level recall: 7.52 (strict) -Instance-level recall: 19.92 (soft) -Instance-level recall: 38.72 (Levenshtein) -Instance-level recall: 34.35 (RatcliffObershelp) +Total expected instances: 984 +Total correct instances: 36 (strict) +Total correct instances: 108 (soft) +Total correct instances: 332 (Levenshtein) +Total correct instances: 260 (ObservedRatcliffObershelp) + +Instance-level recall: 3.66 (strict) +Instance-level recall: 10.98 (soft) +Instance-level recall: 33.74 (Levenshtein) +Instance-level recall: 26.42 (RatcliffObershelp) ``` ## Citation metadata @@ -116,17 +116,17 @@ Evaluation on 984 random PDF files out of 982 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| authors | 79.43 | 78.36 | 78.89 | 63265 | -| date | 95.89 | 94.19 | 95.03 | 63662 | -| first_author | 94.83 | 93.51 | 94.17 | 63265 | -| inTitle | 95.82 | 94.87 | 95.34 | 63213 | -| issue | 2 | 75 | 3.9 | 16 | -| page | 96.28 | 95.44 | 95.86 | 53375 | -| title | 90.27 | 90.87 | 90.57 | 62044 | -| volume | 97.88 | 98.39 | 98.14 | 61049 | +| authors | 79.44 | 78.37 | 78.9 | 63265 | +| date | 95.89 | 94.2 | 95.04 | 63662 | +| first_author | 94.82 | 93.51 | 94.16 | 63265 | +| inTitle | 95.81 | 94.88 | 95.34 | 63213 | +| issue | 2.01 | 75 | 3.91 | 16 | +| page | 96.28 | 95.45 | 95.86 | 53375 | +| title | 90.28 | 90.89 | 90.58 | 62044 | +| volume | 97.88 | 98.41 | 98.14 | 61049 | | | | | | | -| **all fields (micro avg.)** | **92.7** | **92.13** | **92.42** | 429889 | -| all fields (macro avg.) | 81.55 | 90.08 | 81.49 | 429889 | +| **all fields (micro avg.)** | **92.7** | **92.14** | **92.42** | 429889 | +| all fields (macro avg.) 
| 81.55 | 90.09 | 81.49 | 429889 | #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -134,17 +134,17 @@ Evaluation on 984 random PDF files out of 982 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| authors | 79.57 | 78.49 | 79.03 | 63265 | -| date | 95.89 | 94.19 | 95.03 | 63662 | -| first_author | 94.91 | 93.59 | 94.25 | 63265 | -| inTitle | 96.29 | 95.35 | 95.82 | 63213 | -| issue | 2 | 75 | 3.9 | 16 | -| page | 96.28 | 95.44 | 95.86 | 53375 | -| title | 95.94 | 96.58 | 96.26 | 62044 | -| volume | 97.88 | 98.39 | 98.14 | 61049 | +| authors | 79.57 | 78.5 | 79.04 | 63265 | +| date | 95.89 | 94.2 | 95.04 | 63662 | +| first_author | 94.9 | 93.59 | 94.24 | 63265 | +| inTitle | 96.29 | 95.36 | 95.82 | 63213 | +| issue | 2.01 | 75 | 3.91 | 16 | +| page | 96.28 | 95.45 | 95.86 | 53375 | +| title | 95.95 | 96.59 | 96.27 | 62044 | +| volume | 97.88 | 98.41 | 98.14 | 61049 | | | | | | | -| **all fields (micro avg.)** | **93.63** | **93.06** | **93.34** | 429889 | -| all fields (macro avg.) | 82.35 | 90.88 | 82.28 | 429889 | +| **all fields (micro avg.)** | **93.63** | **93.07** | **93.35** | 429889 | +| all fields (macro avg.) | 82.35 | 90.89 | 82.29 | 429889 | #### Levenshtein Matching (Minimum Levenshtein distance at 0.8) @@ -152,17 +152,17 @@ Evaluation on 984 random PDF files out of 982 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| authors | 93.33 | 92.07 | 92.7 | 63265 | -| date | 95.89 | 94.19 | 95.03 | 63662 | -| first_author | 95.36 | 94.03 | 94.69 | 63265 | -| inTitle | 96.62 | 95.67 | 96.14 | 63213 | -| issue | 2 | 75 | 3.9 | 16 | -| page | 96.28 | 95.44 | 95.86 | 53375 | +| authors | 93.31 | 92.06 | 92.68 | 63265 | +| date | 95.89 | 94.2 | 95.04 | 63662 | +| first_author | 95.35 | 94.03 | 94.68 | 63265 | +| inTitle | 96.62 | 95.68 | 96.15 | 63213 | +| issue | 2.01 | 75 | 3.91 | 16 | +| page | 96.28 | 95.45 | 95.86 | 53375 | | title | 97.69 | 98.34 | 98.01 | 62044 | -| volume | 97.88 | 98.39 | 98.14 | 61049 | +| volume | 97.88 | 98.41 | 98.14 | 61049 | | | | | | | -| **all fields (micro avg.)** | **96.01** | **95.42** | **95.71** | 429889 | -| all fields (macro avg.) | 84.38 | 92.89 | 84.31 | 429889 | +| **all fields (micro avg.)** | **96.01** | **95.43** | **95.72** | 429889 | +| all fields (macro avg.) | 84.38 | 92.9 | 84.31 | 429889 | #### Ratcliff/Obershelp Matching (Minimum Ratcliff/Obershelp similarity at 0.95) @@ -170,70 +170,70 @@ Evaluation on 984 random PDF files out of 982 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| authors | 86.75 | 85.57 | 86.16 | 63265 | -| date | 95.89 | 94.19 | 95.03 | 63662 | -| first_author | 94.84 | 93.53 | 94.18 | 63265 | -| inTitle | 96.3 | 95.35 | 95.82 | 63213 | -| issue | 2 | 75 | 3.9 | 16 | -| page | 96.28 | 95.44 | 95.86 | 53375 | -| title | 97.54 | 98.18 | 97.86 | 62044 | -| volume | 97.88 | 98.39 | 98.14 | 61049 | +| authors | 86.75 | 85.59 | 86.17 | 63265 | +| date | 95.89 | 94.2 | 95.04 | 63662 | +| first_author | 94.83 | 93.53 | 94.18 | 63265 | +| inTitle | 96.29 | 95.36 | 95.82 | 63213 | +| issue | 2.01 | 75 | 3.91 | 16 | +| page | 96.28 | 95.45 | 95.86 | 53375 | +| title | 97.53 | 98.19 | 97.86 | 62044 | +| volume | 97.88 | 98.41 | 98.14 | 61049 | | | | | | | -| **all fields (micro avg.)** | **94.9** | **94.32** | **94.61** | 429889 | +| **all fields (micro avg.)** | **94.9** | **94.33** | **94.62** | 429889 | | all fields (macro avg.) | 83.43 | 91.96 | 83.37 | 429889 | #### Instance-level results ``` -Total expected instances: 63664 -Total extracted instances: 66152 -Total correct instances: 42401 (strict) -Total correct instances: 45243 (soft) -Total correct instances: 52907 (Levenshtein) -Total correct instances: 49502 (RatcliffObershelp) +Total expected instances: 63664 +Total extracted instances: 66161 +Total correct instances: 42407 (strict) +Total correct instances: 45253 (soft) +Total correct instances: 52913 (Levenshtein) +Total correct instances: 49509 (RatcliffObershelp) -Instance-level precision: 64.1 (strict) -Instance-level precision: 68.39 (soft) -Instance-level precision: 79.98 (Levenshtein) -Instance-level precision: 74.83 (RatcliffObershelp) +Instance-level precision: 64.1 (strict) +Instance-level precision: 68.4 (soft) +Instance-level precision: 79.98 (Levenshtein) +Instance-level precision: 74.83 (RatcliffObershelp) -Instance-level recall: 66.6 (strict) -Instance-level recall: 71.07 (soft) 
-Instance-level recall: 83.1 (Levenshtein) -Instance-level recall: 77.76 (RatcliffObershelp) +Instance-level recall: 66.61 (strict) +Instance-level recall: 71.08 (soft) +Instance-level recall: 83.11 (Levenshtein) +Instance-level recall: 77.77 (RatcliffObershelp) -Instance-level f-score: 65.32 (strict) -Instance-level f-score: 69.7 (soft) -Instance-level f-score: 81.51 (Levenshtein) -Instance-level f-score: 76.26 (RatcliffObershelp) +Instance-level f-score: 65.33 (strict) +Instance-level f-score: 69.71 (soft) +Instance-level f-score: 81.51 (Levenshtein) +Instance-level f-score: 76.27 (RatcliffObershelp) -Matching 1 : 58715 +Matching 1 : 58724 -Matching 2 : 1019 +Matching 2 : 1019 -Matching 3 : 1252 +Matching 3 : 1250 -Matching 4 : 368 +Matching 4 : 367 -Total matches : 61354 +Total matches : 61360 ``` #### Citation context resolution ``` -Total expected references: 63664 - 64.7 references per article -Total predicted references: 66152 - 67.23 references per article +Total expected references: 63664 - 64.7 references per article +Total predicted references: 66161 - 67.24 references per article -Total expected citation contexts: 109022 - 110.79 citation contexts per article -Total predicted citation contexts: 99982 - 101.61 citation contexts per article +Total expected citation contexts: 109022 - 110.79 citation contexts per article +Total predicted citation contexts: 99932 - 101.56 citation contexts per article -Total correct predicted citation contexts: 96212 - 97.78 citation contexts per article -Total wrong predicted citation contexts: 3770 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) +Total correct predicted citation contexts: 96236 - 97.8 citation contexts per article +Total wrong predicted citation contexts: 3696 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. 
in NLM) -Precision citation contexts: 96.23 -Recall citation contexts: 88.25 -fscore citation contexts: 92.07 +Precision citation contexts: 96.3 +Recall citation contexts: 88.27 +fscore citation contexts: 92.11 ``` ## Fulltext structures @@ -251,43 +251,43 @@ Evaluation on 984 random PDF files out of 982 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| availability_stmt | 29.21 | 27.86 | 28.52 | 585 | +| availability_stmt | 29.08 | 27.69 | 28.37 | 585 | | figure_title | 0.07 | 0.02 | 0.03 | 31718 | -| funding_stmt | 6.18 | 29.53 | 10.22 | 921 | -| reference_citation | 57.06 | 55.97 | 56.51 | 108949 | -| reference_figure | 58.42 | 51.02 | 54.47 | 68926 | -| reference_table | 71.83 | 73.46 | 72.63 | 2381 | -| section_title | 82.81 | 77.25 | 79.93 | 21831 | +| funding_stmt | 6.15 | 29.97 | 10.2 | 921 | +| reference_citation | 57.08 | 55.95 | 56.51 | 108949 | +| reference_figure | 58.42 | 51.01 | 54.47 | 68926 | +| reference_table | 71.56 | 73.46 | 72.5 | 2381 | +| section_title | 82.83 | 77.26 | 79.95 | 21831 | | table_title | 0 | 0 | 0 | 1925 | | | | | | | -| **all fields (micro avg.)** | **56.03** | **48.56** | **52.03** | 237236 | -| all fields (macro avg.) | 38.2 | 39.39 | 37.79 | 237236 | +| **all fields (micro avg.)** | **56.02** | **48.55** | **52.02** | 237236 | +| all fields (macro avg.) 
| 38.15 | 39.42 | 37.75 | 237236 | #### Soft Matching (ignoring punctuation, case and space characters mismatches) **Field-level results** -| label | precision | recall | f1 | support | -|-----------------------------|-----------|-----------|----------|---------| -| availability_stmt | 40.68 | 38.8 | 39.72 | 585 | -| figure_title | 49.75 | 16 | 24.21 | 31718 | -| funding_stmt | 6.18 | 29.53 | 10.22 | 921 | -| reference_citation | 93.6 | 91.81 | 92.7 | 108949 | -| reference_figure | 58.71 | 51.27 | 54.73 | 68926 | -| reference_table | 71.91 | 73.54 | 72.72 | 2381 | -| section_title | 83.85 | 78.21 | 80.93 | 21831 | -| table_title | 94.26 | 28.16 | 43.36 | 1925 | -| | | | | | -| **all fields (micro avg.)** | **77.97** | **67.57** | **72.4** | 237236 | -| all fields (macro avg.) | 62.37 | 50.92 | 52.32 | 237236 | +| label | precision | recall | f1 | support | +|-----------------------------|-----------|-----------|-----------|---------| +| availability_stmt | 40.39 | 38.46 | 39.4 | 585 | +| figure_title | 49.72 | 15.98 | 24.19 | 31718 | +| funding_stmt | 6.15 | 29.97 | 10.2 | 921 | +| reference_citation | 93.65 | 91.8 | 92.72 | 108949 | +| reference_figure | 58.7 | 51.26 | 54.73 | 68926 | +| reference_table | 71.64 | 73.54 | 72.58 | 2381 | +| section_title | 83.86 | 78.23 | 80.95 | 21831 | +| table_title | 94.43 | 28.16 | 43.38 | 1925 | +| | | | | | +| **all fields (micro avg.)** | **77.96** | **67.57** | **72.39** | 237236 | +| all fields (macro avg.) | 62.32 | 50.92 | 52.27 | 237236 | **Document-level ratio results** | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| availability_stmt | 96.88 | 95.38 | 96.12 | 585 | +| availability_stmt | 96.87 | 95.21 | 96.03 | 585 | | | | | | | -| **all fields (micro avg.)** | **96.88** | **95.38** | **96.12** | 585 | -| all fields (macro avg.) 
| 96.88 | 95.38 | 96.12 | 585 | +| **all fields (micro avg.)** | **96.87** | **95.21** | **96.03** | 585 | +| all fields (macro avg.) | 96.87 | 95.21 | 96.03 | 585 | -Evaluation metrics produced in 1348.778 seconds +Evaluation metrics produced in 1309.47 seconds diff --git a/doc/benchmarks/Benchmarking-plos.md b/doc/benchmarks/Benchmarking-plos.md index 0bd2c68a3e..17d0f596ae 100644 --- a/doc/benchmarks/Benchmarking-plos.md +++ b/doc/benchmarks/Benchmarking-plos.md @@ -42,28 +42,28 @@ Evaluation on 1000 random PDF files out of 998 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| | abstract | 13.33 | 13.33 | 13.33 | 960 | -| authors | 99.07 | 99.07 | 99.07 | 969 | +| authors | 99.17 | 99.17 | 99.17 | 969 | | first_author | 99.28 | 99.28 | 99.28 | 969 | | keywords | 0 | 0 | 0 | 0 | | title | 95.97 | 95.3 | 95.63 | 1000 | | | | | | | -| **all fields (micro avg.)** | **77.18** | **77.04** | **77.11** | 3898 | -| all fields (macro avg.) | 76.91 | 76.75 | 76.83 | 3898 | +| **all fields (micro avg.)** | **77.2** | **77.07** | **77.13** | 3898 | +| all fields (macro avg.) | 76.94 | 76.77 | 76.86 | 3898 | #### Soft Matching (ignoring punctuation, case and space characters mismatches) **Field-level results** -| label | precision | recall | f1 | support | -|-----------------------------|-----------|-----------|----------|---------| -| abstract | 50.52 | 50.52 | 50.52 | 960 | -| authors | 99.07 | 99.07 | 99.07 | 969 | -| first_author | 99.28 | 99.28 | 99.28 | 969 | -| keywords | 0 | 0 | 0 | 0 | -| title | 99.6 | 98.9 | 99.25 | 1000 | -| | | | | | -| **all fields (micro avg.)** | **87.28** | **87.12** | **87.2** | 3898 | -| all fields (macro avg.) 
| 87.12 | 86.94 | 87.03 | 3898 | +| label | precision | recall | f1 | support | +|-----------------------------|-----------|-----------|-----------|---------| +| abstract | 50.52 | 50.52 | 50.52 | 960 | +| authors | 99.17 | 99.17 | 99.17 | 969 | +| first_author | 99.28 | 99.28 | 99.28 | 969 | +| keywords | 0 | 0 | 0 | 0 | +| title | 99.6 | 98.9 | 99.25 | 1000 | +| | | | | | +| **all fields (micro avg.)** | **87.3** | **87.15** | **87.23** | 3898 | +| all fields (macro avg.) | 87.14 | 86.97 | 87.06 | 3898 | #### Levenshtein Matching (Minimum Levenshtein distance at 0.8) @@ -98,16 +98,16 @@ Evaluation on 1000 random PDF files out of 998 PDF (ratio 1.0). #### Instance-level results ``` -Total expected instances: 1000 -Total correct instances: 142 (strict) -Total correct instances: 491 (soft) -Total correct instances: 729 (Levenshtein) -Total correct instances: 641 (ObservedRatcliffObershelp) - -Instance-level recall: 14.2 (strict) -Instance-level recall: 49.1 (soft) -Instance-level recall: 72.9 (Levenshtein) -Instance-level recall: 64.1 (RatcliffObershelp) +Total expected instances: 1000 +Total correct instances: 142 (strict) +Total correct instances: 491 (soft) +Total correct instances: 729 (Levenshtein) +Total correct instances: 641 (ObservedRatcliffObershelp) + +Instance-level recall: 14.2 (strict) +Instance-level recall: 49.1 (soft) +Instance-level recall: 72.9 (Levenshtein) +Instance-level recall: 64.1 (RatcliffObershelp) ``` ## Citation metadata @@ -189,55 +189,55 @@ Evaluation on 1000 random PDF files out of 998 PDF (ratio 1.0). 
#### Instance-level results ``` -Total expected instances: 48449 -Total extracted instances: 48221 -Total correct instances: 13495 (strict) -Total correct instances: 22265 (soft) -Total correct instances: 24914 (Levenshtein) -Total correct instances: 23267 (RatcliffObershelp) +Total expected instances: 48449 +Total extracted instances: 48221 +Total correct instances: 13495 (strict) +Total correct instances: 22265 (soft) +Total correct instances: 24914 (Levenshtein) +Total correct instances: 23267 (RatcliffObershelp) -Instance-level precision: 27.99 (strict) -Instance-level precision: 46.17 (soft) -Instance-level precision: 51.67 (Levenshtein) -Instance-level precision: 48.25 (RatcliffObershelp) +Instance-level precision: 27.99 (strict) +Instance-level precision: 46.17 (soft) +Instance-level precision: 51.67 (Levenshtein) +Instance-level precision: 48.25 (RatcliffObershelp) -Instance-level recall: 27.85 (strict) -Instance-level recall: 45.96 (soft) -Instance-level recall: 51.42 (Levenshtein) -Instance-level recall: 48.02 (RatcliffObershelp) +Instance-level recall: 27.85 (strict) +Instance-level recall: 45.96 (soft) +Instance-level recall: 51.42 (Levenshtein) +Instance-level recall: 48.02 (RatcliffObershelp) -Instance-level f-score: 27.92 (strict) -Instance-level f-score: 46.06 (soft) -Instance-level f-score: 51.54 (Levenshtein) -Instance-level f-score: 48.14 (RatcliffObershelp) +Instance-level f-score: 27.92 (strict) +Instance-level f-score: 46.06 (soft) +Instance-level f-score: 51.54 (Levenshtein) +Instance-level f-score: 48.14 (RatcliffObershelp) -Matching 1 : 35376 +Matching 1 : 35376 -Matching 2 : 1259 +Matching 2 : 1259 -Matching 3 : 3266 +Matching 3 : 3266 -Matching 4 : 1799 +Matching 4 : 1799 -Total matches : 41700 +Total matches : 41700 ``` #### Citation context resolution ``` -Total expected references: 48449 - 48.45 references per article -Total predicted references: 48221 - 48.22 references per article +Total expected references: 48449 - 48.45 references 
per article +Total predicted references: 48221 - 48.22 references per article -Total expected citation contexts: 69755 - 69.75 citation contexts per article -Total predicted citation contexts: 73164 - 73.16 citation contexts per article +Total expected citation contexts: 69755 - 69.75 citation contexts per article +Total predicted citation contexts: 73164 - 73.16 citation contexts per article -Total correct predicted citation contexts: 56709 - 56.71 citation contexts per article -Total wrong predicted citation contexts: 16455 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) +Total correct predicted citation contexts: 56709 - 56.71 citation contexts per article +Total wrong predicted citation contexts: 16455 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) -Precision citation contexts: 77.51 -Recall citation contexts: 81.3 -fscore citation contexts: 79.36 +Precision citation contexts: 77.51 +Recall citation contexts: 81.3 +fscore citation contexts: 79.36 ``` ## Fulltext structures @@ -255,35 +255,35 @@ Evaluation on 1000 random PDF files out of 998 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| availability_stmt | 54 | 51.99 | 52.98 | 779 | +| availability_stmt | 56.8 | 54.69 | 55.72 | 779 | | figure_title | 0.2 | 0.1 | 0.13 | 8943 | -| funding_stmt | 5.47 | 30.72 | 9.28 | 1507 | +| funding_stmt | 5.37 | 30.19 | 9.12 | 1507 | | reference_citation | 87.96 | 94.35 | 91.04 | 69741 | | reference_figure | 74.18 | 85.72 | 79.53 | 11010 | | reference_table | 70.28 | 94.3 | 80.54 | 5159 | -| section_title | 72.63 | 66.19 | 69.26 | 17540 | +| section_title | 72.62 | 66.18 | 69.25 | 17540 | | table_title | 0 | 0 | 0 | 6092 | | | | | | | -| **all fields (micro avg.)** | **74.06** | **76.67** | **75.34** | 120771 | -| all fields (macro avg.) 
| 45.59 | 52.92 | 47.85 | 120771 | +| **all fields (micro avg.)** | **74.07** | **76.68** | **75.35** | 120771 | +| all fields (macro avg.) | 45.93 | 53.19 | 48.17 | 120771 | #### Soft Matching (ignoring punctuation, case and space characters mismatches) **Field-level results** -| label | precision | recall | f1 | support | -|-----------------------------|-----------|----------|-----------|---------| -| availability_stmt | 79.73 | 76.77 | 78.22 | 779 | -| figure_title | 90.96 | 45.79 | 60.91 | 8943 | -| funding_stmt | 6.99 | 39.28 | 11.87 | 1507 | -| reference_citation | 87.96 | 94.36 | 91.05 | 69741 | -| reference_figure | 74.42 | 86 | 79.8 | 11010 | -| reference_table | 70.44 | 94.51 | 80.72 | 5159 | -| section_title | 78.4 | 71.45 | 74.76 | 17540 | -| table_title | 53.33 | 7.5 | 13.15 | 6092 | -| | | | | | -| **all fields (micro avg.)** | **78.73** | **81.5** | **80.09** | 120771 | -| all fields (macro avg.) | 67.78 | 64.46 | 61.31 | 120771 | +| label | precision | recall | f1 | support | +|-----------------------------|-----------|-----------|-----------|---------| +| availability_stmt | 79.73 | 76.77 | 78.22 | 779 | +| figure_title | 90.96 | 45.79 | 60.91 | 8943 | +| funding_stmt | 6.78 | 38.09 | 11.51 | 1507 | +| reference_citation | 87.96 | 94.36 | 91.05 | 69741 | +| reference_figure | 74.42 | 86 | 79.8 | 11010 | +| reference_table | 70.44 | 94.51 | 80.72 | 5159 | +| section_title | 78.39 | 71.44 | 74.76 | 17540 | +| table_title | 53.33 | 7.5 | 13.15 | 6092 | +| | | | | | +| **all fields (micro avg.)** | **78.71** | **81.48** | **80.07** | 120771 | +| all fields (macro avg.) | 67.75 | 64.31 | 61.26 | 120771 | **Document-level ratio results** @@ -294,6 +294,7 @@ Evaluation on 1000 random PDF files out of 998 PDF (ratio 1.0). | **all fields (micro avg.)** | **100** | **96.28** | **98.1** | 779 | | all fields (macro avg.) 
| 100 | 96.28 | 98.1 | 779 | -Evaluation metrics produced in 795.257 seconds +Evaluation metrics produced in 777.814 seconds + diff --git a/doc/benchmarks/Benchmarking-pmc.md b/doc/benchmarks/Benchmarking-pmc.md index 4f428bd1bc..54abcb53f6 100644 --- a/doc/benchmarks/Benchmarking-pmc.md +++ b/doc/benchmarks/Benchmarking-pmc.md @@ -42,13 +42,13 @@ Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| | abstract | 16.89 | 16.54 | 16.71 | 1911 | -| authors | 92.78 | 92.63 | 92.7 | 1941 | -| first_author | 96.8 | 96.65 | 96.73 | 1941 | +| authors | 90.85 | 90.57 | 90.71 | 1941 | +| first_author | 96.49 | 96.19 | 96.34 | 1941 | | keywords | 65.58 | 63.91 | 64.73 | 1380 | -| title | 84.46 | 84.2 | 84.33 | 1943 | +| title | 83.83 | 83.22 | 83.52 | 1943 | | | | | | | -| **all fields (micro avg.)** | **72.08** | **71.39** | **71.73** | 9116 | -| all fields (macro avg.) | 71.3 | 70.79 | 71.04 | 9116 | +| **all fields (micro avg.)** | **71.44** | **70.65** | **71.04** | 9116 | +| all fields (macro avg.) | 70.73 | 70.09 | 70.4 | 9116 | #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -57,13 +57,13 @@ Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| | abstract | 63.98 | 62.64 | 63.3 | 1911 | -| authors | 94.74 | 94.59 | 94.66 | 1941 | -| first_author | 97.21 | 97.06 | 97.14 | 1941 | +| authors | 92.66 | 92.38 | 92.52 | 1941 | +| first_author | 96.9 | 96.6 | 96.75 | 1941 | | keywords | 74.2 | 72.32 | 73.25 | 1380 | -| title | 92 | 91.71 | 91.86 | 1943 | +| title | 91.45 | 90.79 | 91.12 | 1943 | | | | | | | -| **all fields (micro avg.)** | **85.25** | **84.43** | **84.84** | 9116 | -| all fields (macro avg.) 
| 84.43 | 83.66 | 84.04 | 9116 | +| **all fields (micro avg.)** | **84.6** | **83.67** | **84.13** | 9116 | +| all fields (macro avg.) | 83.84 | 82.94 | 83.39 | 9116 | #### Levenshtein Matching (Minimum Levenshtein distance at 0.8) @@ -72,13 +72,13 @@ Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| | abstract | 90.86 | 88.96 | 89.9 | 1911 | -| authors | 96.65 | 96.5 | 96.57 | 1941 | -| first_author | 97.47 | 97.32 | 97.4 | 1941 | +| authors | 95.66 | 95.36 | 95.51 | 1941 | +| first_author | 97.16 | 96.86 | 97.01 | 1941 | | keywords | 84.61 | 82.46 | 83.52 | 1380 | -| title | 98.24 | 97.94 | 98.09 | 1943 | +| title | 97.77 | 97.07 | 97.42 | 1943 | | | | | | | -| **all fields (micro avg.)** | **94.17** | **93.28** | **93.72** | 9116 | -| all fields (macro avg.) | 93.57 | 92.64 | 93.1 | 9116 | +| **all fields (micro avg.)** | **93.79** | **92.75** | **93.27** | 9116 | +| all fields (macro avg.) | 93.21 | 92.14 | 92.67 | 9116 | #### Ratcliff/Obershelp Matching (Minimum Ratcliff/Obershelp similarity at 0.95) @@ -87,27 +87,27 @@ Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| | abstract | 87.07 | 85.24 | 86.14 | 1911 | -| authors | 95.67 | 95.52 | 95.59 | 1941 | -| first_author | 96.8 | 96.65 | 96.73 | 1941 | +| authors | 94.11 | 93.82 | 93.96 | 1941 | +| first_author | 96.49 | 96.19 | 96.34 | 1941 | | keywords | 79.93 | 77.9 | 78.9 | 1380 | -| title | 96.18 | 95.88 | 96.03 | 1943 | +| title | 95.75 | 95.06 | 95.4 | 1943 | | | | | | | -| **all fields (micro avg.)** | **91.89** | **91.02** | **91.45** | 9116 | -| all fields (macro avg.) | 91.13 | 90.24 | 90.68 | 9116 | +| **all fields (micro avg.)** | **91.39** | **90.38** | **90.88** | 9116 | +| all fields (macro avg.) 
| 90.67 | 89.64 | 90.15 | 9116 | #### Instance-level results ``` -Total expected instances: 1943 -Total correct instances: 216 (strict) -Total correct instances: 906 (soft) -Total correct instances: 1445 (Levenshtein) -Total correct instances: 1297 (ObservedRatcliffObershelp) - -Instance-level recall: 11.12 (strict) -Instance-level recall: 46.63 (soft) -Instance-level recall: 74.37 (Levenshtein) -Instance-level recall: 66.75 (RatcliffObershelp) +Total expected instances: 1943 +Total correct instances: 215 (strict) +Total correct instances: 888 (soft) +Total correct instances: 1421 (Levenshtein) +Total correct instances: 1272 (ObservedRatcliffObershelp) + +Instance-level recall: 11.07 (strict) +Instance-level recall: 45.7 (soft) +Instance-level recall: 73.13 (Levenshtein) +Instance-level recall: 65.47 (RatcliffObershelp) ``` ## Citation metadata @@ -120,17 +120,17 @@ Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| authors | 83.11 | 75.94 | 79.36 | 85778 | -| date | 94.69 | 83.83 | 88.93 | 87067 | -| first_author | 89.85 | 82.09 | 85.8 | 85778 | -| inTitle | 73.27 | 71.45 | 72.35 | 81007 | +| authors | 83.1 | 75.94 | 79.36 | 85778 | +| date | 94.7 | 83.83 | 88.93 | 87067 | +| first_author | 89.85 | 82.09 | 85.79 | 85778 | +| inTitle | 73.27 | 71.46 | 72.35 | 81007 | | issue | 91.43 | 87.44 | 89.39 | 16635 | -| page | 94.68 | 83.31 | 88.63 | 80501 | -| title | 79.78 | 74.95 | 77.29 | 80736 | +| page | 94.69 | 83.31 | 88.63 | 80501 | +| title | 79.78 | 74.96 | 77.29 | 80736 | | volume | 96.17 | 89.37 | 92.64 | 80067 | | | | | | | | **all fields (micro avg.)** | **87.32** | **80.34** | **83.69** | 597569 | -| all fields (macro avg.) | 87.87 | 81.05 | 84.3 | 597569 | +| all fields (macro avg.) 
| 87.88 | 81.05 | 84.3 | 597569 | #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -138,13 +138,13 @@ Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| authors | 83.58 | 76.37 | 79.81 | 85778 | -| date | 94.69 | 83.83 | 88.93 | 87067 | +| authors | 83.57 | 76.37 | 79.81 | 85778 | +| date | 94.7 | 83.83 | 88.93 | 87067 | | first_author | 90.02 | 82.24 | 85.96 | 85778 | -| inTitle | 85.03 | 82.92 | 83.97 | 81007 | +| inTitle | 85.04 | 82.93 | 83.97 | 81007 | | issue | 91.43 | 87.44 | 89.39 | 16635 | -| page | 94.68 | 83.31 | 88.63 | 80501 | -| title | 91.55 | 86.01 | 88.69 | 80736 | +| page | 94.69 | 83.31 | 88.63 | 80501 | +| title | 91.55 | 86.01 | 88.7 | 80736 | | volume | 96.17 | 89.37 | 92.64 | 80067 | | | | | | | | **all fields (micro avg.)** | **90.73** | **83.48** | **86.95** | 597569 | @@ -156,16 +156,16 @@ Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| authors | 89.29 | 81.59 | 85.27 | 85778 | -| date | 94.69 | 83.83 | 88.93 | 87067 | +| authors | 89.28 | 81.59 | 85.26 | 85778 | +| date | 94.7 | 83.83 | 88.93 | 87067 | | first_author | 90.24 | 82.44 | 86.17 | 85778 | | inTitle | 86.28 | 84.14 | 85.2 | 81007 | | issue | 91.43 | 87.44 | 89.39 | 16635 | -| page | 94.68 | 83.31 | 88.63 | 80501 | -| title | 93.9 | 88.22 | 90.97 | 80736 | +| page | 94.69 | 83.31 | 88.63 | 80501 | +| title | 93.91 | 88.22 | 90.98 | 80736 | | volume | 96.17 | 89.37 | 92.64 | 80067 | | | | | | | -| **all fields (micro avg.)** | **92.07** | **84.72** | **88.24** | 597569 | +| **all fields (micro avg.)** | **92.08** | **84.72** | **88.24** | 597569 | | all fields (macro avg.) 
| 92.09 | 85.04 | 88.4 | 597569 | #### Ratcliff/Obershelp Matching (Minimum Ratcliff/Obershelp similarity at 0.95) @@ -174,13 +174,13 @@ Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| authors | 86.05 | 78.63 | 82.18 | 85778 | -| date | 94.69 | 83.83 | 88.93 | 87067 | +| authors | 86.05 | 78.63 | 82.17 | 85778 | +| date | 94.7 | 83.83 | 88.93 | 87067 | | first_author | 89.87 | 82.1 | 85.81 | 85778 | -| inTitle | 83.59 | 81.52 | 82.55 | 81007 | +| inTitle | 83.6 | 81.53 | 82.55 | 81007 | | issue | 91.43 | 87.44 | 89.39 | 16635 | -| page | 94.68 | 83.31 | 88.63 | 80501 | -| title | 93.5 | 87.84 | 90.58 | 80736 | +| page | 94.69 | 83.31 | 88.63 | 80501 | +| title | 93.51 | 87.85 | 90.59 | 80736 | | volume | 96.17 | 89.37 | 92.64 | 80067 | | | | | | | | **all fields (micro avg.)** | **91.12** | **83.84** | **87.33** | 597569 | @@ -189,55 +189,55 @@ Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). 
#### Instance-level results ``` -Total expected instances: 90125 -Total extracted instances: 85141 -Total correct instances: 38534 (strict) -Total correct instances: 50633 (soft) -Total correct instances: 55471 (Levenshtein) -Total correct instances: 52032 (RatcliffObershelp) +Total expected instances: 90125 +Total extracted instances: 85138 +Total correct instances: 38530 (strict) +Total correct instances: 50629 (soft) +Total correct instances: 55467 (Levenshtein) +Total correct instances: 52029 (RatcliffObershelp) -Instance-level precision: 45.26 (strict) -Instance-level precision: 59.47 (soft) -Instance-level precision: 65.15 (Levenshtein) -Instance-level precision: 61.11 (RatcliffObershelp) +Instance-level precision: 45.26 (strict) +Instance-level precision: 59.47 (soft) +Instance-level precision: 65.15 (Levenshtein) +Instance-level precision: 61.11 (RatcliffObershelp) -Instance-level recall: 42.76 (strict) -Instance-level recall: 56.18 (soft) -Instance-level recall: 61.55 (Levenshtein) -Instance-level recall: 57.73 (RatcliffObershelp) +Instance-level recall: 42.75 (strict) +Instance-level recall: 56.18 (soft) +Instance-level recall: 61.54 (Levenshtein) +Instance-level recall: 57.73 (RatcliffObershelp) -Instance-level f-score: 43.97 (strict) -Instance-level f-score: 57.78 (soft) -Instance-level f-score: 63.3 (Levenshtein) -Instance-level f-score: 59.37 (RatcliffObershelp) +Instance-level f-score: 43.97 (strict) +Instance-level f-score: 57.77 (soft) +Instance-level f-score: 63.3 (Levenshtein) +Instance-level f-score: 59.37 (RatcliffObershelp) -Matching 1 : 67991 +Matching 1 : 67992 -Matching 2 : 4123 +Matching 2 : 4122 -Matching 3 : 1868 +Matching 3 : 1870 -Matching 4 : 661 +Matching 4 : 663 -Total matches : 74643 +Total matches : 74647 ``` #### Citation context resolution ``` -Total expected references: 90125 - 46.38 references per article -Total predicted references: 85141 - 43.82 references per article +Total expected references: 90125 - 46.38 references per 
article +Total predicted references: 85138 - 43.82 references per article -Total expected citation contexts: 139835 - 71.97 citation contexts per article -Total predicted citation contexts: 114496 - 58.93 citation contexts per article +Total expected citation contexts: 139835 - 71.97 citation contexts per article +Total predicted citation contexts: 114503 - 58.93 citation contexts per article -Total correct predicted citation contexts: 96976 - 49.91 citation contexts per article -Total wrong predicted citation contexts: 17520 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) +Total correct predicted citation contexts: 96979 - 49.91 citation contexts per article +Total wrong predicted citation contexts: 17524 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) -Precision citation contexts: 84.7 -Recall citation contexts: 69.35 -fscore citation contexts: 76.26 +Precision citation contexts: 84.7 +Recall citation contexts: 69.35 +fscore citation contexts: 76.26 ``` ## Fulltext structures @@ -255,14 +255,14 @@ Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |-----------------------------|-----------|----------|-----------|---------| -| figure_title | 31.53 | 26.55 | 28.82 | 7281 | -| reference_citation | 58.14 | 58.76 | 58.45 | 134196 | -| reference_figure | 60.59 | 68.27 | 64.2 | 19330 | +| figure_title | 31.52 | 26.53 | 28.81 | 7281 | +| reference_citation | 58.13 | 58.76 | 58.44 | 134196 | +| reference_figure | 60.6 | 68.27 | 64.21 | 19330 | | reference_table | 82.87 | 89.52 | 86.06 | 7327 | -| section_title | 73.58 | 67.75 | 70.55 | 27619 | +| section_title | 73.59 | 67.77 | 70.56 | 27619 | | table_title | 67.76 | 49.58 | 57.26 | 3971 | | | | | | | -| **all fields (micro avg.)** | **60.68** | **60.7** | **60.69** | 199724 | +| **all fields (micro avg.)** | **60.67** | **60.7** | **60.69** | 199724 | | all fields (macro avg.) | 62.41 | 60.07 | 60.89 | 199724 | #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -271,11 +271,11 @@ Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). | label | precision | recall | f1 | support | |-----------------------------|-----------|-----------|-----------|---------| -| figure_title | 79.55 | 66.98 | 72.73 | 7281 | -| reference_citation | 62.42 | 63.09 | 62.75 | 134196 | -| reference_figure | 61.09 | 68.84 | 64.73 | 19330 | +| figure_title | 79.54 | 66.97 | 72.72 | 7281 | +| reference_citation | 62.41 | 63.08 | 62.75 | 134196 | +| reference_figure | 61.1 | 68.84 | 64.74 | 19330 | | reference_table | 83.04 | 89.71 | 86.25 | 7327 | -| section_title | 79.09 | 72.82 | 75.83 | 27619 | +| section_title | 79.09 | 72.84 | 75.84 | 27619 | | table_title | 94.22 | 68.95 | 79.63 | 3971 | | | | | | | | **all fields (micro avg.)** | **66.2** | **66.22** | **66.21** | 199724 | @@ -289,4 +289,5 @@ Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). | **all fields (micro avg.)** | **0** | **0** | **0** | 0 | | all fields (macro avg.) 
| 0 | 0 | 0 | 0 | -Evaluation metrics produced in 1311.519 seconds +Evaluation metrics produced in 1247.994 seconds + diff --git a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java index 3071e7c131..57b97bfbd2 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java +++ b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java @@ -1025,6 +1025,42 @@ public void setDOI(String id) { this.doi = cleanDOI(id); } + /** + * Updates the DOI only if the new DOI is a more complete version of the current + * one. + * The new DOI must START WITH the existing DOI and be longer, with the + * extension + * starting with a valid DOI separator (hyphen, slash, or dot). + * This ensures we only accept true extensions (e.g., completing a truncated + * DOI) + * and reject garbage appends like page numbers. + * + * @param newDoi the candidate DOI to potentially use as replacement + */ + public void updateDOIIfLonger(String newDoi) { + if (newDoi == null) { + return; + } + String cleanedNewDoi = cleanDOI(newDoi); + + // If no existing DOI, accept the new one + if (StringUtils.isBlank(this.doi)) { + this.doi = cleanedNewDoi; + return; + } + + // Only replace if the new DOI is an extension of the existing one + if (cleanedNewDoi.startsWith(this.doi) && cleanedNewDoi.length() > this.doi.length()) { + // Check that the extension starts with a valid DOI separator + // Valid DOI suffixes typically start with: - / . 
+ // This rejects garbage like "1of12" being appended + char extensionStart = cleanedNewDoi.charAt(this.doi.length()); + if (extensionStart == '-' || extensionStart == '/' || extensionStart == '.') { + this.doi = cleanedNewDoi; + } + } + } + public void setInDOI(String id) { if (id != null) { inDOI = StringUtils.normalizeSpace(id); @@ -1073,12 +1109,9 @@ public static String cleanDOI(String doi) { doi = doi.replaceAll("[\\p{M}]", ""); doi = doi.replaceAll("\\p{InCombiningDiacriticalMarks}+", ""); - // remove possible starting/trailing parenthesis - if (doi.startsWith("(") || doi.startsWith("[") || doi.startsWith("⟨")) - doi = doi.substring(1); - - if (doi.endsWith(")") || doi.endsWith("]") || doi.endsWith("⟩")) - doi = doi.substring(0,doi.length()-1); + // remove possible starting/trailing parenthesis, punctuations + doi = StringUtils.stripStart(doi, "([{"); + doi = StringUtils.stripEnd(doi, ")]}."); return doi; } diff --git a/grobid-core/src/main/java/org/grobid/core/document/Document.java b/grobid-core/src/main/java/org/grobid/core/document/Document.java index 389469c6f6..e4ee710c57 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/Document.java +++ b/grobid-core/src/main/java/org/grobid/core/document/Document.java @@ -11,6 +11,7 @@ import com.google.common.collect.SortedSetMultimap; import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.grobid.core.analyzers.Analyzer; import org.grobid.core.analyzers.GrobidAnalyzer; import org.grobid.core.data.*; @@ -636,35 +637,35 @@ public String getAllBlocksClean(int toIgnore1, int toIgnore2) { } /* - * Try to match a DOI in the first page, independently from any preliminar + * Try to match a DOI in the first page, independently of any preliminary * segmentation. This can be useful for improving the chance to find a DOI * in headers or footnotes. 
*/ public List getDOIMatches() { - List results = new ArrayList(); + List results = new ArrayList<>(); List pages = getPages(); - int p = 0; - for (Page page : pages) { - if ((page.getBlocks() != null) && (page.getBlocks().size() > 0)) { - for (int blockIndex = 0; blockIndex < page.getBlocks().size(); blockIndex++) { - Block block = page.getBlocks().get(blockIndex); - String localText = block.getText(); - if ((localText != null) && (localText.length() > 0)) { - localText = localText.trim(); - Matcher DOIMatcher = TextUtilities.DOIPattern.matcher(localText); - while (DOIMatcher.find()) { - String theDOI = DOIMatcher.group(); - if (!results.contains(theDOI)) { - results.add(theDOI); - } + if (CollectionUtils.isEmpty(pages)) { + return results; + } + + Page firstPage = pages.getFirst(); + if (CollectionUtils.isNotEmpty(firstPage.getBlocks())) { + for (int blockIndex = 0; blockIndex < firstPage.getBlocks().size(); blockIndex++) { + Block block = firstPage.getBlocks().get(blockIndex); + String localText = block.getText(); + if (StringUtils.isNotBlank(localText)) { + localText = localText.trim(); + Matcher DOIMatcher = TextUtilities.DOIPattern.matcher(localText); + while (DOIMatcher.find()) { + String theDOI = DOIMatcher.group(); + if (!results.contains(theDOI)) { + results.add(theDOI); } } } } - if (p > 1) - break; - p++; } + return results; } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java index 14b76a04a3..a6e80e557b 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java @@ -296,7 +296,10 @@ public String processingHeaderSection( // DOI pass List dois = doc.getDOIMatches(); if (isNotEmpty(dois) && dois.size() == 1) { - resHeader.setDOI(dois.get(0)); + // Only update header DOI when the page-found DOI is longer/likely more complete + if (resHeader != null) { + 
resHeader.updateDOIIfLonger(dois.getFirst()); + } } // normalization of dates diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java index 66d2e4ea45..74efa582f7 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java @@ -55,8 +55,10 @@ public class TextUtilities { static public final Pattern ORCIDPattern = Pattern.compile(ORCIDRegex); // the magical DOI regular expression... + // This pattern matches a DOI beginning with 10./ and ensures the final character is not a + // trailing quote or punctuation like ", ', comma, semicolon, period, colon, etc. Internal periods are allowed. static public final Pattern DOIPattern = Pattern - .compile("(10\\.\\d{4,5}\\/[\\S]+[^;,.\\s])"); + .compile("(10\\.\\d{4,5}/\\S+[^\\s\"'“”‘’`,.;:!?\\)\\]\\}>])"); // a regular expression for arXiv identifiers // see https://arxiv.org/help/arxiv_identifier and https://arxiv.org/help/arxiv_identifier_for_services @@ -84,7 +86,7 @@ public class TextUtilities { // TODO: maybe find a better regex (better == more robust, not more "standard") static public final Pattern emailPattern = Pattern.compile("\\w+((\\.|-|_|,)\\w+)?\\s?((\\.|-|_|,)\\w+)?\\s?@\\s?\\w+(\\s?(\\.|-)\\s?\\w+)+"); // variant: \w+(\s?(\.|-|_|,)\w+)?(\s?(\.|-|_|,)\w+)?\s?@\s?\w+(\s?(\.|\-)\s?\w+)+ - + /** * Replace numbers in the string by a dummy character for string distance evaluations * diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/DOIPatternEdgeCasesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/DOIPatternEdgeCasesTest.java new file mode 100644 index 0000000000..4a85619bad --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/utilities/DOIPatternEdgeCasesTest.java @@ -0,0 +1,107 @@ +package org.grobid.core.utilities; + +import org.grobid.core.analyzers.GrobidAnalyzer; +import 
org.grobid.core.layout.LayoutToken;
+import org.grobid.core.lexicon.Lexicon;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.hasSize;
+
+public class DOIPatternEdgeCasesTest {
+
+    private Lexicon lexicon;
+
+    @Before
+    public void setUp() {
+        lexicon = Lexicon.getInstance();
+    }
+
+    private String extractByTokenOffsets(List<LayoutToken> tokens, OffsetPosition pos) {
+        // positions from tokenPositionsDOIPattern are token-index based and inclusive for end
+        List<LayoutToken> sub = tokens.subList(pos.start, pos.end + 1);
+        return LayoutTokensUtil.toText(sub).trim();
+    }
+
+    @Test
+    public void testDOIQuotedDoubleStraight() {
+        String piece = "\"10.1000/xyz123\" and text"; // DOI wrapped in straight double quotes
+        List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(piece);
+        String text = LayoutTokensUtil.toText(tokens);
+        List<OffsetPosition> positions = lexicon.tokenPositionsDOIPattern(tokens, text);
+        assertThat(positions, hasSize(1));
+        String doi = extractByTokenOffsets(tokens, positions.get(0));
+        assertThat(doi, is("10.1000/xyz123"));
+    }
+
+    @Test
+    public void testDOIQuotedSmart() {
+        String piece = "“10.1000/xyz-ABC” more"; // DOI wrapped in curly double quotes
+        List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(piece);
+        String text = LayoutTokensUtil.toText(tokens);
+        List<OffsetPosition> positions = lexicon.tokenPositionsDOIPattern(tokens, text);
+        assertThat(positions, hasSize(1));
+        String doi = extractByTokenOffsets(tokens, positions.get(0));
+        assertThat(doi, is("10.1000/xyz-ABC"));
+    }
+
+    @Test
+    public void testDOIWithTrailingPeriod() {
+        String piece = "See 10.5555/abc.def."; // final period is expected to be dropped, inner period kept
+        List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(piece);
+        String text = LayoutTokensUtil.toText(tokens);
+        List<OffsetPosition> positions = lexicon.tokenPositionsDOIPattern(tokens, text);
+        assertThat(positions, hasSize(1));
+        String doi = extractByTokenOffsets(tokens, positions.get(0));
+        assertThat(doi, is("10.5555/abc.def"));
+    }
+
+    @Test
+    public void testDOIWithinParenthesesAndPeriod() {
+        String piece = "(10.12345/ABC_123)."; // closing parenthesis and period are expected to be dropped
+        List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(piece);
+        String text = LayoutTokensUtil.toText(tokens);
+        List<OffsetPosition> positions = lexicon.tokenPositionsDOIPattern(tokens, text);
+        assertThat(positions, hasSize(1));
+        String doi = extractByTokenOffsets(tokens, positions.get(0));
+        assertThat(doi, is("10.12345/ABC_123"));
+    }
+
+    @Test
+    public void testDOIPrefixWithColon() {
+        String piece = "doi:10.1000/test-1"; // the "doi:" label is not part of the expected match
+        List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(piece);
+        String text = LayoutTokensUtil.toText(tokens);
+        List<OffsetPosition> positions = lexicon.tokenPositionsDOIPattern(tokens, text);
+        assertThat(positions, hasSize(1));
+        String doi = extractByTokenOffsets(tokens, positions.get(0));
+        assertThat(doi, is("10.1000/test-1"));
+    }
+
+    @Test
+    public void testDOIFromDoiOrgURL() {
+        String piece = "https://doi.org/10.1109/TSE.2019.1234567, referenced"; // resolver URL followed by a comma
+        List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(piece);
+        String text = LayoutTokensUtil.toText(tokens);
+        List<OffsetPosition> positions = lexicon.tokenPositionsDOIPattern(tokens, text);
+        assertThat(positions, hasSize(1));
+        String doi = extractByTokenOffsets(tokens, positions.get(0));
+        assertThat(doi, is("10.1109/TSE.2019.1234567"));
+    }
+
+    @Test
+    public void testDOIWithTrailingCommaAndQuote() {
+        String piece = "10.1000/xyz-1’, another"; // right single quote
+        List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(piece);
+        String text = LayoutTokensUtil.toText(tokens);
+        List<OffsetPosition> positions = lexicon.tokenPositionsDOIPattern(tokens, text);
+        assertThat(positions, hasSize(1));
+        String doi = extractByTokenOffsets(tokens, positions.get(0));
+        assertThat(doi, is("10.1000/xyz-1"));
+    }
+}
+
diff --git a/grobid-core/src/test/kotlin/org/grobid/core/data/BiblioItemUpdateDOITest.kt
b/grobid-core/src/test/kotlin/org/grobid/core/data/BiblioItemUpdateDOITest.kt
new file mode 100644
index 0000000000..bfd2fa5697
--- /dev/null
+++ b/grobid-core/src/test/kotlin/org/grobid/core/data/BiblioItemUpdateDOITest.kt
@@ -0,0 +1,118 @@
+package org.grobid.core.data
+
+import org.grobid.core.main.LibraryLoader
+import org.hamcrest.MatcherAssert.assertThat
+import org.hamcrest.Matchers.equalTo
+import org.junit.Before
+import org.junit.Test
+
+/**
+ * Unit tests for [BiblioItem.updateDOIIfLonger].
+ *
+ * Expected contract pinned by the cases below: a stored DOI is replaced only by a candidate
+ * that extends it (but not when the extra characters are page-number garbage such as `1of12`);
+ * a missing DOI is filled in; `doi:` and `https://doi.org/` prefixes are stripped from the
+ * candidate; shorter, unrelated, identical or `null` candidates leave the stored DOI unchanged.
+ */
+class BiblioItemUpdateDOITest {
+    @Before
+    @Throws(Exception::class)
+    fun setUp() {
+        LibraryLoader.load()
+    }
+
+    @Test
+    fun testUpdateDOI_newExtends_shouldReplace() {
+        // Case: truncated DOI gets completed (new DOI starts with old and is longer)
+        val dest = BiblioItem()
+        dest.setDOI("10.1007/s13280-020-01405")
+
+        dest.updateDOIIfLonger("10.1007/s13280-020-01405-w")
+
+        assertThat(dest.getDOI(), equalTo("10.1007/s13280-020-01405-w"))
+    }
+
+    @Test
+    fun testUpdateDOI_garbageAppended_shouldNotReplace() {
+        // Case: DOI with page numbers concatenated - should be rejected.
+        // Note that "10.1073/pnas.22211031201of12" DOES start with the stored
+        // "10.1073/pnas.2221103120" (it is that DOI followed by "1of12"), so a plain prefix-and-length
+        // check would accept it. NOTE(review): confirm updateDOIIfLonger has a guard for this.
+        val dest = BiblioItem()
+        dest.setDOI("10.1073/pnas.2221103120")
+
+        dest.updateDOIIfLonger("10.1073/pnas.22211031201of12")
+
+        assertThat(dest.getDOI(), equalTo("10.1073/pnas.2221103120"))
+    }
+
+    @Test
+    fun testUpdateDOI_headerEmpty_shouldReplace() {
+        // Case: no DOI stored yet - the candidate is taken, with its "doi:" label stripped
+        val dest = BiblioItem()
+        dest.setDOI(null)
+        val found = "doi:10.1109/5.771073"
+
+        dest.updateDOIIfLonger(found)
+
+        assertThat(dest.getDOI(), equalTo("10.1109/5.771073"))
+    }
+
+    @Test
+    fun testUpdateDOI_headerLonger_shouldNotReplace() {
+        // New DOI is shorter - should not replace
+        val dest = BiblioItem()
+        dest.setDOI("10.1000/valid.long.doi.suffix")
+        val found = "10.1000/valid.lo" // truncated
+
+        dest.updateDOIIfLonger(found)
+
+        assertThat(dest.getDOI(), equalTo("10.1000/valid.long.doi.suffix"))
+    }
+
+    @Test
+    fun testUpdateDOI_differentDOI_shouldNotReplace() {
+        // New DOI doesn't start with old - should not replace
+        val dest = BiblioItem()
+        dest.setDOI("10.1000/short")
+        val found = "10.1000/longer.suffix/with/more"
+
+        dest.updateDOIIfLonger(found)
+
+        // Should NOT replace because new doesn't start with old
+        assertThat(dest.getDOI(), equalTo("10.1000/short"))
+    }
+
+    @Test
+    fun testUpdateDOI_sameDOI_shouldNotChange() {
+        // Case: candidate is identical to the stored DOI - value stays as it is
+        val dest = BiblioItem()
+        dest.setDOI("10.1007/s13280-020-01405-w")
+
+        dest.updateDOIIfLonger("10.1007/s13280-020-01405-w")
+
+        assertThat(dest.getDOI(), equalTo("10.1007/s13280-020-01405-w"))
+    }
+
+    @Test
+    fun testUpdateDOI_cleansPrefixBeforeComparison() {
+        val dest = BiblioItem()
+        dest.setDOI("10.1007/s13280-020-01405")
+
+        // New DOI has https prefix that should be cleaned
+        dest.updateDOIIfLonger("https://doi.org/10.1007/s13280-020-01405-w")
+
+        assertThat(dest.getDOI(), equalTo("10.1007/s13280-020-01405-w"))
+    }
+
+    @Test
+    fun testUpdateDOI_nullNew_shouldNotChange() {
+        // Case: null candidate - must be ignored without touching the stored DOI
+        val dest = BiblioItem()
+        dest.setDOI("10.1007/s13280-020-01405-w")
+
+        dest.updateDOIIfLonger(null)
+
+        assertThat(dest.getDOI(), equalTo("10.1007/s13280-020-01405-w"))
+    }
+}