package org.jabref.logic.importer.relatedwork;

import java.text.Normalizer;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;

| 16 | +/** |
| 17 | + * Deterministic extractor for author–year style citations in "Related Work" sections. |
| 18 | + * Handles single and multi-citation parentheticals, including diacritics and all-caps acronyms (e.g., CIA, Šimić). |
| 19 | + */ |
| 20 | +public class HeuristicRelatedWorkExtractor implements RelatedWorkExtractor { |
| 21 | + |
| 22 | + // Headings like "1.4 Related work", "RELATED WORK", etc. (case-insensitive) |
| 23 | + private static final Pattern RELATED_WORK_HEADING = |
| 24 | + Pattern.compile("(?im)^(\\d+(?:\\.\\d+)*)?\\s*related\\s+work[s]?\\s*[:\\-]?$"); |
| 25 | + |
| 26 | + // Any parenthetical block; author-year pairs are mined inside it. |
| 27 | + private static final Pattern PAREN_BLOCK = Pattern.compile("\\(([^)]+)\\)"); |
| 28 | + |
| 29 | + // Unicode-aware author–year inside a parenthetical. |
| 30 | + // Allows all-caps acronyms like "CIA" and Unicode surnames like "Šimić". |
| 31 | + // \p{Lu} = uppercase letter, \p{L} = any letter, \p{M} = combining mark. |
| 32 | + private static final Pattern AUTHOR_YEAR_INNER = Pattern.compile( |
| 33 | + "(?U)" // enable Unicode character classes |
| 34 | + + "(\\p{Lu}[\\p{L}\\p{M}'\\-]*)" // 1: first author token (can be acronym or surname) |
| 35 | + + "(?:\\s+(?:et\\s+al\\.)|\\s*(?:&|and)\\s+\\p{Lu}[\\p{L}\\p{M}'\\-]+)?" |
| 36 | + + "\\s*,?\\s*" |
| 37 | + + "(\\d{4})([a-z]?)" // 2: year, 3: optional trailing letter |
| 38 | + ); |
| 39 | + |
| 40 | + /** |
| 41 | + * Extract a mapping from cited entry key to a short contextual snippet. |
| 42 | + * |
| 43 | + * <p>The returned map uses the cited entry's citation key (for example, {@code Smith2021}) |
| 44 | + * as the key, and a sentence-like snippet taken from around the in-text citation as the value.</p> |
| 45 | + * |
| 46 | + * @param fullText the full (plain) text of the paper or section to scan |
| 47 | + * @param bibliography candidate entries that may be cited; used to resolve author/year to a citation key |
| 48 | + * @return a {@code Map} from citation key to snippet; never {@code null}, possibly empty |
| 49 | + */ |
| 50 | + @Override |
| 51 | + public Map<String, String> extract(String fullText, List<BibEntry> bibliography) { |
| 52 | + String related = sliceRelatedWorkSection(fullText); |
| 53 | + Map<String, BibEntry> index = buildIndex(bibliography); |
| 54 | + Map<String, String> out = new LinkedHashMap<>(); |
| 55 | + |
| 56 | + Matcher paren = PAREN_BLOCK.matcher(related); |
| 57 | + while (paren.find()) { |
| 58 | + String inner = paren.group(1); |
| 59 | + Matcher cite = AUTHOR_YEAR_INNER.matcher(inner); |
| 60 | + |
| 61 | + while (cite.find()) { |
| 62 | + String citedToken = normalizeSurname(cite.group(1)); // e.g., "cia" or "nash" |
| 63 | + String yearDigits = cite.group(2); // ignore group(3) letter |
| 64 | + String citedKey = findKeyFor(citedToken, yearDigits, index); |
| 65 | + if (citedKey == null || out.containsKey(citedKey)) { |
| 66 | + continue; |
| 67 | + } |
| 68 | + |
| 69 | + String snippet = expandToSentenceLikeSpan(related, paren.start(), paren.end()); |
| 70 | + snippet = pruneTrailingCitationTail(snippet).trim(); |
| 71 | + |
| 72 | + if (!snippet.endsWith(".")) { |
| 73 | + snippet = snippet + "."; |
| 74 | + } |
| 75 | + if (snippet.length() > 300) { |
| 76 | + snippet = snippet.substring(0, 300) + "..."; |
| 77 | + } |
| 78 | + |
| 79 | + out.put(citedKey, snippet); |
| 80 | + } |
| 81 | + } |
| 82 | + |
| 83 | + return out; |
| 84 | + } |
| 85 | + |
| 86 | + /** |
| 87 | + * Try to isolate the "Related work" section; fallback to full text. |
| 88 | + */ |
| 89 | + private String sliceRelatedWorkSection(String text) { |
| 90 | + Matcher start = RELATED_WORK_HEADING.matcher(text); |
| 91 | + int begin = -1; |
| 92 | + while (start.find()) { |
| 93 | + begin = start.end(); |
| 94 | + break; |
| 95 | + } |
| 96 | + if (begin < 0) { |
| 97 | + return text; // fallback: whole text |
| 98 | + } |
| 99 | + |
| 100 | + // Next likely section heading AFTER begin (numbered or ALL-CAPS) |
| 101 | + Pattern nextSection = Pattern.compile( |
| 102 | + "(?m)^(?:\\d+(?:\\.\\d+)*)\\s+[A-Z][A-Z\\s\\-]{3,}$|^[A-Z][A-Z\\s\\-]{3,}$"); |
| 103 | + Matcher end = nextSection.matcher(text); |
| 104 | + int stop = text.length(); |
| 105 | + while (end.find()) { |
| 106 | + if (end.start() > begin) { |
| 107 | + stop = end.start(); |
| 108 | + break; |
| 109 | + } |
| 110 | + } |
| 111 | + return text.substring(begin, stop); |
| 112 | + } |
| 113 | + |
| 114 | + private Map<String, BibEntry> buildIndex(List<BibEntry> bibs) { |
| 115 | + Map<String, BibEntry> idx = new HashMap<>(); |
| 116 | + for (BibEntry b : bibs) { |
| 117 | + Optional<String> y = b.getField(StandardField.YEAR); |
| 118 | + if (y.isEmpty()) { |
| 119 | + Optional<String> date = b.getField(StandardField.DATE); |
| 120 | + if (date.isPresent()) { |
| 121 | + Matcher m = Pattern.compile("(\\d{4})").matcher(date.get()); |
| 122 | + if (m.find()) { |
| 123 | + y = Optional.of(m.group(1)); |
| 124 | + } |
| 125 | + } |
| 126 | + } |
| 127 | + Optional<String> a = b.getField(StandardField.AUTHOR); |
| 128 | + if (y.isEmpty() || a.isEmpty()) { |
| 129 | + continue; |
| 130 | + } |
| 131 | + String yearDigits = y.get().replaceAll("[^0-9]", ""); |
| 132 | + if (yearDigits.isEmpty()) { |
| 133 | + continue; |
| 134 | + } |
| 135 | + |
| 136 | + String firstAuthor = firstAuthorRaw(a.get()); |
| 137 | + String firstSurname = extractFirstSurnameFromRaw(firstAuthor); |
| 138 | + if (!firstSurname.isEmpty()) { |
| 139 | + idx.put(normalizeSurname(firstSurname) + yearDigits, b); |
| 140 | + } |
| 141 | + |
| 142 | + // Also index acronym for corporate/multi-word first author without comma. |
| 143 | + String acronym = maybeAcronym(firstAuthor); |
| 144 | + if (!acronym.isEmpty()) { |
| 145 | + idx.put(acronym + yearDigits, b); |
| 146 | + } |
| 147 | + } |
| 148 | + return idx; |
| 149 | + } |
| 150 | + |
| 151 | + /** |
| 152 | + * Get the raw first author string (before surname extraction). |
| 153 | + */ |
| 154 | + private String firstAuthorRaw(String authorField) { |
| 155 | + return authorField.split("\\s+and\\s+")[0].trim(); |
| 156 | + } |
| 157 | + |
| 158 | + /** |
| 159 | + * Extract the first author surname from a raw first-author token. |
| 160 | + */ |
| 161 | + private String extractFirstSurnameFromRaw(String firstAuthor) { |
| 162 | + if (firstAuthor.contains(",")) { |
| 163 | + return firstAuthor.substring(0, firstAuthor.indexOf(',')).trim(); |
| 164 | + } |
| 165 | + if (firstAuthor.startsWith("{") && firstAuthor.endsWith("}")) { |
| 166 | + String inner = firstAuthor.substring(1, firstAuthor.length() - 1).trim(); |
| 167 | + String[] parts = inner.split("\\s+"); |
| 168 | + return parts.length == 0 ? "" : parts[parts.length - 1]; |
| 169 | + } |
| 170 | + String[] parts = firstAuthor.split("\\s+"); |
| 171 | + return parts.length == 0 ? "" : parts[parts.length - 1]; |
| 172 | + } |
| 173 | + |
| 174 | + private String maybeAcronym(String firstAuthor) { |
| 175 | + if (firstAuthor.contains(",")) { |
| 176 | + return ""; // likely "Surname, Given" → skip acronym |
| 177 | + } |
| 178 | + String unbraced = firstAuthor; |
| 179 | + if (unbraced.startsWith("{") && unbraced.endsWith("}")) { |
| 180 | + unbraced = unbraced.substring(1, unbraced.length() - 1); |
| 181 | + } |
| 182 | + String[] parts = unbraced.trim().split("\\s+"); |
| 183 | + if (parts.length < 2) { |
| 184 | + return ""; // single token → not helpful |
| 185 | + } |
| 186 | + StringBuilder sb = new StringBuilder(); |
| 187 | + for (String p : parts) { |
| 188 | + if (p.isEmpty()) { |
| 189 | + continue; |
| 190 | + } |
| 191 | + char c = p.charAt(0); |
| 192 | + if (Character.isLetter(c)) { |
| 193 | + sb.append(Character.toLowerCase(c)); |
| 194 | + } |
| 195 | + } |
| 196 | + return sb.toString(); |
| 197 | + } |
| 198 | + |
| 199 | + /** |
| 200 | + * Normalize token: remove braces, strip diacritics, lowercase. |
| 201 | + */ |
| 202 | + private String normalizeSurname(String s) { |
| 203 | + String noBraces = s.replace("{", "").replace("}", ""); |
| 204 | + String normalized = Normalizer.normalize(noBraces, Normalizer.Form.NFD) |
| 205 | + .replaceAll("\\p{M}+", ""); |
| 206 | + return normalized.toLowerCase(Locale.ROOT); |
| 207 | + } |
| 208 | + |
| 209 | + /** |
| 210 | + * Lookup by normalized token (surname or acronym) + 4-digit year. |
| 211 | + */ |
| 212 | + private String findKeyFor(String lowerToken, String yearDigits, Map<String, BibEntry> index) { |
| 213 | + BibEntry entry = index.get(lowerToken + yearDigits); |
| 214 | + return (entry != null) ? entry.getCitationKey().orElse(null) : null; // null signals "not found" |
| 215 | + } |
| 216 | + |
| 217 | + /** |
| 218 | + * Expand to a sentence-like span around the parenthetical match. |
| 219 | + */ |
| 220 | + private String expandToSentenceLikeSpan(String text, int matchStart, int matchEnd) { |
| 221 | + int left = matchStart; |
| 222 | + while (left > 0) { |
| 223 | + char c = text.charAt(left - 1); |
| 224 | + if (c == '.' || c == '!' || c == '?' || c == '\n') { |
| 225 | + break; |
| 226 | + } |
| 227 | + left--; |
| 228 | + } |
| 229 | + int right = matchEnd; |
| 230 | + int len = text.length(); |
| 231 | + while (right < len) { |
| 232 | + char c = text.charAt(right); |
| 233 | + if (c == '.' || c == '!' || c == '?' || c == '\n') { |
| 234 | + right++; // include the boundary char |
| 235 | + break; |
| 236 | + } |
| 237 | + right++; |
| 238 | + } |
| 239 | + if (right > len) { |
| 240 | + right = len; |
| 241 | + } |
| 242 | + return text.substring(left, right); |
| 243 | + } |
| 244 | + |
| 245 | + /** |
| 246 | + * Heuristically remove trailing citation trains at the end of a snippet |
| 247 | + */ |
| 248 | + private String pruneTrailingCitationTail(String s) { |
| 249 | + int lastParen = s.lastIndexOf(')'); |
| 250 | + if (lastParen > -1 && lastParen >= s.length() - 3) { |
| 251 | + String head = s.substring(0, lastParen + 1).trim(); |
| 252 | + if (head.endsWith(").")) { |
| 253 | + return head; |
| 254 | + } |
| 255 | + if (head.endsWith(")")) { |
| 256 | + return head + "."; |
| 257 | + } |
| 258 | + return head; |
| 259 | + } |
| 260 | + return s; |
| 261 | + } |
| 262 | +} |
0 commit comments