diff --git a/doc/Grobid-service.md b/doc/Grobid-service.md index 369a6850a0..139507bebd 100644 --- a/doc/Grobid-service.md +++ b/doc/Grobid-service.md @@ -176,6 +176,7 @@ Extract the header of the input PDF document, normalize it and convert it into a | | | | `includeRawCopyrights` | optional | `includeRawCopyrights` is a boolean value, `0` (default, do not include raw copyrights/license string in the result) or `1` (include raw copyrights/license string in the result). | | | | | `start` | optional | Start page number of the PDF to be considered, previous pages will be skipped/ignored, integer with first page starting at `1`, (default `-1`, start from the first page of the PDF) | | | | | `end` | optional | End page number of the PDF to be considered, next pages will be skipped/ignored, integer with first page starting at `1` (default `2`, end with the last page of the PDF) | +| | | | `typedAreas` | optional | JSON array specifying areas with coordinates and types for specialized processing (see [Typed Areas](#typed-areas) below) | Use `Accept: application/x-bibtex` to retrieve BibTeX format instead of XML TEI. @@ -229,6 +230,7 @@ Convert the complete input document into TEI XML format (header, body and biblio | | | | `start` | optional | Start page number of the PDF to be considered, previous pages will be skipped/ignored, integer with first page starting at `1`, (default `-1`, start from the first page of the PDF) | | | | | `end` | optional | End page number of the PDF to be considered, next pages will be skipped/ignored, integer with first page starting at `1` (default `-1`, end with the last page of the PDF) | | | | | `flavor` | optional | Indicate which flavor to apply for structuring the document. Useful when the default structuring cannot be applied to a specific document (e.g. the body is empty. More technical details and available flavor names in the [dedicated page](Grobid-specialized-processes.md). 
| +| | | | `typedAreas` | optional | JSON array specifying areas with coordinates and types for specialized processing (see [Typed Areas](#typed-areas) below) | Response status codes: @@ -291,6 +293,7 @@ Extract and convert all the bibliographical references present in the input docu | POST, PUT | `multipart/form-data` | `application/xml` | `input` | required | PDF file to be processed | | | | | `consolidateCitations` | optional | `consolidateCitations` is a string of value `0` (no consolidation, default value) or `1` (consolidate and inject all extra metadata), or `2` (consolidate the citation and inject DOI only). | | | | | `includeRawCitations` | optional | `includeRawCitations` is a boolean value, `0` (default. do not include raw reference string in the result) or `1` (include raw reference string in the result). | +| | | | `typedAreas` | optional | JSON array specifying areas with coordinates and types for specialized processing (see [Typed Areas](#typed-areas) below) | Use `Accept: application/x-bibtex` to retrieve BibTeX instead of TEI. @@ -318,6 +321,135 @@ It is possible to include the original raw reference string in the parsed result curl -v --form input=@./thefile.pdf --form includeRawCitations=1 localhost:8070/api/processReferences ``` +## Typed Areas + +The typed areas feature allows you to specify regions in PDF documents for specialized processing. Instead of relying solely on automatic detection, you can pre-identify areas containing figures, tables, or content to be ignored. This provides better accuracy and control over the document processing pipeline. 
+ +### Supported Area Types + +- **`figure`**: Areas containing figures/diagrams that will be processed with the specialized figure model +- **`table`**: Areas containing tables that will be processed with the specialized table model +- **`ignore`**: Areas that should be completely excluded from all processing + +### JSON Format + +The `typedAreas` parameter expects a JSON array with the following structure: + +```json +[ + { + "page": 1, + "x": 100.0, + "y": 200.0, + "width": 300.0, + "height": 150.0, + "type": "figure" + }, + { + "page": 1, + "x": 450.0, + "y": 200.0, + "width": 250.0, + "height": 200.0, + "type": "table" + }, + { + "page": 1, + "x": 50.0, + "y": 500.0, + "width": 500.0, + "height": 100.0, + "type": "ignore" + } +] +``` + +### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `page` | integer | Yes | Page number (1-based, following PDF convention) | +| `x` | number | Yes | X-coordinate of the upper-left corner of the area | +| `y` | number | Yes | Y-coordinate of the upper-left corner of the area | +| `width` | number | Yes | Width of the area | +| `height` | number | Yes | Height of the area | +| `type` | string | Yes | Area type: `"figure"`, `"table"`, or `"ignore"` | + +### Coordinate System + +The coordinate system follows the PDF convention: +- **Origin**: Upper-left corner of the page +- **Units**: Points (1/72 inch) +- **Page numbering**: 1-based (first page is page 1) + +### Processing Behavior + +**Figure areas**: +- Tokens within figure areas are extracted from the main text processing +- Applied to the specialized FigureParser model +- Results are integrated into the TEI output as structured figure elements +- Bypasses the segmentation model for improved accuracy + +**Table areas**: +- Tokens within table areas are extracted from the main text processing +- Applied to the specialized TableParser model +- Results are integrated into the TEI output as structured table elements 
+- Bypasses the segmentation model for improved accuracy + +**Ignore areas**: +- Tokens within ignore areas are completely discarded +- No further processing is performed on these regions +- Useful for excluding headers, footers, watermarks, or other unwanted content + +### Usage Examples + +**cURL example with typed areas:** +```bash +curl -v -H "Accept: application/xml" \ + --form input=@./document.pdf \ + --form typedAreas='[ + {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"}, + {"page": 1, "x": 450, "y": 200, "width": 250, "height": 200, "type": "table"} + ]' \ + localhost:8070/api/processFulltextDocument +``` + +**Python example:** +```python +import requests +import json + +typed_areas = [ + {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"}, + {"page": 1, "x": 450, "y": 200, "width": 250, "height": 200, "type": "table"} +] + +with open('document.pdf', 'rb') as f: + files = {'input': f} + data = {'typedAreas': json.dumps(typed_areas)} + response = requests.post( + 'http://localhost:8070/api/processFulltextDocument', + files=files, + data=data, + headers={'Accept': 'application/xml'} + ) +``` + +### Benefits + +1. **Improved Accuracy**: Pre-identified figures and tables bypass the segmentation model, reducing detection errors +2. **Better Quality**: Specialized models applied to known area types produce higher quality results +3. **Performance**: More efficient processing by avoiding unnecessary model applications +4. **Control**: Precise control over which regions are processed and how +5. 
**Integration**: Seamlessly integrated into existing TEI output structure + +### Error Handling + +- Invalid JSON format will result in HTTP 400 error +- Invalid area types will be logged as warnings and skipped +- Coordinates outside page boundaries will be clamped to valid ranges +- Missing required fields will cause the area to be skipped with a warning + ### Raw text to TEI conversion services #### /api/processDate diff --git a/doc/Typed-Areas-API.md b/doc/Typed-Areas-API.md new file mode 100644 index 0000000000..9ff1ec18c5 --- /dev/null +++ b/doc/Typed-Areas-API.md @@ -0,0 +1,303 @@ +# Typed Areas API Documentation + +The Typed Areas API provides enhanced control over PDF document processing by allowing you to specify regions for specialized handling. This feature improves accuracy for figure and table extraction while providing control over content exclusion. + +## Overview + +Instead of relying solely on automatic detection, you can pre-identify areas containing: +- **Figures** - processed with specialized figure model +- **Tables** - processed with specialized table model +- **Ignored content** - completely excluded from processing + +## Supported Endpoints + +The `typedAreas` parameter is supported by the following endpoints: + +- `POST /api/processHeaderDocument` - Header extraction with typed areas +- `POST /api/processFulltextDocument` - Full document processing with typed areas +- `POST /api/processReferences` - Reference extraction with typed areas + +## Request Format + +### Parameter Specification + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `typedAreas` | string (JSON) | No | JSON array of area specifications | + +### JSON Structure + +```json +[ + { + "page": 1, + "x": 100.0, + "y": 200.0, + "width": 300.0, + "height": 150.0, + "type": "figure" + }, + { + "page": 1, + "x": 450.0, + "y": 200.0, + "width": 250.0, + "height": 200.0, + "type": "table" + }, + { + "page": 1, + "x": 50.0, + "y": 
500.0,
+    "width": 500.0,
+    "height": 100.0,
+    "type": "ignore"
+  }
+]
+```
+
+### Field Descriptions
+
+- **`page`** (integer, required): Page number (1-based, following PDF convention)
+- **`x`** (number, required): X-coordinate of upper-left corner in points
+- **`y`** (number, required): Y-coordinate of upper-left corner in points
+- **`width`** (number, required): Width of the area in points
+- **`height`** (number, required): Height of the area in points
+- **`type`** (string, required): Area type - `"figure"`, `"table"`, or `"ignore"`
+
+## Area Types and Processing
+
+### Figure Areas (`"type": "figure"`)
+
+**Processing**:
+- Tokens within figure areas are extracted from main text processing
+- Applied to specialized FigureParser model
+- Results integrated into TEI output as structured `<figure>` elements
+- Bypasses segmentation model for improved accuracy
+
+**Use Cases**:
+- Pre-identified figures from external OCR or layout analysis
+- Complex diagrams where automatic detection fails
+- Figures with known boundaries for consistent processing
+
+### Table Areas (`"type": "table"`)
+
+**Processing**:
+- Tokens within table areas are extracted from main text processing
+- Applied to specialized TableParser model
+- Results integrated into TEI output as structured `<table>` elements
+- Bypasses segmentation model for improved accuracy
+
+**Use Cases**:
+- Tables with complex layouts or formatting
+- Pre-identified table regions from document analysis
+- Tables requiring consistent extraction across documents
+
+### Ignore Areas (`"type": "ignore"`)
+
+**Processing**:
+- Tokens within ignore areas are completely discarded
+- No further processing performed on these regions
+- Content excluded from all model processing
+
+**Use Cases**:
+- Headers, footers, and page numbers
+- Watermarks or background elements
+- Marginalia or annotations
+- Advertisements or irrelevant content
+
+## Coordinate System
+
+The coordinate system follows PDF conventions:
+
+```
+(0,0) +----------------------→ X (points)
+  |
+  |
+  ↓ Y (points)
+```
+
+- **Origin**: Upper-left corner of the page
+- **Units**: Points (1/72 inch ≈ 0.353 mm)
+- **Page numbering**: 1-based (first page is page 1)
+
+## Usage Examples
+
+### cURL Examples
+
+**Basic header processing with typed areas:**
+```bash
+curl -v -H "Accept: application/xml" \
+  --form input=@./document.pdf \
+  --form typedAreas='[
+    {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"},
+    {"page": 1, "x": 450, "y": 200, "width": 250, "height": 200, "type": "table"}
+  ]' \
+  localhost:8070/api/processHeaderDocument
+```
+
+**Full document processing with areas:**
+```bash
+curl -v -H "Accept: application/xml" \
+  --form input=@./document.pdf \
+  --form consolidateHeader=1 \
+  --form typedAreas='[
+    {"page": 1,
"x": 50, "y": 750, "width": 500, "height": 50, "type": "ignore"}, + {"page": 2, "x": 100, "y": 100, "width": 400, "height": 300, "type": "figure"} + ]' \ + localhost:8070/api/processFulltextDocument +``` + +### Using a JSON File + +For complex area definitions, store them in a JSON file and pass it to curl: + +**Create `typed_areas.json`:** +```json +[ + {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"}, + {"page": 1, "x": 450, "y": 200, "width": 250, "height": 200, "type": "table"}, + {"page": 1, "x": 50, "y": 750, "width": 500, "height": 30, "type": "ignore"} +] +``` + +**Pass the file content as the form field value:** +```bash +curl -v -H "Accept: application/xml" \ + --form input=@./document.pdf \ + --form "typedAreas=$(cat typed_areas.json)" \ + localhost:8070/api/processFulltextDocument +``` + +## Error Handling + +### Common Error Scenarios + +**Invalid JSON format:** +```json +// Invalid - missing quotes around type +{"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, type: figure} +``` +**Error**: HTTP 400 - "Invalid JSON format" + +**Invalid area type:** +```json +// Invalid - unsupported area type +{"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "diagram"} +``` +**Behavior**: Area logged as warning and skipped + +**Missing required fields:** +```json +// Invalid - missing type field +{"page": 1, "x": 100, "y": 200, "width": 300, "height": 150} +``` +**Behavior**: Area logged as warning and skipped + +**Invalid coordinates:** +```json +// Valid but outside bounds - will be clamped +{"page": 1, "x": -100, "y": 200, "width": 300, "height": 150, "type": "figure"} +``` +**Behavior**: Coordinates clamped to valid page boundaries + +### Response Status Codes + +| Status Code | Description | +|-------------|-------------| +| 200 | Successful processing with typed areas | +| 204 | Processing completed but no content extracted | +| 400 | Invalid request (malformed JSON, missing parameters) | +| 500 | 
Internal server error during processing | +| 503 | Service unavailable (all threads in use) | + +## Integration with Existing Workflow + +### Combining with Other Parameters + +Typed areas work seamlessly with all existing GROBID parameters: + +```bash +curl -v -H "Accept: application/xml" \ + --form input=@./document.pdf \ + --form typedAreas='[{"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"}]' \ + --form consolidateHeader=1 \ + --form consolidateCitations=1 \ + --form segmentSentences=1 \ + --form generateIDs=1 \ + --form teiCoordinates=figure,table \ + localhost:8070/api/processFulltextDocument +``` + +### TEI Output Structure + +Processed typed areas are integrated into the standard TEI output: + +```xml +
+<figure xml:id="fig_0">
+    <head>Figure 1: Sample Figure</head>
+    <figDesc>Description extracted from specialized processing</figDesc>
+</figure>
+
+<figure type="table" xml:id="tab_0">
+    <head>Table 1: Sample Data</head>
+    <table>
+        <row>
+            <cell>Header 1</cell>
+            <cell>Header 2</cell>
+        </row>
+    </table>
+</figure>
+``` + +## Performance Considerations + +### Optimization Tips + +1. **Area Size**: Define areas as tightly as possible around content +2. **Overlapping Areas**: Avoid overlapping typed areas - results may be unpredictable +3. **Large Documents**: Consider processing pages individually for very large documents +4. **Batch Processing**: Reuse area definitions across similar documents when possible + +### Performance Impact + +- **Improved**: Bypassing segmentation for pre-identified areas +- **Overhead**: JSON parsing and area coordinate calculations +- **Memory**: Additional token lists for different area types +- **Overall**: Typically faster processing for documents with many pre-identified figures/tables + +## Migration from Legacy ignoreAreas + +The legacy `ignoreAreas` parameter has been **removed**. The `typedAreas` parameter is now the only supported way to define areas for processing. + +**Required format:** +```json +[ + {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"}, + {"page": 1, "x": 400, "y": 200, "width": 200, "height": 100, "type": "ignore"} +] +``` + +If you were previously using `ignoreAreas`, replace it with `typedAreas` and set `"type": "ignore"` for each area. + +## Troubleshooting + +### Common Issues + +1. **Areas not being processed**: Check JSON format and field names +2. **Incorrect coordinates**: Verify coordinate system and page numbering +3. **Partial extraction**: Ensure areas fully encompass target content +4. **Performance issues**: Reduce number of areas or make them more precise + +### Debugging Tips + +1. **Start simple**: Test with a single, well-defined area +2. **Verify coordinates**: Use PDF viewer to confirm area boundaries +3. **Check logs**: Server logs provide detailed error messages for invalid areas +4. 
**Validate JSON**: Use JSON validator to ensure correct syntax + +### Getting Help + +- **Documentation**: See [GROBID Service API](Grobid-service.md) for general API usage +- **Issues**: Report bugs or request features via GitHub issues +- **Community**: Join discussions for usage tips and best practices \ No newline at end of file diff --git a/doc/index.md b/doc/index.md index 38115c2e2f..1cae85f318 100644 --- a/doc/index.md +++ b/doc/index.md @@ -17,6 +17,8 @@ * [Use GROBID as a service](Grobid-service.md) +* [Typed Areas API Documentation](Typed-Areas-API.md) + * [Build a GROBID development environment](Install-Grobid.md) * [Manage GROBID with containers (Docker)](Grobid-docker.md) diff --git a/grobid-core/src/main/java/org/grobid/core/document/Document.java b/grobid-core/src/main/java/org/grobid/core/document/Document.java index 389469c6f6..0053982525 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/Document.java +++ b/grobid-core/src/main/java/org/grobid/core/document/Document.java @@ -27,6 +27,8 @@ import org.grobid.core.layout.Cluster; import org.grobid.core.layout.GraphicObject; import org.grobid.core.layout.GraphicObjectType; +import org.grobid.core.layout.IgnoreArea; +import org.grobid.core.layout.AreaType; import org.grobid.core.layout.LayoutToken; import org.grobid.core.layout.PDFAnnotation; import org.grobid.core.layout.Page; @@ -63,10 +65,14 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.IdentityHashMap; +import java.util.LinkedHashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.SortedSet; +import java.util.TreeSet; import java.util.regex.Matcher; import java.util.stream.Collectors; @@ -156,6 +162,20 @@ public void setImages(List images) { protected transient List equations; protected transient List annexEquations; + // typed areas for specialized processing + protected transient List figureAreas = new 
ArrayList<>(); + protected transient List tableAreas = new ArrayList<>(); + protected transient List ignoredAreas = new ArrayList<>(); + + // tokens extracted from typed areas for specialized processing + protected transient List figureTokens = new ArrayList<>(); + protected transient List tableTokens = new ArrayList<>(); + protected transient Map> tableTokensByArea = new LinkedHashMap<>(); + protected transient List ignoredTokens = new ArrayList<>(); + + // tokens that fall within typed areas and should be excluded from body processing + protected transient Set excludedTokens = Collections.newSetFromMap(new IdentityHashMap<>()); + // the analyzer/tokenizer used for processing this document protected transient Analyzer analyzer = GrobidAnalyzer.getInstance(); @@ -1703,4 +1723,197 @@ public List
getAnnexFigures() { public void setAnnexFigures(List
annexFigures) { this.annexFigures = annexFigures; } + + // Typed area getters and setters + public List getFigureAreas() { + return figureAreas; + } + + public void setFigureAreas(List figureAreas) { + this.figureAreas = figureAreas != null ? figureAreas : new ArrayList<>(); + } + + public List getTableAreas() { + return tableAreas; + } + + public void setTableAreas(List tableAreas) { + this.tableAreas = tableAreas != null ? tableAreas : new ArrayList<>(); + } + + public List getIgnoredAreas() { + return ignoredAreas; + } + + public void setIgnoredAreas(List ignoredAreas) { + this.ignoredAreas = ignoredAreas != null ? ignoredAreas : new ArrayList<>(); + } + + // Token getters and setters for typed areas + public List getFigureTokens() { + return figureTokens; + } + + public void setFigureTokens(List figureTokens) { + this.figureTokens = figureTokens != null ? figureTokens : new ArrayList<>(); + } + + public List getTableTokens() { + return tableTokens; + } + + public void setTableTokens(List tableTokens) { + this.tableTokens = tableTokens != null ? tableTokens : new ArrayList<>(); + } + + public Map> getTableTokensByArea() { + return tableTokensByArea; + } + + public List getIgnoredTokens() { + return ignoredTokens; + } + + public void setIgnoredTokens(List ignoredTokens) { + this.ignoredTokens = ignoredTokens != null ? ignoredTokens : new ArrayList<>(); + } + + private boolean isTokenExcluded(LayoutToken token) { + return excludedTokens.contains(token); + } + + /** + * Filters document pieces by splitting them around excluded token runs. + * Returns new pieces that skip over any tokens in the excludedTokens set. 
+ */ + public SortedSet filterDocumentPiecesByExcludedTokens(SortedSet pieces) { + if (excludedTokens.isEmpty() || pieces == null || pieces.isEmpty()) { + return pieces; + } + SortedSet filtered = new TreeSet<>(); + for (DocumentPiece piece : pieces) { + int startPos = piece.getLeft().getTokenDocPos(); + int endPos = piece.getRight().getTokenDocPos(); + int runStart = -1; + for (int i = startPos; i <= endPos; i++) { + LayoutToken token = tokenizations.get(i); + if (!excludedTokens.contains(token)) { + if (runStart == -1) runStart = i; + } else { + if (runStart != -1) { + filtered.add(createPiece(runStart, i - 1)); + runStart = -1; + } + } + } + if (runStart != -1) { + filtered.add(createPiece(runStart, endPos)); + } + } + return filtered; + } + + private DocumentPiece createPiece(int startTokenDocPos, int endTokenDocPos) { + int startBlock = tokenizations.get(startTokenDocPos).getBlockPtr(); + int endBlock = tokenizations.get(endTokenDocPos).getBlockPtr(); + return new DocumentPiece( + new DocumentPointer(this, startBlock, startTokenDocPos), + new DocumentPointer(this, endBlock, endTokenDocPos)); + } + + /** + * Filters out layout tokens that fall within the specified typed areas and categorizes them by type. + * Tokens in figure/table areas are collected for ML-based processing; tokens in ignore areas are discarded. 
+ * + * @param typedAreas list of typed areas for specialized processing + */ + public void filterLayoutTokensByTypedAreas(List typedAreas) { + if (typedAreas == null || typedAreas.isEmpty() || tokenizations == null || tokenizations.isEmpty()) { + return; + } + + LOGGER.debug("Processing {} typed areas", typedAreas.size()); + + // Clear previous token lists + figureTokens.clear(); + tableTokens.clear(); + tableTokensByArea.clear(); + ignoredTokens.clear(); + figureAreas.clear(); + tableAreas.clear(); + ignoredAreas.clear(); + + // Categorize areas by type + for (IgnoreArea area : typedAreas) { + if (area.getType() == null) { + continue; + } + + switch (area.getType()) { + case FIGURE: + figureAreas.add(area); + break; + case TABLE: + tableAreas.add(area); + break; + case IGNORE: + ignoredAreas.add(area); + break; + } + } + + excludedTokens.clear(); + int figureTokenCount = 0; + int tableTokenCount = 0; + int ignoredTokenCount = 0; + + for (LayoutToken token : tokenizations) { + // Check if token intersects with any typed area + for (IgnoreArea area : typedAreas) { + if (area.contains(token)) { + switch (area.getType()) { + case FIGURE: + figureTokens.add(token); + figureTokenCount++; + break; + case TABLE: + tableTokens.add(token); + tableTokensByArea.computeIfAbsent(area, k -> new ArrayList<>()).add(token); + tableTokenCount++; + break; + case IGNORE: + ignoredTokens.add(token); + ignoredTokenCount++; + break; + } + excludedTokens.add(token); + break; + } + } + } + + recalculateBlockPointers(); + + LOGGER.debug("Processed typed areas: {} figure tokens, {} table tokens, {} ignored tokens, {} excluded total", + figureTokenCount, tableTokenCount, ignoredTokenCount, excludedTokens.size()); + } + + /** + * Recalculate blockPtr for all tokens based on the current blocks list. + * Ensures that each token's blockPtr correctly points to the block + * whose startToken <= tokenDocPos < nextBlock.startToken. 
+ */ + private void recalculateBlockPointers() { + if (blocks == null || blocks.isEmpty() || tokenizations == null || tokenizations.isEmpty()) { + return; + } + int blockIdx = 0; + for (int i = 0; i < tokenizations.size(); i++) { + while (blockIdx < blocks.size() - 1 + && blocks.get(blockIdx + 1).getStartToken() <= i) { + blockIdx++; + } + tokenizations.get(i).setBlockPtr(blockIdx); + } + } } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 698c7b8c1c..64df73f9f7 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -155,7 +155,17 @@ public Document processing(DocumentSource documentSource, try { // general segmentation Document doc = parsers.getSegmentationParser(flavor).processing(documentSource, config); + + // Apply typed areas filtering if configured + if (config.getTypedAreas() != null && !config.getTypedAreas().isEmpty()) { + doc.filterLayoutTokensByTypedAreas(config.getTypedAreas()); + // Apply specialized processing for figures and tables + processTypedAreas(doc); + } + SortedSet documentBodyParts = doc.getDocumentPart(SegmentationLabels.BODY); + // Filter body pieces to exclude typed area regions + documentBodyParts = doc.filterDocumentPiecesByExcludedTokens(documentBodyParts); // header processing BiblioItem headerResults = new BiblioItem(); @@ -277,7 +287,11 @@ else if (config.getConsolidateCitations() == 2) bodyFigures = processFigures(bodyResults, bodyTokenization.getTokenization()); doc.setFigures(bodyFigures); - bodyResults = fixFiguresLabellingResults(doc, bodyResults); + // Skip graphic object reassignment when user provided figure areas, + // since those areas already account for their graphic objects + if (doc.getFigureAreas().isEmpty()) { + bodyResults = fixFiguresLabellingResults(doc, bodyResults); + } // Figures @@ -333,6 
+347,10 @@ else if (config.getConsolidateCitations() == 2) LOGGER.debug("Fulltext model: The featured body is empty"); } + // Save typed area tables before annex processing (which overwrites doc.annexTables) + List typedAreaTables = doc.getAnnexTables() != null + ? new ArrayList<>(doc.getAnnexTables()) : new ArrayList<>(); + // possible annexes (view as a piece of full text similar to the body) documentBodyParts = doc.getDocumentPart(SegmentationLabels.ANNEX); featSeg = getBodyTextFeatured(doc, documentBodyParts); @@ -407,6 +425,15 @@ else if (config.getConsolidateCitations() == 2) doc.setAnnexEquations(annexEquations); } + // Merge typed area tables back (they were overwritten by annex processing) + if (!typedAreaTables.isEmpty()) { + if (annexTables == null) { + annexTables = new ArrayList<>(); + } + annexTables.addAll(typedAreaTables); + doc.setAnnexTables(annexTables); + } + // post-process reference and footnote callout to keep them consistent (e.g. for example avoid that a footnote // callout in superscript is by error labeled as a numerical reference callout) List markerTypes = null; @@ -724,6 +751,13 @@ public Document processingHeaderFunding(DocumentSource documentSource, // general segmentation Document doc = parsers.getSegmentationParser().processing(documentSource, config); + // Apply typed areas filtering if configured + if (config.getTypedAreas() != null && !config.getTypedAreas().isEmpty()) { + doc.filterLayoutTokensByTypedAreas(config.getTypedAreas()); + // Apply specialized processing for figures and tables + processTypedAreas(doc); + } + // header processing BiblioItem resHeader = new BiblioItem(); Pair featSeg = null; @@ -1063,6 +1097,7 @@ else if (nbAuthorType > (bibDataSets.size() / 2)) } LayoutToken token = tokens.get(n); + layoutTokens.add(token); features = new FeaturesVectorFulltext(); @@ -3543,6 +3578,490 @@ public static List getDocumentFullTextTokens(List> generateFeaturesForTokens(List tokens, Document doc) { + if 
(CollectionUtils.isEmpty(tokens)) { + return null; + } + + FeatureFactory featureFactory = FeatureFactory.getInstance(); + StringBuilder fulltext = new StringBuilder(); + String currentFont = null; + int currentFontSize = -1; + + List blocks = doc.getBlocks(); + if (CollectionUtils.isEmpty(blocks)) { + return null; + } + + FeaturesVectorFulltext features; + FeaturesVectorFulltext previousFeatures = null; + + List filteredTokens = new ArrayList<>(); + + int mm = 0; // page position + int nn = 0; // document position + double lineStartX = Double.NaN; + boolean indented = false; + boolean previousNewline = false; + boolean newline; + + // Compute total text length for relative position + int fulltextLength = 0; + for (LayoutToken t : tokens) { + String text = t.getText(); + if (text != null) { + String cleaned = text.replace(" ", ""); + if (!cleaned.isEmpty() && !cleaned.equals("\n")) { + fulltextLength += cleaned.length(); + } + } + } + + // Track block boundaries and graphics per block + int previousBlockPtr = -1; + boolean graphicVector = false; + boolean graphicBitmap = false; + double density = 0.0; + double spacingPreviousBlock = 0.0; + double lowestPos = 0.0; + int currentPage = -1; + + for (int i = 0; i < tokens.size(); i++) { + LayoutToken token = tokens.get(i); + + // Detect block boundary changes + int blockPtr = token.getBlockPtr(); + boolean isNewBlock = (blockPtr != previousBlockPtr); + + if (isNewBlock && blockPtr >= 0 && blockPtr < blocks.size()) { + Block block = blocks.get(blockPtr); + graphicVector = false; + graphicBitmap = false; + + double pageHeight = block.getPage().getHeight(); + int localPage = block.getPage().getNumber(); + if (localPage != currentPage) { + currentPage = localPage; + mm = 0; + lowestPos = 0.0; + spacingPreviousBlock = 0.0; + } + + if (lowestPos > block.getY()) { + spacingPreviousBlock = doc.getMaxBlockSpacing() / 5.0; + } else { + spacingPreviousBlock = block.getY() - lowestPos; + } + + String localText = block.getText(); + 
if (localText != null && !localText.contains("@PAGE") && !localText.contains("@IMAGE")) { + if (block.getHeight() != 0.0 && block.getWidth() != 0.0) { + density = (double) localText.length() / (block.getHeight() * block.getWidth()); + } + } + + List localImages = Document.getConnectedGraphics(block, doc); + if (localImages != null) { + for (GraphicObject localImage : localImages) { + if (localImage.getType() == GraphicObjectType.BITMAP) + graphicBitmap = true; + if (localImage.getType() == GraphicObjectType.VECTOR || localImage.getType() == GraphicObjectType.VECTOR_BOX) + graphicVector = true; + } + } + + previousBlockPtr = blockPtr; + } + + features = new FeaturesVectorFulltext(); + features.token = token; + + double coordinateLineY = token.getY(); + + String text = token.getText(); + if (text == null || text.isEmpty()) { + continue; + } + text = text.replace(" ", ""); + if (text.isEmpty()) { + mm++; + nn++; + continue; + } + if (text.equals("\n")) { + previousNewline = true; + mm++; + nn++; + continue; + } + newline = false; + + // final sanitisation and filtering + text = text.replaceAll("[ \n]", ""); + if (TextUtilities.filterLine(text)) { + continue; + } + + if (previousNewline) { + newline = true; + previousNewline = false; + if (previousFeatures != null) { + double previousLineStartX = lineStartX; + lineStartX = token.getX(); + double characterWidth = token.width / text.length(); + if (!Double.isNaN(previousLineStartX)) { + if (previousLineStartX - lineStartX > characterWidth) + indented = false; + else if (lineStartX - previousLineStartX > characterWidth) + indented = true; + } + } + } + + filteredTokens.add(token); + features.string = text; + + if (graphicBitmap) { + features.bitmapAround = true; + } + if (graphicVector) { + features.vectorAround = true; + } + + if (newline) { + features.lineStatus = "LINESTART"; + lineStartX = token.getX(); + if (previousFeatures != null) { + if (!"LINESTART".equals(previousFeatures.lineStatus)) + 
previousFeatures.lineStatus = "LINEEND"; + } + } + + Matcher m0 = featureFactory.isPunct.matcher(text); + if (m0.find()) { + features.punctType = "PUNCT"; + } + if (text.equals("(") || text.equals("[")) { + features.punctType = "OPENBRACKET"; + } else if (text.equals(")") || text.equals("]")) { + features.punctType = "ENDBRACKET"; + } else if (text.equals(".")) { + features.punctType = "DOT"; + } else if (text.equals(",")) { + features.punctType = "COMMA"; + } else if (text.equals("-")) { + features.punctType = "HYPHEN"; + } else if (text.equals("\"") || text.equals("\'") || text.equals("`")) { + features.punctType = "QUOTE"; + } + + if (indented) { + features.alignmentStatus = "LINEINDENT"; + } else { + features.alignmentStatus = "ALIGNEDLEFT"; + } + + if (isNewBlock) { + features.lineStatus = "LINESTART"; + if (previousFeatures != null) { + if (!"LINESTART".equals(previousFeatures.lineStatus)) + previousFeatures.lineStatus = "LINEEND"; + } + lineStartX = token.getX(); + features.blockStatus = "BLOCKSTART"; + } else { + // Look ahead for end of line + boolean endline = false; + boolean endblock = false; + int ii = 1; + boolean endloop = false; + while ((i + ii < tokens.size()) && (!endloop)) { + LayoutToken tok = tokens.get(i + ii); + if (tok != null) { + String toto = tok.getText(); + if (toto != null) { + if (toto.equals("\n")) { + endline = true; + endloop = true; + } else { + if (toto.length() != 0 + && !toto.startsWith("@IMAGE") + && !toto.startsWith("@PAGE") + && !text.contains(".pbm") + && !text.contains(".svg") + && !text.contains(".png") + && !text.contains(".jpg")) { + endloop = true; + } + } + } + } + // Check if we're switching blocks + if (tok.getBlockPtr() != token.getBlockPtr()) { + endblock = true; + endline = true; + endloop = true; + } + if (i + ii == tokens.size() - 1) { + endblock = true; + endline = true; + } + ii++; + } + + if (!endline && !newline) { + features.lineStatus = "LINEIN"; + } else if (!newline) { + features.lineStatus = 
"LINEEND"; + previousNewline = true; + } + + if (!endblock && features.blockStatus == null) + features.blockStatus = "BLOCKIN"; + else if (features.blockStatus == null) { + features.blockStatus = "BLOCKEND"; + } + } + + if (text.length() == 1) { + features.singleChar = true; + } + + if (Character.isUpperCase(text.charAt(0))) { + features.capitalisation = "INITCAP"; + } + + if (featureFactory.test_all_capital(text)) { + features.capitalisation = "ALLCAP"; + } + + if (featureFactory.test_digit(text)) { + features.digit = "CONTAINSDIGITS"; + } + + Matcher m = featureFactory.isDigit.matcher(text); + if (m.find()) { + features.digit = "ALLDIGIT"; + } + + if (currentFont == null) { + currentFont = token.getFont(); + features.fontStatus = "NEWFONT"; + } else if (!currentFont.equals(token.getFont())) { + currentFont = token.getFont(); + features.fontStatus = "NEWFONT"; + } else { + features.fontStatus = "SAMEFONT"; + } + + int newFontSize = (int) token.getFontSize(); + if (currentFontSize == -1) { + currentFontSize = newFontSize; + features.fontSize = "HIGHERFONT"; + } else if (currentFontSize == newFontSize) { + features.fontSize = "SAMEFONTSIZE"; + } else if (currentFontSize < newFontSize) { + features.fontSize = "HIGHERFONT"; + currentFontSize = newFontSize; + } else { + features.fontSize = "LOWERFONT"; + currentFontSize = newFontSize; + } + + if (token.isBold()) + features.bold = true; + + if (token.isItalic()) + features.italic = true; + + if (features.capitalisation == null) + features.capitalisation = "NOCAPS"; + + if (features.digit == null) + features.digit = "NODIGIT"; + + if (features.punctType == null) + features.punctType = "NOPUNCT"; + + features.relativeDocumentPosition = featureFactory + .linearScaling(nn, fulltextLength, NBBINS_POSITION); + + features.relativePagePositionChar = featureFactory + .linearScaling(mm, 0, NBBINS_POSITION); + + double pageHeight = 1.0; + if (token.getPage() >= 0 && doc.getPages() != null && token.getPage() < 
doc.getPages().size()) { + Page page = doc.getPages().get(token.getPage()); + if (page != null) { + pageHeight = page.getHeight(); + } + } + int pagePos = featureFactory.linearScaling(coordinateLineY, pageHeight, NBBINS_POSITION); + if (pagePos > NBBINS_POSITION) + pagePos = NBBINS_POSITION; + features.relativePagePosition = pagePos; + + if (spacingPreviousBlock != 0.0) { + features.spacingWithPreviousBlock = featureFactory + .linearScaling(spacingPreviousBlock - doc.getMinBlockSpacing(), + doc.getMaxBlockSpacing() - doc.getMinBlockSpacing(), NBBINS_SPACE); + } + + if (density != -1.0) { + features.characterDensity = featureFactory + .linearScaling(density - doc.getMinCharacterDensity(), + doc.getMaxCharacterDensity() - doc.getMinCharacterDensity(), NBBINS_DENSITY); + } + + features.calloutType = "UNKNOWN"; + features.calloutKnown = false; + + if (token.isSuperscript()) { + features.superscript = true; + } + + // Deferred print pattern: print previous features before overwriting + if (previousFeatures != null) { + if (features.blockStatus.equals("BLOCKSTART") && + previousFeatures.blockStatus.equals("BLOCKIN")) { + previousFeatures.blockStatus = "BLOCKEND"; + previousFeatures.lineStatus = "LINEEND"; + } + fulltext.append(previousFeatures.printVector()); + } + + mm += text.length(); + nn += text.length(); + previousFeatures = features; + } + + // Flush last feature + if (previousFeatures != null) { + fulltext.append(previousFeatures.printVector()); + } + + if (fulltext.length() == 0) { + return null; + } + + return Pair.of(fulltext.toString(), filteredTokens); + } + + /** + * Process typed areas (figures, tables) using specialized models. + * This method applies the appropriate figure and table parsers to pre-identified areas. 
     * <p>Figure areas are handled as a single combined token stream: all tokens collected for
     * figure areas (doc.getFigureTokens()) produce at most ONE Figure, even if several figure
     * areas were declared. Table areas, by contrast, are processed per area via
     * doc.getTableTokensByArea(). Ignore areas are discarded upstream and never reach here.
     *
     * <p>If the ML model fails or returns nothing, a bare Figure/Table is constructed directly
     * from the layout tokens so the area is never silently lost. Results are appended to the
     * document's annex figure/table lists (created lazily if absent).
     *
     * @param doc the document whose typed areas were populated during segmentation; may be null
     *            (no-op in that case)
     */
    protected void processTypedAreas(Document doc) {
        if (doc == null) {
            return;
        }

        LOGGER.debug("Processing typed areas: {} figures, {} tables",
            doc.getFigureAreas().size(), doc.getTableAreas().size());

        // Process figure areas using the figure ML model
        if (!doc.getFigureAreas().isEmpty() && !doc.getFigureTokens().isEmpty()) {
            // Lazily initialize the annex figure list so add() below is safe.
            if (doc.getAnnexFigures() == null) {
                doc.setAnnexFigures(new ArrayList<>());
            }

            Figure figure = null;
            try {
                // Feature generation + ML labelling; any failure falls through to the fallback.
                Pair<String, List<LayoutToken>> featurePair =
                    generateFeaturesForTokens(doc.getFigureTokens(), doc);
                if (featurePair != null && isNotBlank(featurePair.getLeft())) {
                    figure = parsers.getFigureParser().processing(
                        featurePair.getRight(), featurePair.getLeft());
                }
            } catch (Exception e) {
                LOGGER.warn("Figure ML processing failed, falling back to direct construction", e);
            }

            if (figure == null) {
                // Fallback: create Figure directly from tokens
                figure = new Figure();
                figure.setContent(new StringBuilder(LayoutTokensUtil.toText(doc.getFigureTokens())));
            }
            figure.setLayoutTokens(doc.getFigureTokens());
            // The first non-space, non-newline token determines the figure's page.
            for (LayoutToken lt : doc.getFigureTokens()) {
                if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) {
                    figure.setPage(lt.getPage());
                    break;
                }
            }
            doc.getAnnexFigures().add(figure);
            // NOTE(review): this debug line is also reached on the fallback (non-ML) path.
            LOGGER.debug("Created figure from typed areas via ML processing");
        }

        // Process table areas using the table ML model - each area separately
        if (!doc.getTableAreas().isEmpty() && !doc.getTableTokensByArea().isEmpty()) {
            // Lazily initialize the annex table list so add() below is safe.
            if (doc.getAnnexTables() == null) {
                doc.setAnnexTables(new ArrayList<>());
            }

            for (Map.Entry<IgnoreArea, List<LayoutToken>> entry : doc.getTableTokensByArea().entrySet()) {
                IgnoreArea area = entry.getKey();
                List<LayoutToken> areaTokens = entry.getValue();
                if (areaTokens.isEmpty()) {
                    continue;
                }

                List<Table> tables = null;
                try {
                    Pair<String, List<LayoutToken>> featurePair =
                        generateFeaturesForTokens(areaTokens, doc);
                    if (featurePair != null && isNotBlank(featurePair.getLeft())) {
                        tables = parsers.getTableParser().processing(
                            featurePair.getRight(), featurePair.getLeft());
                    }
                } catch (Exception e) {
                    LOGGER.warn("Table ML processing failed for area {}, falling back to direct construction", area, e);
                }

                if (CollectionUtils.isNotEmpty(tables)) {
                    for (Table table : tables) {
                        // Page assignment: first non-whitespace token of the AREA, applied to
                        // every table produced from that area.
                        for (LayoutToken lt : areaTokens) {
                            if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) {
                                table.setPage(lt.getPage());
                                break;
                            }
                        }
                        LOGGER.info("Typed area table from {}: hasContent={}, tokenCount={}",
                            area, table.getContent() != null && table.getContent().length() > 0,
                            table.getLayoutTokens() != null ? table.getLayoutTokens().size() : 0);
                        doc.getAnnexTables().add(table);
                    }
                } else {
                    // Fallback: create Table directly from tokens
                    Table table = new Table();
                    table.setLayoutTokens(areaTokens);
                    table.setContent(new StringBuilder(LayoutTokensUtil.toText(areaTokens)));
                    for (LayoutToken lt : areaTokens) {
                        if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) {
                            table.setPage(lt.getPage());
                            break;
                        }
                    }
                    LOGGER.info("Typed area table (fallback) from {}: tokenCount={}", area, areaTokens.size());
                    doc.getAnnexTables().add(table);
                }
            }
            // NOTE(review): getAnnexTables().size() counts ALL annex tables, including any that
            // existed before this call — the logged count can exceed what this pass created.
            LOGGER.info("Created {} table(s) from {} typed areas",
                doc.getAnnexTables().size(), doc.getTableTokensByArea().size());
        }

        // Note: ignored areas are intentionally discarded and no further processing is performed
        LOGGER.debug("Typed area processing completed");
    }
a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java @@ -95,6 +95,11 @@ public Pair processing(File input, String md5Str, BiblioItem r documentSource.setMD5(md5Str); Document doc = parsers.getSegmentationParser().processing(documentSource, config); + // Apply typed areas filtering if configured + if (config.getTypedAreas() != null && !config.getTypedAreas().isEmpty()) { + doc.filterLayoutTokensByTypedAreas(config.getTypedAreas()); + } + String tei = processingHeaderSection(config, doc, resHeader, true); return new ImmutablePair(tei, doc); } finally { diff --git a/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java b/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java index c68b2f11a6..1ea85b37ec 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java @@ -2,9 +2,11 @@ import java.io.File; import java.util.List; +import java.util.ArrayList; import org.grobid.core.GrobidModels; import org.grobid.core.analyzers.Analyzer; +import org.grobid.core.layout.IgnoreArea; /** * A class representing the runtime configuration values needed in the analysis chain @@ -91,9 +93,12 @@ private GrobidAnalysisConfig() { // a particular Grobid Analyzer to be used for tokenizing/filtering text private Analyzer analyzer = null; - // if true, the TEI text will be segmented into sentences + // if true, the TEI text will be segmented into sentences private boolean withSentenceSegmentation = false; + // list of typed areas for specialized processing + private List typedAreas = null; + public boolean isIncludeDiscardedText() { return includeDiscardedText; } @@ -215,6 +220,10 @@ public GrobidAnalysisConfigBuilder flavor(GrobidModels.Flavor a) { return this; } + public GrobidAnalysisConfigBuilder typedAreas(List areas) { 
/**
 * The kinds of typed area a client may declare on a PDF document.
 *
 * <p>Each constant carries the lower-case wire value used in the service API
 * ("figure", "table", "ignore"); {@link #fromString(String)} resolves that value
 * case-insensitively back to the constant.
 */
public enum AreaType {
    FIGURE("figure"),
    TABLE("table"),
    IGNORE("ignore");

    private final String value;

    AreaType(String value) {
        this.value = value;
    }

    /** @return the lower-case wire value of this area type */
    public String getValue() {
        return value;
    }

    /**
     * Resolves a wire value to its {@code AreaType}, ignoring case.
     *
     * @param value the textual type, e.g. {@code "Figure"}
     * @return the matching constant
     * @throws IllegalArgumentException if the value matches no constant (including {@code null})
     */
    public static AreaType fromString(String value) {
        for (AreaType candidate : values()) {
            if (candidate.value.equalsIgnoreCase(value)) {
                return candidate;
            }
        }
        throw new IllegalArgumentException("Unknown area type: " + value);
    }
}
+ */ +public class IgnoreArea { + private int page; // page number (1-based, following PDF convention) + private double x; // x-coordinate of upper-left corner + private double y; // y-coordinate of upper-left corner + private double width; // width of the area + private double height; // height of the area + private AreaType type; // type: figure, table, ignore + + public IgnoreArea() { + } + + public IgnoreArea(int page, double x, double y, double width, double height, AreaType type) { + this.page = page; + this.x = x; + this.y = y; + this.width = width; + this.height = height; + this.type = type; + } + + /** + * Legacy constructor for backward compatibility. + * @deprecated Use {@link #IgnoreArea(int, double, double, double, double, AreaType)} instead. + */ + @Deprecated + public IgnoreArea(int page, double x, double y, double width, double height, String name) { + this.page = page; + this.x = x; + this.y = y; + this.width = width; + this.height = height; + // Convert string name to AreaType for backward compatibility + this.type = name != null ? AreaType.fromString(name.toLowerCase()) : AreaType.IGNORE; + } + + /** + * Creates an IgnoreArea from a coordinate string in the format: "page,x,y,width,height,type" + */ + public static IgnoreArea fromCoordinates(String coordString) { + String[] parts = coordString.split(","); + if (parts.length < 6) { + throw new IllegalArgumentException("Invalid coordinate string format. 
Expected: page,x,y,width,height,type"); + } + + try { + int page = Integer.parseInt(parts[0].trim()); + double x = Double.parseDouble(parts[1].trim()); + double y = Double.parseDouble(parts[2].trim()); + double width = Double.parseDouble(parts[3].trim()); + double height = Double.parseDouble(parts[4].trim()); + AreaType type = AreaType.fromString(parts[5].trim()); + + return new IgnoreArea(page, x, y, width, height, type); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Invalid numeric values in coordinate string: " + coordString, e); + } + } + + /** + * Checks if a LayoutToken falls within or intersects with this ignore area. + * + * @param token the LayoutToken to check + * @return true if the token intersects with this ignore area + */ + public boolean contains(LayoutToken token) { + if (token.getPage() != this.page) { + return false; + } + + double tokenLeft = token.getX(); + double tokenRight = token.getX() + token.getWidth(); + double tokenTop = token.getY(); + double tokenBottom = token.getY() + token.getHeight(); + + double areaLeft = this.x; + double areaRight = this.x + this.width; + double areaTop = this.y; + double areaBottom = this.y + this.height; + + // Check for intersection: two rectangles intersect if their projections overlap on both axes + return !(tokenRight < areaLeft || + tokenLeft > areaRight || + tokenBottom < areaTop || + tokenTop > areaBottom); + } + + /** + * Creates an IgnoreArea from a coordinate string in the format: "page,x,y,width,height,name" + */ + public static IgnoreArea fromString(String coordString) { + String[] parts = coordString.split(","); + if (parts.length < 5) { + throw new IllegalArgumentException("Invalid coordinate string format. 
Expected: page,x,y,width,height[,name]"); + } + + try { + int page = Integer.parseInt(parts[0].trim()); + double x = Double.parseDouble(parts[1].trim()); + double y = Double.parseDouble(parts[2].trim()); + double width = Double.parseDouble(parts[3].trim()); + double height = Double.parseDouble(parts[4].trim()); + String name = parts.length > 5 ? parts[5].trim() : ""; + + return new IgnoreArea(page, x, y, width, height, name); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Invalid numeric values in coordinate string: " + coordString, e); + } + } + + // Getters and setters + public int getPage() { + return page; + } + + public void setPage(int page) { + this.page = page; + } + + public double getX() { + return x; + } + + public void setX(double x) { + this.x = x; + } + + public double getY() { + return y; + } + + public void setY(double y) { + this.y = y; + } + + public double getWidth() { + return width; + } + + public void setWidth(double width) { + this.width = width; + } + + public double getHeight() { + return height; + } + + public void setHeight(double height) { + this.height = height; + } + + public AreaType getType() { + return type; + } + + public void setType(AreaType type) { + this.type = type; + } + + /** + * Legacy getter for backward compatibility. + * @deprecated Use {@link #getType()} instead. + */ + @Deprecated + public String getName() { + return type != null ? type.getValue() : null; + } + + /** + * Legacy setter for backward compatibility. + * @deprecated Use {@link #setType(AreaType)} instead. + */ + @Deprecated + public void setName(String name) { + this.type = name != null ? AreaType.fromString(name.toLowerCase()) : AreaType.IGNORE; + } + + @Override + public String toString() { + return String.format("IgnoreArea{page=%d, x=%.2f, y=%.2f, width=%.2f, height=%.2f, type='%s'}", + page, x, y, width, height, type != null ? 
type.getValue() : "null"); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null || getClass() != obj.getClass()) return false; + + IgnoreArea that = (IgnoreArea) obj; + return page == that.page && + Double.compare(that.x, x) == 0 && + Double.compare(that.y, y) == 0 && + Double.compare(that.width, width) == 0 && + Double.compare(that.height, height) == 0 && + type == that.type; + } + + @Override + public int hashCode() { + int result; + long temp; + result = page; + temp = Double.doubleToLongBits(x); + result = 31 * result + (int) (temp ^ (temp >>> 32)); + temp = Double.doubleToLongBits(y); + result = 31 * result + (int) (temp ^ (temp >>> 32)); + temp = Double.doubleToLongBits(width); + result = 31 * result + (int) (temp ^ (temp >>> 32)); + temp = Double.doubleToLongBits(height); + result = 31 * result + (int) (temp ^ (temp >>> 32)); + result = 31 * result + (type != null ? type.hashCode() : 0); + return result; + } +} \ No newline at end of file diff --git a/grobid-core/src/test/java/org/grobid/core/layout/AreaTypeTest.java b/grobid-core/src/test/java/org/grobid/core/layout/AreaTypeTest.java new file mode 100644 index 0000000000..8637cea262 --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/layout/AreaTypeTest.java @@ -0,0 +1,47 @@ +package org.grobid.core.layout; + +import org.junit.Test; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; + +public class AreaTypeTest { + + @Test + public void testFromString_figure() { + AreaType type = AreaType.fromString("figure"); + assertThat(type, is(AreaType.FIGURE)); + } + + @Test + public void testFromString_table() { + AreaType type = AreaType.fromString("table"); + assertThat(type, is(AreaType.TABLE)); + } + + @Test + public void testFromString_ignore() { + AreaType type = AreaType.fromString("ignore"); + assertThat(type, is(AreaType.IGNORE)); + } + + @Test + public void testFromString_caseInsensitive() { + 
package org.grobid.core.layout;

import org.junit.Test;

import static org.hamcrest.CoreMatchers.is;
// NOTE(review): nullValue appears unused in this class — candidate for removal.
import static org.hamcrest.CoreMatchers.nullValue;
import static org.hamcrest.MatcherAssert.assertThat;

/**
 * Unit tests for {@link IgnoreArea}: constructors, string factories, geometric
 * token containment, legacy name accessors, and value semantics
 * (equals/hashCode/toString).
 */
public class IgnoreAreaTest {

    @Test
    public void testConstructor_withAreaType() {
        IgnoreArea area = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, AreaType.FIGURE);

        assertThat(area.getPage(), is(1));
        assertThat(area.getX(), is(100.0));
        assertThat(area.getY(), is(200.0));
        assertThat(area.getWidth(), is(300.0));
        assertThat(area.getHeight(), is(150.0));
        assertThat(area.getType(), is(AreaType.FIGURE));
    }

    @Test
    @SuppressWarnings("deprecation")
    public void testLegacyConstructor_withStringName() {
        IgnoreArea area = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, "figure");

        assertThat(area.getType(), is(AreaType.FIGURE));
    }

    @Test
    @SuppressWarnings("deprecation")
    public void testLegacyConstructor_withNullName_defaultsToIgnore() {
        // The (String) cast disambiguates between the AreaType and String overloads.
        IgnoreArea area = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, (String) null);

        assertThat(area.getType(), is(AreaType.IGNORE));
    }

    @Test
    public void testFromCoordinates_validString() {
        IgnoreArea area = IgnoreArea.fromCoordinates("1,100.5,200.5,300.0,150.0,table");

        assertThat(area.getPage(), is(1));
        assertThat(area.getX(), is(100.5));
        assertThat(area.getY(), is(200.5));
        assertThat(area.getWidth(), is(300.0));
        assertThat(area.getHeight(), is(150.0));
        assertThat(area.getType(), is(AreaType.TABLE));
    }

    @Test(expected = IllegalArgumentException.class)
    public void testFromCoordinates_insufficientParts() {
        IgnoreArea.fromCoordinates("1,100,200,300,150"); // missing type
    }

    @Test(expected = IllegalArgumentException.class)
    public void testFromCoordinates_invalidNumber() {
        IgnoreArea.fromCoordinates("1,abc,200,300,150,figure");
    }

    @Test
    public void testContains_tokenInsideArea() {
        IgnoreArea area = new IgnoreArea(1, 100.0, 100.0, 200.0, 200.0, AreaType.FIGURE);

        LayoutToken token = new LayoutToken();
        token.setPage(1);
        token.setX(150.0);
        token.setY(150.0);
        token.setWidth(20.0);
        token.setHeight(10.0);

        assertThat(area.contains(token), is(true));
    }

    @Test
    public void testContains_tokenOutsideArea() {
        IgnoreArea area = new IgnoreArea(1, 100.0, 100.0, 200.0, 200.0, AreaType.FIGURE);

        LayoutToken token = new LayoutToken();
        token.setPage(1);
        token.setX(500.0);
        token.setY(500.0);
        token.setWidth(20.0);
        token.setHeight(10.0);

        assertThat(area.contains(token), is(false));
    }

    @Test
    public void testContains_tokenOnDifferentPage() {
        // Same coordinates as the "inside" case, but page 2 vs area page 1.
        IgnoreArea area = new IgnoreArea(1, 100.0, 100.0, 200.0, 200.0, AreaType.FIGURE);

        LayoutToken token = new LayoutToken();
        token.setPage(2);
        token.setX(150.0);
        token.setY(150.0);
        token.setWidth(20.0);
        token.setHeight(10.0);

        assertThat(area.contains(token), is(false));
    }

    @Test
    public void testContains_tokenIntersectsArea() {
        IgnoreArea area = new IgnoreArea(1, 100.0, 100.0, 200.0, 200.0, AreaType.FIGURE);

        // Token partially inside the area
        LayoutToken token = new LayoutToken();
        token.setPage(1);
        token.setX(290.0); // Starts at the edge
        token.setY(150.0);
        token.setWidth(20.0);
        token.setHeight(10.0);

        assertThat(area.contains(token), is(true));
    }

    @Test
    public void testContains_tokenBarelyOutsideRight() {
        IgnoreArea area = new IgnoreArea(1, 100.0, 100.0, 200.0, 200.0, AreaType.FIGURE);

        LayoutToken token = new LayoutToken();
        token.setPage(1);
        token.setX(301.0); // Just outside the right edge (100 + 200 = 300)
        token.setY(150.0);
        token.setWidth(20.0);
        token.setHeight(10.0);

        assertThat(area.contains(token), is(false));
    }

    @Test
    @SuppressWarnings("deprecation")
    public void testGetName_legacy() {
        IgnoreArea area = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, AreaType.TABLE);

        assertThat(area.getName(), is("table"));
    }

    @Test
    @SuppressWarnings("deprecation")
    public void testSetName_legacy() {
        IgnoreArea area = new IgnoreArea();
        area.setName("ignore");

        assertThat(area.getType(), is(AreaType.IGNORE));
    }

    @Test
    public void testEquals() {
        IgnoreArea area1 = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, AreaType.FIGURE);
        IgnoreArea area2 = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, AreaType.FIGURE);

        assertThat(area1.equals(area2), is(true));
        assertThat(area1.hashCode(), is(area2.hashCode()));
    }

    @Test
    public void testEquals_differentType() {
        IgnoreArea area1 = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, AreaType.FIGURE);
        IgnoreArea area2 = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, AreaType.TABLE);

        assertThat(area1.equals(area2), is(false));
    }

    @Test
    public void testToString() {
        IgnoreArea area = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, AreaType.FIGURE);

        String result = area.toString();
        assertThat(result.contains("page=1"), is(true));
        assertThat(result.contains("type='figure'"), is(true));
    }

    @Test
    public void testSetters() {
        IgnoreArea area = new IgnoreArea();
        area.setPage(2);
        area.setX(50.0);
        area.setY(75.0);
        area.setWidth(100.0);
        area.setHeight(80.0);
        area.setType(AreaType.IGNORE);

        assertThat(area.getPage(), is(2));
        assertThat(area.getX(), is(50.0));
        assertThat(area.getY(), is(75.0));
        assertThat(area.getWidth(), is(100.0));
        assertThat(area.getHeight(), is(80.0));
        assertThat(area.getType(), is(AreaType.IGNORE));
    }
}
endPage, + @FormDataParam("typedAreas") String typedAreas) { int consol = validateConsolidationParam(consolidate); + List typedAreasList = parseTypedAreas(typedAreas); return restProcessFiles.processStatelessHeaderDocument( inputStream, consol, validateIncludeRawParam(includeRawAffiliations), @@ -154,6 +160,7 @@ public Response processHeaderDocumentReturnXml_post( validateIncludeRawParam(includeDiscardedText), startPage, endPage, + typedAreasList, ExpectedResponseType.XML ); } @@ -168,14 +175,17 @@ public Response processHeaderFundingDocumentReturnXml_post( @DefaultValue("0") @FormDataParam(CONSOLIDATE_FUNDERS) String consolidateFunders, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, - @DefaultValue("0") @FormDataParam(INCLUDE_DISCARDED_TEXT) String includeDiscardedText) { + @DefaultValue("0") @FormDataParam(INCLUDE_DISCARDED_TEXT) String includeDiscardedText, + @FormDataParam("typedAreas") String typedAreas) { int consolHeader = validateConsolidationParam(consolidateHeader); int consolFunders = validateConsolidationParam(consolidateFunders); + List typedAreasList = parseTypedAreas(typedAreas); return restProcessFiles.processStatelessHeaderFundingDocument( inputStream, consolHeader, consolFunders, validateIncludeRawParam(includeRawAffiliations), validateIncludeRawParam(includeRawCopyrights), - validateIncludeRawParam(includeDiscardedText) + validateIncludeRawParam(includeDiscardedText), + typedAreasList ); } @@ -191,7 +201,8 @@ public Response processStatelessHeaderDocumentReturnXml( @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, @DefaultValue("0") @FormDataParam(INCLUDE_DISCARDED_TEXT) String includeDiscardedText, @DefaultValue("0") @FormDataParam("start") int startPage, - @DefaultValue("2") @FormDataParam("end") int endPage) { + @DefaultValue("2") @FormDataParam("end") int endPage, + 
@FormDataParam("typedAreas") String typedAreas) { return processHeaderDocumentReturnXml_post( inputStream, consolidate, @@ -199,7 +210,8 @@ public Response processStatelessHeaderDocumentReturnXml( includeRawCopyrights, includeDiscardedText, startPage, - endPage + endPage, + typedAreas ); } @@ -213,13 +225,19 @@ public Response processHeaderDocumentReturnBibTeX_post( @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, @DefaultValue("0") @FormDataParam("start") int startPage, - @DefaultValue("2") @FormDataParam("end") int endPage) { + @DefaultValue("2") @FormDataParam("end") int endPage, + @FormDataParam("typedAreas") String typedAreas) { int consol = validateConsolidationParam(consolidate); + List typedAreasList = parseTypedAreas(typedAreas); return restProcessFiles.processStatelessHeaderDocument( inputStream, consol, validateIncludeRawParam(includeRawAffiliations), validateIncludeRawParam(includeRawCopyrights), + false, // includeDiscardedText - not used in BibTeX mode + startPage, + endPage, + typedAreasList, ExpectedResponseType.BIBTEX ); } @@ -234,14 +252,16 @@ public Response processStatelessHeaderDocumentReturnBibTeX( @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, @DefaultValue("0") @FormDataParam("start") int startPage, - @DefaultValue("2") @FormDataParam("end") int endPage) { + @DefaultValue("2") @FormDataParam("end") int endPage, + @FormDataParam("typedAreas") String typedAreas) { return processHeaderDocumentReturnBibTeX_post( inputStream, consolidate, includeRawAffiliations, includeRawCopyrights, startPage, - endPage + endPage, + typedAreas ); } @@ -263,12 +283,13 @@ public Response processFulltextDocument_post( @DefaultValue("-1") @FormDataParam("end") int endPage, @FormDataParam("generateIDs") String 
generateIDs, @FormDataParam("segmentSentences") String segmentSentences, - @FormDataParam("teiCoordinates") List coordinates + @FormDataParam("teiCoordinates") List coordinates, + @FormDataParam("typedAreas") String typedAreas ) throws Exception { return processFulltext( inputStream, flavor, consolidateHeader, consolidateCitations, consolidateFunders, includeRawAffiliations, includeRawCitations, includeRawCopyrights, includeDiscardedText, - startPage, endPage, generateIDs, segmentSentences, coordinates + startPage, endPage, generateIDs, segmentSentences, coordinates, typedAreas ); } @@ -290,12 +311,13 @@ public Response processFulltextDocument( @DefaultValue("-1") @FormDataParam("end") int endPage, @FormDataParam("generateIDs") String generateIDs, @FormDataParam("segmentSentences") String segmentSentences, - @FormDataParam("teiCoordinates") List coordinates + @FormDataParam("teiCoordinates") List coordinates, + @FormDataParam("typedAreas") String typedAreas ) throws Exception { return processFulltext( inputStream, flavor, consolidateHeader, consolidateCitations, consolidateFunders, includeRawAffiliations, includeRawCitations, includeRawCopyrights, includeDiscardedText, - startPage, endPage, generateIDs, segmentSentences, coordinates + startPage, endPage, generateIDs, segmentSentences, coordinates, typedAreas ); } @@ -312,7 +334,8 @@ private Response processFulltext(InputStream inputStream, int endPage, String generateIDs, String segmentSentences, - List coordinates + List coordinates, + String typedAreas ) throws Exception { int consolHeader = validateConsolidationParam(consolidateHeader); int consolCitations = validateConsolidationParam(consolidateCitations); @@ -323,6 +346,7 @@ private Response processFulltext(InputStream inputStream, GrobidModels.Flavor flavorValidated = validateModelFlavor(flavor); List teiCoordinates = collectCoordinates(coordinates); + List typedAreasList = parseTypedAreas(typedAreas); if (flavorValidated == BLANK) { return 
restProcessFiles.processFulltextDocumentBlank( @@ -341,7 +365,7 @@ private Response processFulltext(InputStream inputStream, includeRaw, validateIncludeRawParam(includeRawCopyrights), validateIncludeRawParam(includeDiscardedText), - startPage, endPage, generate, segment, teiCoordinates + startPage, endPage, generate, segment, teiCoordinates, typedAreasList ); } @@ -360,6 +384,64 @@ private List collectCoordinates(List coordinates) { return teiCoordinates; } + private List parseTypedAreas(String typedAreasJson) { + List typedAreasList = new ArrayList<>(); + + if (typedAreasJson == null || typedAreasJson.trim().isEmpty()) { + return typedAreasList; + } + + try { + // Parse JSON array of typed areas + ObjectMapper mapper = new ObjectMapper(); + JsonNode rootNode = mapper.readTree(typedAreasJson); + + if (rootNode.isArray()) { + for (JsonNode node : rootNode) { + try { + int page = node.get("page").asInt(); + double x = node.get("x").asDouble(); + double y = node.get("y").asDouble(); + double width = node.get("width").asDouble(); + double height = node.get("height").asDouble(); + + // New format: "type" field is required and should be "figure", "table", or "ignore" + if (!node.has("type")) { + LOGGER.warn("Typed area missing required 'type' field: " + node.toString()); + continue; + } + + String typeString = node.get("type").asText(); + org.grobid.core.layout.AreaType areaType = + org.grobid.core.layout.AreaType.fromString(typeString); + + org.grobid.core.layout.IgnoreArea area = + new org.grobid.core.layout.IgnoreArea(page, x, y, width, height, areaType); + typedAreasList.add(area); + } catch (Exception e) { + LOGGER.warn("Failed to parse typed area from JSON: " + node.toString(), e); + } + } + } else { + LOGGER.warn("typedAreas should be a JSON array, but received: " + typedAreasJson); + } + } catch (Exception e) { + LOGGER.error("Failed to parse typed areas JSON: " + typedAreasJson, e); + } + + if (!typedAreasList.isEmpty()) { + Map countsByType = 
typedAreasList.stream() + .collect(java.util.stream.Collectors.groupingBy( + org.grobid.core.layout.IgnoreArea::getType, + java.util.stream.Collectors.counting())); + LOGGER.info("Received {} typed areas: {}", typedAreasList.size(), countsByType); + } + + return typedAreasList; + } + + + private boolean validateGenerateIdParam(String generateIDs) { boolean generate = false; if ((generateIDs != null) && (generateIDs.equals("1") || generateIDs.equals("true"))) { @@ -401,11 +483,12 @@ public Response processFulltextAssetDocument_post( @DefaultValue("-1") @FormDataParam("end") int endPage, @FormDataParam("generateIDs") String generateIDs, @FormDataParam("segmentSentences") String segmentSentences, - @FormDataParam("teiCoordinates") List coordinates) throws Exception { + @FormDataParam("teiCoordinates") List coordinates, + @FormDataParam("typedAreas") String typedAreas) throws Exception { return processStatelessFulltextAssetHelper( inputStream, flavor, consolidateHeader, consolidateCitations, consolidateFunders, includeRawAffiliations, includeRawCitations, includeRawCopyrights, - startPage, endPage, generateIDs, segmentSentences, coordinates + startPage, endPage, generateIDs, segmentSentences, coordinates, typedAreas ); } @@ -426,11 +509,12 @@ public Response processStatelessFulltextAssetDocument( @DefaultValue("-1") @FormDataParam("end") int endPage, @FormDataParam("generateIDs") String generateIDs, @FormDataParam("segmentSentences") String segmentSentences, - @FormDataParam("teiCoordinates") List coordinates) throws Exception { + @FormDataParam("teiCoordinates") List coordinates, + @FormDataParam("typedAreas") String typedAreas) throws Exception { return processStatelessFulltextAssetHelper( inputStream, flavor, consolidateHeader, consolidateCitations, consolidateFunders, includeRawAffiliations, includeRawCitations, includeRawCopyrights, - startPage, endPage, generateIDs, segmentSentences, coordinates + startPage, endPage, generateIDs, segmentSentences, coordinates, 
typedAreas ); } @@ -446,7 +530,8 @@ private Response processStatelessFulltextAssetHelper(InputStream inputStream, int endPage, String generateIDs, String segmentSentences, - List coordinates) throws Exception { + List coordinates, + String typedAreas) throws Exception { int consolHeader = validateConsolidationParam(consolidateHeader); int consolCitations = validateConsolidationParam(consolidateCitations); int consolFunders = validateConsolidationParam(consolidateFunders); @@ -454,13 +539,14 @@ private Response processStatelessFulltextAssetHelper(InputStream inputStream, boolean generate = validateGenerateIdParam(generateIDs); boolean segment = validateGenerateIdParam(segmentSentences); List teiCoordinates = collectCoordinates(coordinates); + List typedAreasList = parseTypedAreas(typedAreas); GrobidModels.Flavor validatedModelFlavor = validateModelFlavor(flavor); return restProcessFiles.processStatelessFulltextAssetDocument( inputStream, validatedModelFlavor, consolHeader, consolCitations, consolFunders, validateIncludeRawParam(includeRawAffiliations), includeRaw, validateIncludeRawParam(includeRawCopyrights), - startPage, endPage, generate, segment, teiCoordinates + startPage, endPage, generate, segment, teiCoordinates, typedAreasList ); } diff --git a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java index e402ebcf86..ed3b90967c 100644 --- a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java +++ b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java @@ -76,6 +76,7 @@ public Response processStatelessHeaderDocument( false, 0, 2, + null, expectedResponseType ); } @@ -97,6 +98,7 @@ public Response processStatelessHeaderDocument( false, startPage, endPage, + null, expectedResponseType ); } @@ -109,6 +111,7 @@ public Response processStatelessHeaderDocument( final boolean 
includeDiscardedText, int startPage, int endPage, + final List typedAreas, ExpectedResponseType expectedResponseType ) { LOGGER.debug(methodLogIn()); @@ -141,15 +144,20 @@ public Response processStatelessHeaderDocument( BiblioItem result = new BiblioItem(); // starts conversion process + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .consolidateHeader(consolidate) + .includeRawAffiliations(includeRawAffiliations) + .includeRawCopyrights(includeRawCopyrights) + .includeDiscardedText(includeDiscardedText) + .startPage(startPage) + .endPage(endPage) + .typedAreas(typedAreas) + .build(); + retVal = engine.processHeader( originFile.getAbsolutePath(), md5Str, - consolidate, - includeRawAffiliations, - includeRawCopyrights, - includeDiscardedText, - startPage, - endPage, + config, result ); @@ -201,7 +209,8 @@ public Response processStatelessHeaderFundingDocument( final int consolidateFunders, final boolean includeRawAffiliations, final boolean includeRawCopyrights, - final boolean includeDiscardedText + final boolean includeDiscardedText, + final List typedAreas ) { LOGGER.debug(methodLogIn()); String retVal = null; @@ -231,14 +240,19 @@ public Response processStatelessHeaderFundingDocument( String md5Str = DatatypeConverter.printHexBinary(digest).toUpperCase(); // starts conversion process + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .consolidateHeader(consolidateHeader) + .consolidateFunders(consolidateFunders) + .includeRawAffiliations(includeRawAffiliations) + .includeRawCopyrights(includeRawCopyrights) + .includeDiscardedText(includeDiscardedText) + .typedAreas(typedAreas) + .build(); + retVal = engine.processHeaderFunding( originFile, md5Str, - consolidateHeader, - consolidateFunders, - includeRawAffiliations, - includeRawCopyrights, - includeDiscardedText + config ); if (GrobidRestUtils.isResultNullOrEmpty(retVal)) { @@ -300,7 +314,8 @@ public Response processFulltextDocument(final InputStream inputStream, final int endPage, 
final boolean generateIDs, final boolean segmentSentences, - final List teiCoordinates) throws Exception { + final List teiCoordinates, + final List typedAreas) throws Exception { LOGGER.debug(methodLogIn()); String retVal = null; @@ -344,6 +359,7 @@ public Response processFulltextDocument(final InputStream inputStream, .generateTeiCoordinates(teiCoordinates) .withSentenceSegmentation(segmentSentences) .flavor(flavor) + .typedAreas(typedAreas) .build(); retVal = engine.fullTextToTEI(originFile, flavor, md5Str, config); @@ -479,7 +495,8 @@ public Response processStatelessFulltextAssetDocument( final int endPage, final boolean generateIDs, final boolean segmentSentences, - final List teiCoordinates + final List teiCoordinates, + final List typedAreas ) throws Exception { LOGGER.debug(methodLogIn()); @@ -528,6 +545,7 @@ public Response processStatelessFulltextAssetDocument( .pdfAssetPath(new File(assetPath)) .withSentenceSegmentation(segmentSentences) .flavor(flavor) + .typedAreas(typedAreas) .build(); retVal = engine.fullTextToTEI(originFile, flavor, md5Str, config);