From 63f01892e8cad43a1817d218821a33cd64a16f93 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 19 Jan 2026 13:36:44 +0100 Subject: [PATCH 1/4] feat: add Typed Areas API for enhanced PDF region processing Introduce a new typedAreas API parameter that allows users to specify regions in PDF documents for specialized processing: - FIGURE: regions processed with FigureParser model - TABLE: regions processed with TableParser model - IGNORE: regions completely excluded from processing Key changes: - New AreaType enum and IgnoreArea class for typed area handling - Document.filterLayoutTokensByTypedAreas() for token categorization - GrobidAnalysisConfig builder methods for typed areas - REST API endpoints updated with typedAreas parameter - FullTextParser.processTypedAreas() for specialized parsing - HeaderParser integration for typed area filtering - Comprehensive unit tests (23 tests passing) - Complete API documentation with examples This replaces the legacy ignoreAreas parameter with full backward compatibility via deprecated constructors and methods. 
--- doc/Grobid-service.md | 132 ++++++ doc/Typed-Areas-API.md | 423 ++++++++++++++++++ doc/index.md | 2 + .../org/grobid/core/document/Document.java | 166 +++++++ .../grobid/core/engines/FullTextParser.java | 75 ++++ .../org/grobid/core/engines/HeaderParser.java | 8 + .../engines/config/GrobidAnalysisConfig.java | 31 +- .../java/org/grobid/core/layout/AreaType.java | 29 ++ .../org/grobid/core/layout/IgnoreArea.java | 219 +++++++++ .../org/grobid/core/layout/AreaTypeTest.java | 47 ++ .../grobid/core/layout/IgnoreAreaTest.java | 192 ++++++++ .../org/grobid/service/GrobidRestService.java | 160 ++++++- .../process/GrobidRestProcessFiles.java | 46 +- 13 files changed, 1495 insertions(+), 35 deletions(-) create mode 100644 doc/Typed-Areas-API.md create mode 100644 grobid-core/src/main/java/org/grobid/core/layout/AreaType.java create mode 100644 grobid-core/src/main/java/org/grobid/core/layout/IgnoreArea.java create mode 100644 grobid-core/src/test/java/org/grobid/core/layout/AreaTypeTest.java create mode 100644 grobid-core/src/test/java/org/grobid/core/layout/IgnoreAreaTest.java diff --git a/doc/Grobid-service.md b/doc/Grobid-service.md index 369a6850a0..139507bebd 100644 --- a/doc/Grobid-service.md +++ b/doc/Grobid-service.md @@ -176,6 +176,7 @@ Extract the header of the input PDF document, normalize it and convert it into a | | | | `includeRawCopyrights` | optional | `includeRawCopyrights` is a boolean value, `0` (default, do not include raw copyrights/license string in the result) or `1` (include raw copyrights/license string in the result). 
| | | | | `start` | optional | Start page number of the PDF to be considered, previous pages will be skipped/ignored, integer with first page starting at `1`, (default `-1`, start from the first page of the PDF) | | | | | `end` | optional | End page number of the PDF to be considered, next pages will be skipped/ignored, integer with first page starting at `1` (default `2`, end with the last page of the PDF) | +| | | | `typedAreas` | optional | JSON array specifying areas with coordinates and types for specialized processing (see [Typed Areas](#typed-areas) below) | Use `Accept: application/x-bibtex` to retrieve BibTeX format instead of XML TEI. @@ -229,6 +230,7 @@ Convert the complete input document into TEI XML format (header, body and biblio | | | | `start` | optional | Start page number of the PDF to be considered, previous pages will be skipped/ignored, integer with first page starting at `1`, (default `-1`, start from the first page of the PDF) | | | | | `end` | optional | End page number of the PDF to be considered, next pages will be skipped/ignored, integer with first page starting at `1` (default `-1`, end with the last page of the PDF) | | | | | `flavor` | optional | Indicate which flavor to apply for structuring the document. Useful when the default structuring cannot be applied to a specific document (e.g. the body is empty. More technical details and available flavor names in the [dedicated page](Grobid-specialized-processes.md). 
| +| | | | `typedAreas` | optional | JSON array specifying areas with coordinates and types for specialized processing (see [Typed Areas](#typed-areas) below) | Response status codes: @@ -291,6 +293,7 @@ Extract and convert all the bibliographical references present in the input docu | POST, PUT | `multipart/form-data` | `application/xml` | `input` | required | PDF file to be processed | | | | | `consolidateCitations` | optional | `consolidateCitations` is a string of value `0` (no consolidation, default value) or `1` (consolidate and inject all extra metadata), or `2` (consolidate the citation and inject DOI only). | | | | | `includeRawCitations` | optional | `includeRawCitations` is a boolean value, `0` (default. do not include raw reference string in the result) or `1` (include raw reference string in the result). | +| | | | `typedAreas` | optional | JSON array specifying areas with coordinates and types for specialized processing (see [Typed Areas](#typed-areas) below) | Use `Accept: application/x-bibtex` to retrieve BibTeX instead of TEI. @@ -318,6 +321,135 @@ It is possible to include the original raw reference string in the parsed result curl -v --form input=@./thefile.pdf --form includeRawCitations=1 localhost:8070/api/processReferences ``` +## Typed Areas + +The typed areas feature allows you to specify regions in PDF documents for specialized processing. Instead of relying solely on automatic detection, you can pre-identify areas containing figures, tables, or content to be ignored. This provides better accuracy and control over the document processing pipeline. 
+ +### Supported Area Types + +- **`figure`**: Areas containing figures/diagrams that will be processed with the specialized figure model +- **`table`**: Areas containing tables that will be processed with the specialized table model +- **`ignore`**: Areas that should be completely excluded from all processing + +### JSON Format + +The `typedAreas` parameter expects a JSON array with the following structure: + +```json +[ + { + "page": 1, + "x": 100.0, + "y": 200.0, + "width": 300.0, + "height": 150.0, + "type": "figure" + }, + { + "page": 1, + "x": 450.0, + "y": 200.0, + "width": 250.0, + "height": 200.0, + "type": "table" + }, + { + "page": 1, + "x": 50.0, + "y": 500.0, + "width": 500.0, + "height": 100.0, + "type": "ignore" + } +] +``` + +### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `page` | integer | Yes | Page number (1-based, following PDF convention) | +| `x` | number | Yes | X-coordinate of the upper-left corner of the area | +| `y` | number | Yes | Y-coordinate of the upper-left corner of the area | +| `width` | number | Yes | Width of the area | +| `height` | number | Yes | Height of the area | +| `type` | string | Yes | Area type: `"figure"`, `"table"`, or `"ignore"` | + +### Coordinate System + +The coordinate system follows the PDF convention: +- **Origin**: Upper-left corner of the page +- **Units**: Points (1/72 inch) +- **Page numbering**: 1-based (first page is page 1) + +### Processing Behavior + +**Figure areas**: +- Tokens within figure areas are extracted from the main text processing +- Applied to the specialized FigureParser model +- Results are integrated into the TEI output as structured figure elements +- Bypasses the segmentation model for improved accuracy + +**Table areas**: +- Tokens within table areas are extracted from the main text processing +- Applied to the specialized TableParser model +- Results are integrated into the TEI output as structured table elements 
+- Bypasses the segmentation model for improved accuracy + +**Ignore areas**: +- Tokens within ignore areas are completely discarded +- No further processing is performed on these regions +- Useful for excluding headers, footers, watermarks, or other unwanted content + +### Usage Examples + +**cURL example with typed areas:** +```bash +curl -v -H "Accept: application/xml" \ + --form input=@./document.pdf \ + --form typedAreas='[ + {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"}, + {"page": 1, "x": 450, "y": 200, "width": 250, "height": 200, "type": "table"} + ]' \ + localhost:8070/api/processFulltextDocument +``` + +**Python example:** +```python +import requests +import json + +typed_areas = [ + {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"}, + {"page": 1, "x": 450, "y": 200, "width": 250, "height": 200, "type": "table"} +] + +with open('document.pdf', 'rb') as f: + files = {'input': f} + data = {'typedAreas': json.dumps(typed_areas)} + response = requests.post( + 'http://localhost:8070/api/processFulltextDocument', + files=files, + data=data, + headers={'Accept': 'application/xml'} + ) +``` + +### Benefits + +1. **Improved Accuracy**: Pre-identified figures and tables bypass the segmentation model, reducing detection errors +2. **Better Quality**: Specialized models applied to known area types produce higher quality results +3. **Performance**: More efficient processing by avoiding unnecessary model applications +4. **Control**: Precise control over which regions are processed and how +5. 
**Integration**: Seamlessly integrated into existing TEI output structure + +### Error Handling + +- Invalid JSON format will result in HTTP 400 error +- Invalid area types will be logged as warnings and skipped +- Coordinates outside page boundaries will be clamped to valid ranges +- Missing required fields will cause the area to be skipped with a warning + ### Raw text to TEI conversion services #### /api/processDate diff --git a/doc/Typed-Areas-API.md b/doc/Typed-Areas-API.md new file mode 100644 index 0000000000..9320a2f69e --- /dev/null +++ b/doc/Typed-Areas-API.md @@ -0,0 +1,423 @@ +# Typed Areas API Documentation + +The Typed Areas API provides enhanced control over PDF document processing by allowing you to specify regions for specialized handling. This feature improves accuracy for figure and table extraction while providing control over content exclusion. + +## Overview + +Instead of relying solely on automatic detection, you can pre-identify areas containing: +- **Figures** - processed with specialized figure model +- **Tables** - processed with specialized table model +- **Ignored content** - completely excluded from processing + +## Supported Endpoints + +The `typedAreas` parameter is supported by the following endpoints: + +- `POST /api/processHeaderDocument` - Header extraction with typed areas +- `POST /api/processFulltextDocument` - Full document processing with typed areas +- `POST /api/processReferences` - Reference extraction with typed areas + +## Request Format + +### Parameter Specification + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `typedAreas` | string (JSON) | No | JSON array of area specifications | + +### JSON Structure + +```json +[ + { + "page": 1, + "x": 100.0, + "y": 200.0, + "width": 300.0, + "height": 150.0, + "type": "figure" + }, + { + "page": 1, + "x": 450.0, + "y": 200.0, + "width": 250.0, + "height": 200.0, + "type": "table" + }, + { + "page": 1, + "x": 50.0, + "y": 
500.0, + "width": 500.0, + "height": 100.0, + "type": "ignore" + } +] +``` + +### Field Descriptions + +- **`page`** (integer, required): Page number (1-based, following PDF convention) +- **`x`** (number, required): X-coordinate of upper-left corner in points +- **`y`** (number, required): Y-coordinate of upper-left corner in points +- **`width`** (number, required): Width of the area in points +- **`height`** (number, required): Height of the area in points +- **`type`** (string, required): Area type - `"figure"`, `"table"`, or `"ignore"` + +## Area Types and Processing + +### Figure Areas (`"type": "figure"`) + +**Processing**: +- Tokens within figure areas are extracted from main text processing +- Applied to specialized FigureParser model +- Results integrated into TEI output as structured `
` elements +- Bypasses segmentation model for improved accuracy + +**Use Cases**: +- Pre-identified figures from external OCR or layout analysis +- Complex diagrams where automatic detection fails +- Figures with known boundaries for consistent processing + +### Table Areas (`"type": "table"`) + +**Processing**: +- Tokens within table areas are extracted from main text processing +- Applied to specialized TableParser model +- Results integrated into TEI output as structured `` elements +- Bypasses segmentation model for improved accuracy + +**Use Cases**: +- Tables with complex layouts or formatting +- Pre-identified table regions from document analysis +- Tables requiring consistent extraction across documents + +### Ignore Areas (`"type": "ignore"`) + +**Processing**: +- Tokens within ignore areas are completely discarded +- No further processing performed on these regions +- Content excluded from all model processing + +**Use Cases**: +- Headers, footers, and page numbers +- Watermarks or background elements +- Marginalia or annotations +- Advertisements or irrelevant content + +## Coordinate System + +The coordinate system follows PDF conventions: + +``` +(0,0) +----------------------→ X (points) + | + | + ↓ Y (points) +``` + +- **Origin**: Upper-left corner of the page +- **Units**: Points (1/72 inch ≈ 0.353 mm) +- **Page numbering**: 1-based (first page is page 1) + +## Usage Examples + +### cURL Examples + +**Basic header processing with typed areas:** +```bash +curl -v -H "Accept: application/xml" \ + --form input=@./document.pdf \ + --form typedAreas='[ + {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"}, + {"page": 1, "x": 450, "y": 200, "width": 250, "height": 200, "type": "table"} + ]' \ + localhost:8070/api/processHeaderDocument +``` + +**Full document processing with areas:** +```bash +curl -v -H "Accept: application/xml" \ + --form input=@./document.pdf \ + --form consolidateHeader=1 \ + --form typedAreas='[ + {"page": 1, 
"x": 50, "y": 750, "width": 500, "height": 50, "type": "ignore"}, + {"page": 2, "x": 100, "y": 100, "width": 400, "height": 300, "type": "figure"} + ]' \ + localhost:8070/api/processFulltextDocument +``` + +### Python Examples + +**Using requests library:** +```python +import requests +import json + +# Define typed areas +typed_areas = [ + {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"}, + {"page": 1, "x": 450, "y": 200, "width": 250, "height": 200, "type": "table"}, + {"page": 1, "x": 50, "y": 750, "width": 500, "height": 30, "type": "ignore"} +] + +# Process document +with open('document.pdf', 'rb') as f: + files = {'input': f} + data = { + 'typedAreas': json.dumps(typed_areas), + 'consolidateHeader': '1', + 'segmentSentences': '1' + } + + response = requests.post( + 'http://localhost:8070/api/processFulltextDocument', + files=files, + data=data, + headers={'Accept': 'application/xml'} + ) + + if response.status_code == 200: + print(response.text) + else: + print(f"Error: {response.status_code} - {response.text}") +``` + +**Complex processing with multiple parameters:** +```python +import requests +import json + +def process_with_typed_areas(pdf_path, typed_areas, endpoint="processFulltextDocument"): + """Process a PDF with typed areas and additional parameters.""" + + url = f"http://localhost:8070/api/{endpoint}" + + with open(pdf_path, 'rb') as f: + files = {'input': f} + data = { + 'typedAreas': json.dumps(typed_areas), + 'consolidateHeader': '1', + 'consolidateCitations': '1', + 'segmentSentences': '1', + 'generateIDs': '1', + 'includeRawCitations': '1' + } + + response = requests.post( + url, + files=files, + data=data, + headers={'Accept': 'application/xml'} + ) + + return response + +# Example usage +figure_areas = [ + {"page": 1, "x": 85, "y": 120, "width": 440, "height": 280, "type": "figure"}, + {"page": 2, "x": 85, "y": 200, "width": 300, "height": 200, "type": "table"} +] + +response = 
process_with_typed_areas("research_paper.pdf", figure_areas) +print(response.status_code) +``` + +### JavaScript Examples + +**Using fetch API:** +```javascript +async function processWithTypedAreas(pdfFile, typedAreas) { + const formData = new FormData(); + formData.append('input', pdfFile); + formData.append('typedAreas', JSON.stringify(typedAreas)); + formData.append('consolidateHeader', '1'); + formData.append('segmentSentences', '1'); + + try { + const response = await fetch( + 'http://localhost:8070/api/processFulltextDocument', + { + method: 'POST', + body: formData, + headers: { + 'Accept': 'application/xml' + } + } + ); + + if (response.ok) { + const result = await response.text(); + return result; + } else { + throw new Error(`HTTP error! status: ${response.status}`); + } + } catch (error) { + console.error('Error processing document:', error); + throw error; + } +} + +// Usage example +const typedAreas = [ + {page: 1, x: 100, y: 200, width: 300, height: 150, type: "figure"}, + {page: 1, x: 450, y: 200, width: 250, height: 200, type: "table"} +]; + +const fileInput = document.getElementById('pdf-input'); +fileInput.addEventListener('change', async (event) => { + const file = event.target.files[0]; + if (file) { + try { + const result = await processWithTypedAreas(file, typedAreas); + console.log('Processing result:', result); + } catch (error) { + console.error('Processing failed:', error); + } + } +}); +``` + +## Error Handling + +### Common Error Scenarios + +**Invalid JSON format:** +```json +// Invalid - missing quotes around type +{"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, type: figure} +``` +**Error**: HTTP 400 - "Invalid JSON format" + +**Invalid area type:** +```json +// Invalid - unsupported area type +{"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "diagram"} +``` +**Behavior**: Area logged as warning and skipped + +**Missing required fields:** +```json +// Invalid - missing type field +{"page": 1, "x": 
100, "y": 200, "width": 300, "height": 150} +``` +**Behavior**: Area logged as warning and skipped + +**Invalid coordinates:** +```json +// Valid but outside bounds - will be clamped +{"page": 1, "x": -100, "y": 200, "width": 300, "height": 150, "type": "figure"} +``` +**Behavior**: Coordinates clamped to valid page boundaries + +### Response Status Codes + +| Status Code | Description | +|-------------|-------------| +| 200 | Successful processing with typed areas | +| 204 | Processing completed but no content extracted | +| 400 | Invalid request (malformed JSON, missing parameters) | +| 500 | Internal server error during processing | +| 503 | Service unavailable (all threads in use) | + +## Integration with Existing Workflow + +### Combining with Other Parameters + +Typed areas work seamlessly with all existing GROBID parameters: + +```bash +curl -v -H "Accept: application/xml" \ + --form input=@./document.pdf \ + --form typedAreas='[{"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"}]' \ + --form consolidateHeader=1 \ + --form consolidateCitations=1 \ + --form segmentSentences=1 \ + --form generateIDs=1 \ + --form teiCoordinates=figure,table \ + localhost:8070/api/processFulltextDocument +``` + +### TEI Output Structure + +Processed typed areas are integrated into the standard TEI output: + +```xml +
<figure xml:id="fig_0"> +    <head>Figure 1: Sample Figure</head> +    <figDesc>Description extracted from specialized processing</figDesc> +</figure> + +<figure type="table" xml:id="tab_0"> +    <head>Table 1: Sample Data</head> +    <table> +        <row> +            <cell>Header 1</cell> +            <cell>Header 2</cell> +        </row> +    </table> +</figure>
+``` + +## Performance Considerations + +### Optimization Tips + +1. **Area Size**: Define areas as tightly as possible around content +2. **Overlapping Areas**: Avoid overlapping typed areas - results may be unpredictable +3. **Large Documents**: Consider processing pages individually for very large documents +4. **Batch Processing**: Reuse area definitions across similar documents when possible + +### Performance Impact + +- **Improved**: Bypassing segmentation for pre-identified areas +- **Overhead**: JSON parsing and area coordinate calculations +- **Memory**: Additional token lists for different area types +- **Overall**: Typically faster processing for documents with many pre-identified figures/tables + +## Migration from Legacy ignoreAreas + +The new `typedAreas` parameter replaces the legacy `ignoreAreas` parameter: + +**Old format (deprecated):** +```json +[ + {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "name": "figure"} +] +``` + +**New format (required):** +```json +[ + {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"}, + {"page": 1, "x": 400, "y": 200, "width": 200, "height": 100, "type": "ignore"} +] +``` + +**Key Changes:** +- `name` field replaced with required `type` field +- Support for figure and table processing (not just ignoring) +- Type-safe area classification with enum validation + +## Troubleshooting + +### Common Issues + +1. **Areas not being processed**: Check JSON format and field names +2. **Incorrect coordinates**: Verify coordinate system and page numbering +3. **Partial extraction**: Ensure areas fully encompass target content +4. **Performance issues**: Reduce number of areas or make them more precise + +### Debugging Tips + +1. **Start simple**: Test with a single, well-defined area +2. **Verify coordinates**: Use PDF viewer to confirm area boundaries +3. **Check logs**: Server logs provide detailed error messages for invalid areas +4. 
**Validate JSON**: Use JSON validator to ensure correct syntax + +### Getting Help + +- **Documentation**: See [GROBID Service API](Grobid-service.md) for general API usage +- **Issues**: Report bugs or request features via GitHub issues +- **Community**: Join discussions for usage tips and best practices \ No newline at end of file diff --git a/doc/index.md b/doc/index.md index 38115c2e2f..1cae85f318 100644 --- a/doc/index.md +++ b/doc/index.md @@ -17,6 +17,8 @@ * [Use GROBID as a service](Grobid-service.md) +* [Typed Areas API Documentation](Typed-Areas-API.md) + * [Build a GROBID development environment](Install-Grobid.md) * [Manage GROBID with containers (Docker)](Grobid-docker.md) diff --git a/grobid-core/src/main/java/org/grobid/core/document/Document.java b/grobid-core/src/main/java/org/grobid/core/document/Document.java index 389469c6f6..81babc9f1f 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/Document.java +++ b/grobid-core/src/main/java/org/grobid/core/document/Document.java @@ -27,6 +27,8 @@ import org.grobid.core.layout.Cluster; import org.grobid.core.layout.GraphicObject; import org.grobid.core.layout.GraphicObjectType; +import org.grobid.core.layout.IgnoreArea; +import org.grobid.core.layout.AreaType; import org.grobid.core.layout.LayoutToken; import org.grobid.core.layout.PDFAnnotation; import org.grobid.core.layout.Page; @@ -156,6 +158,16 @@ public void setImages(List images) { protected transient List equations; protected transient List annexEquations; + // typed areas for specialized processing + protected transient List figureAreas = new ArrayList<>(); + protected transient List tableAreas = new ArrayList<>(); + protected transient List ignoredAreas = new ArrayList<>(); + + // tokens extracted from typed areas for specialized processing + protected transient List figureTokens = new ArrayList<>(); + protected transient List tableTokens = new ArrayList<>(); + protected transient List ignoredTokens = new ArrayList<>(); + // the 
analyzer/tokenizer used for processing this document protected transient Analyzer analyzer = GrobidAnalyzer.getInstance(); @@ -1703,4 +1715,158 @@ public List
getAnnexFigures() { public void setAnnexFigures(List
annexFigures) { this.annexFigures = annexFigures; } + + // Typed area getters and setters + public List getFigureAreas() { + return figureAreas; + } + + public void setFigureAreas(List figureAreas) { + this.figureAreas = figureAreas != null ? figureAreas : new ArrayList<>(); + } + + public List getTableAreas() { + return tableAreas; + } + + public void setTableAreas(List tableAreas) { + this.tableAreas = tableAreas != null ? tableAreas : new ArrayList<>(); + } + + public List getIgnoredAreas() { + return ignoredAreas; + } + + public void setIgnoredAreas(List ignoredAreas) { + this.ignoredAreas = ignoredAreas != null ? ignoredAreas : new ArrayList<>(); + } + + // Token getters and setters for typed areas + public List getFigureTokens() { + return figureTokens; + } + + public void setFigureTokens(List figureTokens) { + this.figureTokens = figureTokens != null ? figureTokens : new ArrayList<>(); + } + + public List getTableTokens() { + return tableTokens; + } + + public void setTableTokens(List tableTokens) { + this.tableTokens = tableTokens != null ? tableTokens : new ArrayList<>(); + } + + public List getIgnoredTokens() { + return ignoredTokens; + } + + public void setIgnoredTokens(List ignoredTokens) { + this.ignoredTokens = ignoredTokens != null ? ignoredTokens : new ArrayList<>(); + } + + /** + * Filters out layout tokens that fall within the specified ignore areas and categorizes them by type. + * This replaces the old filterLayoutTokensByIgnoreAreas method to support typed areas. 
+ * + * @param typedAreas list of typed areas for specialized processing + */ + public void filterLayoutTokensByTypedAreas(List typedAreas) { + if (typedAreas == null || typedAreas.isEmpty() || tokenizations == null || tokenizations.isEmpty()) { + return; + } + + LOGGER.debug("Processing {} typed areas", typedAreas.size()); + + // Clear previous token lists + figureTokens.clear(); + tableTokens.clear(); + ignoredTokens.clear(); + figureAreas.clear(); + tableAreas.clear(); + ignoredAreas.clear(); + + // Categorize areas by type + for (IgnoreArea area : typedAreas) { + if (area.getType() == null) { + continue; + } + + switch (area.getType()) { + case FIGURE: + figureAreas.add(area); + break; + case TABLE: + tableAreas.add(area); + break; + case IGNORE: + ignoredAreas.add(area); + break; + } + } + + List filteredTokens = new ArrayList<>(); + int figureTokenCount = 0; + int tableTokenCount = 0; + int ignoredTokenCount = 0; + + for (LayoutToken token : tokenizations) { + boolean tokenProcessed = false; + + // Check if token intersects with any typed area + for (IgnoreArea area : typedAreas) { + if (area.contains(token)) { + switch (area.getType()) { + case FIGURE: + figureTokens.add(token); + figureTokenCount++; + tokenProcessed = true; + break; + case TABLE: + tableTokens.add(token); + tableTokenCount++; + tokenProcessed = true; + break; + case IGNORE: + ignoredTokens.add(token); + ignoredTokenCount++; + tokenProcessed = true; + break; + } + if (tokenProcessed) { + break; + } + } + } + + // Keep token only if it wasn't processed by any typed area + if (!tokenProcessed) { + filteredTokens.add(token); + } + } + + tokenizations = filteredTokens; + LOGGER.debug("Processed typed areas: {} figure tokens, {} table tokens, {} ignored tokens, {} main tokens remaining", + figureTokenCount, tableTokenCount, ignoredTokenCount, tokenizations.size()); + } + + /** + * Legacy method for backward compatibility. + * @deprecated Use {@link #filterLayoutTokensByTypedAreas(List)} instead. 
+ */ + @Deprecated + public void filterLayoutTokensByIgnoreAreas(List ignoreAreas) { + // Convert all ignore areas to IGNORE type and use new method + List typedAreas = new ArrayList<>(); + if (ignoreAreas != null) { + for (IgnoreArea area : ignoreAreas) { + // Create a new area with IGNORE type + IgnoreArea ignoreArea = new IgnoreArea(area.getPage(), area.getX(), area.getY(), + area.getWidth(), area.getHeight(), AreaType.IGNORE); + typedAreas.add(ignoreArea); + } + } + filterLayoutTokensByTypedAreas(typedAreas); + } } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 698c7b8c1c..3cb8d4451f 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -155,6 +155,17 @@ public Document processing(DocumentSource documentSource, try { // general segmentation Document doc = parsers.getSegmentationParser(flavor).processing(documentSource, config); + + // Apply typed areas filtering if configured (takes precedence over legacy ignoreAreas) + if (config.getTypedAreas() != null && !config.getTypedAreas().isEmpty()) { + doc.filterLayoutTokensByTypedAreas(config.getTypedAreas()); + // Apply specialized processing for figures and tables + processTypedAreas(doc); + } else if (config.getIgnoreAreas() != null && !config.getIgnoreAreas().isEmpty()) { + // Legacy support for old ignoreAreas + doc.filterLayoutTokensByIgnoreAreas(config.getIgnoreAreas()); + } + SortedSet documentBodyParts = doc.getDocumentPart(SegmentationLabels.BODY); // header processing @@ -724,6 +735,16 @@ public Document processingHeaderFunding(DocumentSource documentSource, // general segmentation Document doc = parsers.getSegmentationParser().processing(documentSource, config); + // Apply typed areas filtering if configured (takes precedence over legacy ignoreAreas) + if (config.getTypedAreas() != null 
&& !config.getTypedAreas().isEmpty()) { + doc.filterLayoutTokensByTypedAreas(config.getTypedAreas()); + // Apply specialized processing for figures and tables + processTypedAreas(doc); + } else if (config.getIgnoreAreas() != null && !config.getIgnoreAreas().isEmpty()) { + // Legacy support for old ignoreAreas + doc.filterLayoutTokensByIgnoreAreas(config.getIgnoreAreas()); + } + // header processing BiblioItem resHeader = new BiblioItem(); Pair featSeg = null; @@ -3543,6 +3564,60 @@ public static List getDocumentFullTextTokens(List()); + } + doc.getAnnexFigures().add(processedFigure); + LOGGER.debug("Processed figure from typed areas"); + } + } catch (Exception e) { + LOGGER.warn("Error processing figure areas: " + e.getMessage(), e); + } + } + + // Process table areas + if (!doc.getTableAreas().isEmpty() && !doc.getTableTokens().isEmpty()) { + try { + List processedTables = parsers.getTableParser() + .processing(doc.getTableTokens(), null); + + if (processedTables != null && !processedTables.isEmpty()) { + // Add processed tables to document's annex tables + if (doc.getAnnexTables() == null) { + doc.setAnnexTables(new ArrayList<>()); + } + doc.getAnnexTables().addAll(processedTables); + LOGGER.debug("Processed {} tables from typed areas", processedTables.size()); + } + } catch (Exception e) { + LOGGER.warn("Error processing table areas: " + e.getMessage(), e); + } + } + + // Note: ignored areas are intentionally discarded and no further processing is performed + LOGGER.debug("Typed area processing completed"); + } + @Override public void close() throws IOException { super.close(); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java index 7234ff54b5..acac33e47e 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java @@ -95,6 +95,14 @@ public Pair processing(File input, 
String md5Str, BiblioItem r documentSource.setMD5(md5Str); Document doc = parsers.getSegmentationParser().processing(documentSource, config); + // Apply typed areas filtering if configured (takes precedence over legacy ignoreAreas) + if (config.getTypedAreas() != null && !config.getTypedAreas().isEmpty()) { + doc.filterLayoutTokensByTypedAreas(config.getTypedAreas()); + } else if (config.getIgnoreAreas() != null && !config.getIgnoreAreas().isEmpty()) { + // Legacy support for old ignoreAreas + doc.filterLayoutTokensByIgnoreAreas(config.getIgnoreAreas()); + } + String tei = processingHeaderSection(config, doc, resHeader, true); return new ImmutablePair(tei, doc); } finally { diff --git a/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java b/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java index c68b2f11a6..bfde090d5f 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java @@ -2,9 +2,11 @@ import java.io.File; import java.util.List; +import java.util.ArrayList; import org.grobid.core.GrobidModels; import org.grobid.core.analyzers.Analyzer; +import org.grobid.core.layout.IgnoreArea; /** * A class representing the runtime configuration values needed in the analysis chain @@ -91,9 +93,15 @@ private GrobidAnalysisConfig() { // a particular Grobid Analyzer to be used for tokenizing/filtering text private Analyzer analyzer = null; - // if true, the TEI text will be segmented into sentences + // if true, the TEI text will be segmented into sentences private boolean withSentenceSegmentation = false; + // list of areas to ignore during processing (legacy) + private List ignoreAreas = null; + + // list of typed areas for specialized processing + private List typedAreas = null; + public boolean isIncludeDiscardedText() { return includeDiscardedText; } @@ -215,6 +223,15 @@ public 
GrobidAnalysisConfigBuilder flavor(GrobidModels.Flavor a) { return this; } + public GrobidAnalysisConfigBuilder ignoreAreas(List areas) { + config.ignoreAreas = areas; + return this; + } + + public GrobidAnalysisConfigBuilder typedAreas(List areas) { + config.typedAreas = areas; + return this; + } public GrobidAnalysisConfig build() { postProcessAndValidate(); @@ -342,4 +359,16 @@ public boolean isWithSentenceSegmentation() { public String getFlavor() { return flavor; } + + public List getIgnoreAreas() { + return ignoreAreas; + } + + public List getTypedAreas() { + return typedAreas; + } + + public void setTypedAreas(List typedAreas) { + this.typedAreas = typedAreas; + } } diff --git a/grobid-core/src/main/java/org/grobid/core/layout/AreaType.java b/grobid-core/src/main/java/org/grobid/core/layout/AreaType.java new file mode 100644 index 0000000000..f7fb2a74aa --- /dev/null +++ b/grobid-core/src/main/java/org/grobid/core/layout/AreaType.java @@ -0,0 +1,29 @@ +package org.grobid.core.layout; + +/** + * Enumeration of supported area types for typed area processing. + */ +public enum AreaType { + FIGURE("figure"), + TABLE("table"), + IGNORE("ignore"); + + private final String value; + + AreaType(String value) { + this.value = value; + } + + public String getValue() { + return value; + } + + public static AreaType fromString(String value) { + for (AreaType type : AreaType.values()) { + if (type.value.equalsIgnoreCase(value)) { + return type; + } + } + throw new IllegalArgumentException("Unknown area type: " + value); + } +} \ No newline at end of file diff --git a/grobid-core/src/main/java/org/grobid/core/layout/IgnoreArea.java b/grobid-core/src/main/java/org/grobid/core/layout/IgnoreArea.java new file mode 100644 index 0000000000..b7cdb7430d --- /dev/null +++ b/grobid-core/src/main/java/org/grobid/core/layout/IgnoreArea.java @@ -0,0 +1,219 @@ +package org.grobid.core.layout; + +/** + * Represents a typed area in a PDF document for specialized processing. 
+ * This includes areas containing figures, tables, or content to be ignored. + */ +public class IgnoreArea { + private int page; // page number (1-based, following PDF convention) + private double x; // x-coordinate of upper-left corner + private double y; // y-coordinate of upper-left corner + private double width; // width of the area + private double height; // height of the area + private AreaType type; // type: figure, table, ignore + + public IgnoreArea() { + } + + public IgnoreArea(int page, double x, double y, double width, double height, AreaType type) { + this.page = page; + this.x = x; + this.y = y; + this.width = width; + this.height = height; + this.type = type; + } + + /** + * Legacy constructor for backward compatibility. + * @deprecated Use {@link #IgnoreArea(int, double, double, double, double, AreaType)} instead. + */ + @Deprecated + public IgnoreArea(int page, double x, double y, double width, double height, String name) { + this.page = page; + this.x = x; + this.y = y; + this.width = width; + this.height = height; + // Convert string name to AreaType for backward compatibility + this.type = name != null ? AreaType.fromString(name.toLowerCase()) : AreaType.IGNORE; + } + + /** + * Creates an IgnoreArea from a coordinate string in the format: "page,x,y,width,height,type" + */ + public static IgnoreArea fromCoordinates(String coordString) { + String[] parts = coordString.split(","); + if (parts.length < 6) { + throw new IllegalArgumentException("Invalid coordinate string format. 
Expected: page,x,y,width,height,type"); + } + + try { + int page = Integer.parseInt(parts[0].trim()); + double x = Double.parseDouble(parts[1].trim()); + double y = Double.parseDouble(parts[2].trim()); + double width = Double.parseDouble(parts[3].trim()); + double height = Double.parseDouble(parts[4].trim()); + AreaType type = AreaType.fromString(parts[5].trim()); + + return new IgnoreArea(page, x, y, width, height, type); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Invalid numeric values in coordinate string: " + coordString, e); + } + } + + /** + * Checks if a LayoutToken falls within or intersects with this ignore area. + * + * @param token the LayoutToken to check + * @return true if the token intersects with this ignore area + */ + public boolean contains(LayoutToken token) { + if (token.getPage() != this.page) { + return false; + } + + double tokenLeft = token.getX(); + double tokenRight = token.getX() + token.getWidth(); + double tokenTop = token.getY(); + double tokenBottom = token.getY() + token.getHeight(); + + double areaLeft = this.x; + double areaRight = this.x + this.width; + double areaTop = this.y; + double areaBottom = this.y + this.height; + + // Check for intersection: two rectangles intersect if their projections overlap on both axes + return !(tokenRight < areaLeft || + tokenLeft > areaRight || + tokenBottom < areaTop || + tokenTop > areaBottom); + } + + /** + * Creates an IgnoreArea from a coordinate string in the format: "page,x,y,width,height,name" + */ + public static IgnoreArea fromString(String coordString) { + String[] parts = coordString.split(","); + if (parts.length < 5) { + throw new IllegalArgumentException("Invalid coordinate string format. 
Expected: page,x,y,width,height[,name]"); + } + + try { + int page = Integer.parseInt(parts[0].trim()); + double x = Double.parseDouble(parts[1].trim()); + double y = Double.parseDouble(parts[2].trim()); + double width = Double.parseDouble(parts[3].trim()); + double height = Double.parseDouble(parts[4].trim()); + String name = parts.length > 5 ? parts[5].trim() : ""; + + return new IgnoreArea(page, x, y, width, height, name); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Invalid numeric values in coordinate string: " + coordString, e); + } + } + + // Getters and setters + public int getPage() { + return page; + } + + public void setPage(int page) { + this.page = page; + } + + public double getX() { + return x; + } + + public void setX(double x) { + this.x = x; + } + + public double getY() { + return y; + } + + public void setY(double y) { + this.y = y; + } + + public double getWidth() { + return width; + } + + public void setWidth(double width) { + this.width = width; + } + + public double getHeight() { + return height; + } + + public void setHeight(double height) { + this.height = height; + } + + public AreaType getType() { + return type; + } + + public void setType(AreaType type) { + this.type = type; + } + + /** + * Legacy getter for backward compatibility. + * @deprecated Use {@link #getType()} instead. + */ + @Deprecated + public String getName() { + return type != null ? type.getValue() : null; + } + + /** + * Legacy setter for backward compatibility. + * @deprecated Use {@link #setType(AreaType)} instead. + */ + @Deprecated + public void setName(String name) { + this.type = name != null ? AreaType.fromString(name.toLowerCase()) : AreaType.IGNORE; + } + + @Override + public String toString() { + return String.format("IgnoreArea{page=%d, x=%.2f, y=%.2f, width=%.2f, height=%.2f, type='%s'}", + page, x, y, width, height, type != null ? 
type.getValue() : "null"); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null || getClass() != obj.getClass()) return false; + + IgnoreArea that = (IgnoreArea) obj; + return page == that.page && + Double.compare(that.x, x) == 0 && + Double.compare(that.y, y) == 0 && + Double.compare(that.width, width) == 0 && + Double.compare(that.height, height) == 0 && + type == that.type; + } + + @Override + public int hashCode() { + int result; + long temp; + result = page; + temp = Double.doubleToLongBits(x); + result = 31 * result + (int) (temp ^ (temp >>> 32)); + temp = Double.doubleToLongBits(y); + result = 31 * result + (int) (temp ^ (temp >>> 32)); + temp = Double.doubleToLongBits(width); + result = 31 * result + (int) (temp ^ (temp >>> 32)); + temp = Double.doubleToLongBits(height); + result = 31 * result + (int) (temp ^ (temp >>> 32)); + result = 31 * result + (type != null ? type.hashCode() : 0); + return result; + } +} \ No newline at end of file diff --git a/grobid-core/src/test/java/org/grobid/core/layout/AreaTypeTest.java b/grobid-core/src/test/java/org/grobid/core/layout/AreaTypeTest.java new file mode 100644 index 0000000000..8637cea262 --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/layout/AreaTypeTest.java @@ -0,0 +1,47 @@ +package org.grobid.core.layout; + +import org.junit.Test; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; + +public class AreaTypeTest { + + @Test + public void testFromString_figure() { + AreaType type = AreaType.fromString("figure"); + assertThat(type, is(AreaType.FIGURE)); + } + + @Test + public void testFromString_table() { + AreaType type = AreaType.fromString("table"); + assertThat(type, is(AreaType.TABLE)); + } + + @Test + public void testFromString_ignore() { + AreaType type = AreaType.fromString("ignore"); + assertThat(type, is(AreaType.IGNORE)); + } + + @Test + public void testFromString_caseInsensitive() { + 
assertThat(AreaType.fromString("FIGURE"), is(AreaType.FIGURE)); + assertThat(AreaType.fromString("TABLE"), is(AreaType.TABLE)); + assertThat(AreaType.fromString("IGNORE"), is(AreaType.IGNORE)); + assertThat(AreaType.fromString("Figure"), is(AreaType.FIGURE)); + } + + @Test(expected = IllegalArgumentException.class) + public void testFromString_invalidType() { + AreaType.fromString("invalid"); + } + + @Test + public void testGetValue() { + assertThat(AreaType.FIGURE.getValue(), is("figure")); + assertThat(AreaType.TABLE.getValue(), is("table")); + assertThat(AreaType.IGNORE.getValue(), is("ignore")); + } +} diff --git a/grobid-core/src/test/java/org/grobid/core/layout/IgnoreAreaTest.java b/grobid-core/src/test/java/org/grobid/core/layout/IgnoreAreaTest.java new file mode 100644 index 0000000000..0cdf66f57f --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/layout/IgnoreAreaTest.java @@ -0,0 +1,192 @@ +package org.grobid.core.layout; + +import org.junit.Test; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.CoreMatchers.nullValue; +import static org.hamcrest.MatcherAssert.assertThat; + +public class IgnoreAreaTest { + + @Test + public void testConstructor_withAreaType() { + IgnoreArea area = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, AreaType.FIGURE); + + assertThat(area.getPage(), is(1)); + assertThat(area.getX(), is(100.0)); + assertThat(area.getY(), is(200.0)); + assertThat(area.getWidth(), is(300.0)); + assertThat(area.getHeight(), is(150.0)); + assertThat(area.getType(), is(AreaType.FIGURE)); + } + + @Test + @SuppressWarnings("deprecation") + public void testLegacyConstructor_withStringName() { + IgnoreArea area = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, "figure"); + + assertThat(area.getType(), is(AreaType.FIGURE)); + } + + @Test + @SuppressWarnings("deprecation") + public void testLegacyConstructor_withNullName_defaultsToIgnore() { + IgnoreArea area = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, (String) null); + + 
assertThat(area.getType(), is(AreaType.IGNORE)); + } + + @Test + public void testFromCoordinates_validString() { + IgnoreArea area = IgnoreArea.fromCoordinates("1,100.5,200.5,300.0,150.0,table"); + + assertThat(area.getPage(), is(1)); + assertThat(area.getX(), is(100.5)); + assertThat(area.getY(), is(200.5)); + assertThat(area.getWidth(), is(300.0)); + assertThat(area.getHeight(), is(150.0)); + assertThat(area.getType(), is(AreaType.TABLE)); + } + + @Test(expected = IllegalArgumentException.class) + public void testFromCoordinates_insufficientParts() { + IgnoreArea.fromCoordinates("1,100,200,300,150"); // missing type + } + + @Test(expected = IllegalArgumentException.class) + public void testFromCoordinates_invalidNumber() { + IgnoreArea.fromCoordinates("1,abc,200,300,150,figure"); + } + + @Test + public void testContains_tokenInsideArea() { + IgnoreArea area = new IgnoreArea(1, 100.0, 100.0, 200.0, 200.0, AreaType.FIGURE); + + LayoutToken token = new LayoutToken(); + token.setPage(1); + token.setX(150.0); + token.setY(150.0); + token.setWidth(20.0); + token.setHeight(10.0); + + assertThat(area.contains(token), is(true)); + } + + @Test + public void testContains_tokenOutsideArea() { + IgnoreArea area = new IgnoreArea(1, 100.0, 100.0, 200.0, 200.0, AreaType.FIGURE); + + LayoutToken token = new LayoutToken(); + token.setPage(1); + token.setX(500.0); + token.setY(500.0); + token.setWidth(20.0); + token.setHeight(10.0); + + assertThat(area.contains(token), is(false)); + } + + @Test + public void testContains_tokenOnDifferentPage() { + IgnoreArea area = new IgnoreArea(1, 100.0, 100.0, 200.0, 200.0, AreaType.FIGURE); + + LayoutToken token = new LayoutToken(); + token.setPage(2); + token.setX(150.0); + token.setY(150.0); + token.setWidth(20.0); + token.setHeight(10.0); + + assertThat(area.contains(token), is(false)); + } + + @Test + public void testContains_tokenIntersectsArea() { + IgnoreArea area = new IgnoreArea(1, 100.0, 100.0, 200.0, 200.0, AreaType.FIGURE); + + // 
Token partially inside the area + LayoutToken token = new LayoutToken(); + token.setPage(1); + token.setX(290.0); // Starts at the edge + token.setY(150.0); + token.setWidth(20.0); + token.setHeight(10.0); + + assertThat(area.contains(token), is(true)); + } + + @Test + public void testContains_tokenBarelyOutsideRight() { + IgnoreArea area = new IgnoreArea(1, 100.0, 100.0, 200.0, 200.0, AreaType.FIGURE); + + LayoutToken token = new LayoutToken(); + token.setPage(1); + token.setX(301.0); // Just outside the right edge (100 + 200 = 300) + token.setY(150.0); + token.setWidth(20.0); + token.setHeight(10.0); + + assertThat(area.contains(token), is(false)); + } + + @Test + @SuppressWarnings("deprecation") + public void testGetName_legacy() { + IgnoreArea area = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, AreaType.TABLE); + + assertThat(area.getName(), is("table")); + } + + @Test + @SuppressWarnings("deprecation") + public void testSetName_legacy() { + IgnoreArea area = new IgnoreArea(); + area.setName("ignore"); + + assertThat(area.getType(), is(AreaType.IGNORE)); + } + + @Test + public void testEquals() { + IgnoreArea area1 = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, AreaType.FIGURE); + IgnoreArea area2 = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, AreaType.FIGURE); + + assertThat(area1.equals(area2), is(true)); + assertThat(area1.hashCode(), is(area2.hashCode())); + } + + @Test + public void testEquals_differentType() { + IgnoreArea area1 = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, AreaType.FIGURE); + IgnoreArea area2 = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, AreaType.TABLE); + + assertThat(area1.equals(area2), is(false)); + } + + @Test + public void testToString() { + IgnoreArea area = new IgnoreArea(1, 100.0, 200.0, 300.0, 150.0, AreaType.FIGURE); + + String result = area.toString(); + assertThat(result.contains("page=1"), is(true)); + assertThat(result.contains("type='figure'"), is(true)); + } + + @Test + public void testSetters() { + IgnoreArea 
area = new IgnoreArea(); + area.setPage(2); + area.setX(50.0); + area.setY(75.0); + area.setWidth(100.0); + area.setHeight(80.0); + area.setType(AreaType.IGNORE); + + assertThat(area.getPage(), is(2)); + assertThat(area.getX(), is(50.0)); + assertThat(area.getY(), is(75.0)); + assertThat(area.getWidth(), is(100.0)); + assertThat(area.getHeight(), is(80.0)); + assertThat(area.getType(), is(AreaType.IGNORE)); + } +} diff --git a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java index 2259964955..446e3905df 100755 --- a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java +++ b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java @@ -27,6 +27,8 @@ import org.grobid.service.util.ZipUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import java.io.File; import java.io.InputStream; @@ -145,8 +147,10 @@ public Response processHeaderDocumentReturnXml_post( @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, @DefaultValue("0") @FormDataParam(INCLUDE_DISCARDED_TEXT) String includeDiscardedText, @DefaultValue("0") @FormDataParam("start") int startPage, - @DefaultValue("2") @FormDataParam("end") int endPage) { + @DefaultValue("2") @FormDataParam("end") int endPage, + @FormDataParam("typedAreas") String typedAreas) { int consol = validateConsolidationParam(consolidate); + List typedAreasList = parseTypedAreas(typedAreas); return restProcessFiles.processStatelessHeaderDocument( inputStream, consol, validateIncludeRawParam(includeRawAffiliations), @@ -154,6 +158,7 @@ public Response processHeaderDocumentReturnXml_post( validateIncludeRawParam(includeDiscardedText), startPage, endPage, + typedAreasList, ExpectedResponseType.XML ); } @@ -168,14 +173,17 @@ public Response 
processHeaderFundingDocumentReturnXml_post( @DefaultValue("0") @FormDataParam(CONSOLIDATE_FUNDERS) String consolidateFunders, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, - @DefaultValue("0") @FormDataParam(INCLUDE_DISCARDED_TEXT) String includeDiscardedText) { + @DefaultValue("0") @FormDataParam(INCLUDE_DISCARDED_TEXT) String includeDiscardedText, + @FormDataParam("typedAreas") String typedAreas) { int consolHeader = validateConsolidationParam(consolidateHeader); int consolFunders = validateConsolidationParam(consolidateFunders); + List typedAreasList = parseTypedAreas(typedAreas); return restProcessFiles.processStatelessHeaderFundingDocument( inputStream, consolHeader, consolFunders, validateIncludeRawParam(includeRawAffiliations), validateIncludeRawParam(includeRawCopyrights), - validateIncludeRawParam(includeDiscardedText) + validateIncludeRawParam(includeDiscardedText), + typedAreasList ); } @@ -191,7 +199,8 @@ public Response processStatelessHeaderDocumentReturnXml( @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, @DefaultValue("0") @FormDataParam(INCLUDE_DISCARDED_TEXT) String includeDiscardedText, @DefaultValue("0") @FormDataParam("start") int startPage, - @DefaultValue("2") @FormDataParam("end") int endPage) { + @DefaultValue("2") @FormDataParam("end") int endPage, + @FormDataParam("typedAreas") String typedAreas) { return processHeaderDocumentReturnXml_post( inputStream, consolidate, @@ -199,7 +208,8 @@ public Response processStatelessHeaderDocumentReturnXml( includeRawCopyrights, includeDiscardedText, startPage, - endPage + endPage, + typedAreas ); } @@ -213,13 +223,19 @@ public Response processHeaderDocumentReturnBibTeX_post( @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String 
includeRawCopyrights, @DefaultValue("0") @FormDataParam("start") int startPage, - @DefaultValue("2") @FormDataParam("end") int endPage) { + @DefaultValue("2") @FormDataParam("end") int endPage, + @FormDataParam("typedAreas") String typedAreas) { int consol = validateConsolidationParam(consolidate); + List typedAreasList = parseTypedAreas(typedAreas); return restProcessFiles.processStatelessHeaderDocument( inputStream, consol, validateIncludeRawParam(includeRawAffiliations), validateIncludeRawParam(includeRawCopyrights), + false, // includeDiscardedText - not used in BibTeX mode + startPage, + endPage, + typedAreasList, ExpectedResponseType.BIBTEX ); } @@ -234,14 +250,16 @@ public Response processStatelessHeaderDocumentReturnBibTeX( @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, @DefaultValue("0") @FormDataParam("start") int startPage, - @DefaultValue("2") @FormDataParam("end") int endPage) { + @DefaultValue("2") @FormDataParam("end") int endPage, + @FormDataParam("typedAreas") String typedAreas) { return processHeaderDocumentReturnBibTeX_post( inputStream, consolidate, includeRawAffiliations, includeRawCopyrights, startPage, - endPage + endPage, + typedAreas ); } @@ -263,12 +281,13 @@ public Response processFulltextDocument_post( @DefaultValue("-1") @FormDataParam("end") int endPage, @FormDataParam("generateIDs") String generateIDs, @FormDataParam("segmentSentences") String segmentSentences, - @FormDataParam("teiCoordinates") List coordinates + @FormDataParam("teiCoordinates") List coordinates, + @FormDataParam("typedAreas") String typedAreas ) throws Exception { return processFulltext( inputStream, flavor, consolidateHeader, consolidateCitations, consolidateFunders, includeRawAffiliations, includeRawCitations, includeRawCopyrights, includeDiscardedText, - startPage, endPage, generateIDs, segmentSentences, coordinates + startPage, 
endPage, generateIDs, segmentSentences, coordinates, typedAreas ); } @@ -290,12 +309,13 @@ public Response processFulltextDocument( @DefaultValue("-1") @FormDataParam("end") int endPage, @FormDataParam("generateIDs") String generateIDs, @FormDataParam("segmentSentences") String segmentSentences, - @FormDataParam("teiCoordinates") List coordinates + @FormDataParam("teiCoordinates") List coordinates, + @FormDataParam("typedAreas") String typedAreas ) throws Exception { return processFulltext( inputStream, flavor, consolidateHeader, consolidateCitations, consolidateFunders, includeRawAffiliations, includeRawCitations, includeRawCopyrights, includeDiscardedText, - startPage, endPage, generateIDs, segmentSentences, coordinates + startPage, endPage, generateIDs, segmentSentences, coordinates, typedAreas ); } @@ -312,7 +332,8 @@ private Response processFulltext(InputStream inputStream, int endPage, String generateIDs, String segmentSentences, - List coordinates + List coordinates, + String typedAreas ) throws Exception { int consolHeader = validateConsolidationParam(consolidateHeader); int consolCitations = validateConsolidationParam(consolidateCitations); @@ -323,6 +344,7 @@ private Response processFulltext(InputStream inputStream, GrobidModels.Flavor flavorValidated = validateModelFlavor(flavor); List teiCoordinates = collectCoordinates(coordinates); + List typedAreasList = parseTypedAreas(typedAreas); if (flavorValidated == BLANK) { return restProcessFiles.processFulltextDocumentBlank( @@ -341,7 +363,7 @@ private Response processFulltext(InputStream inputStream, includeRaw, validateIncludeRawParam(includeRawCopyrights), validateIncludeRawParam(includeDiscardedText), - startPage, endPage, generate, segment, teiCoordinates + startPage, endPage, generate, segment, teiCoordinates, typedAreasList ); } @@ -360,6 +382,100 @@ private List collectCoordinates(List coordinates) { return teiCoordinates; } + private List parseTypedAreas(String ignoreAreasJson) { + List 
typedAreasList = new ArrayList<>(); + + if (ignoreAreasJson == null || ignoreAreasJson.trim().isEmpty()) { + return typedAreasList; + } + + try { + // Parse JSON array of typed areas + ObjectMapper mapper = new ObjectMapper(); + JsonNode rootNode = mapper.readTree(ignoreAreasJson); + + if (rootNode.isArray()) { + for (JsonNode node : rootNode) { + try { + int page = node.get("page").asInt(); + double x = node.get("x").asDouble(); + double y = node.get("y").asDouble(); + double width = node.get("width").asDouble(); + double height = node.get("height").asDouble(); + + // New format: "type" field is required and should be "figure", "table", or "ignore" + if (!node.has("type")) { + LOGGER.warn("Typed area missing required 'type' field: " + node.toString()); + continue; + } + + String typeString = node.get("type").asText(); + org.grobid.core.layout.AreaType areaType = + org.grobid.core.layout.AreaType.fromString(typeString); + + org.grobid.core.layout.IgnoreArea area = + new org.grobid.core.layout.IgnoreArea(page, x, y, width, height, areaType); + typedAreasList.add(area); + } catch (Exception e) { + LOGGER.warn("Failed to parse typed area from JSON: " + node.toString(), e); + } + } + } else { + LOGGER.warn("typedAreas should be a JSON array, but received: " + ignoreAreasJson); + } + } catch (Exception e) { + LOGGER.error("Failed to parse typed areas JSON: " + ignoreAreasJson, e); + } + + return typedAreasList; + } + + /** + * Legacy method for backward compatibility. + * @deprecated Use {@link #parseTypedAreas(String)} instead. 
+ */ + @Deprecated + private List parseIgnoreAreas(String ignoreAreasJson) { + // Convert legacy ignore areas to typed areas with IGNORE type + List typedAreasList = new ArrayList<>(); + + if (ignoreAreasJson == null || ignoreAreasJson.trim().isEmpty()) { + return typedAreasList; + } + + try { + // Parse JSON array of ignore areas + ObjectMapper mapper = new ObjectMapper(); + JsonNode rootNode = mapper.readTree(ignoreAreasJson); + + if (rootNode.isArray()) { + for (JsonNode node : rootNode) { + try { + int page = node.get("page").asInt(); + double x = node.get("x").asDouble(); + double y = node.get("y").asDouble(); + double width = node.get("width").asDouble(); + double height = node.get("height").asDouble(); + // Legacy name field is ignored, all areas are treated as IGNORE type + + org.grobid.core.layout.IgnoreArea area = + new org.grobid.core.layout.IgnoreArea(page, x, y, width, height, + org.grobid.core.layout.AreaType.IGNORE); + typedAreasList.add(area); + } catch (Exception e) { + LOGGER.warn("Failed to parse ignore area from JSON: " + node.toString(), e); + } + } + } else { + LOGGER.warn("typedAreas should be a JSON array, but received: " + ignoreAreasJson); + } + } catch (Exception e) { + LOGGER.error("Failed to parse ignore areas JSON: " + ignoreAreasJson, e); + } + + return typedAreasList; + } + private boolean validateGenerateIdParam(String generateIDs) { boolean generate = false; if ((generateIDs != null) && (generateIDs.equals("1") || generateIDs.equals("true"))) { @@ -401,11 +517,12 @@ public Response processFulltextAssetDocument_post( @DefaultValue("-1") @FormDataParam("end") int endPage, @FormDataParam("generateIDs") String generateIDs, @FormDataParam("segmentSentences") String segmentSentences, - @FormDataParam("teiCoordinates") List coordinates) throws Exception { + @FormDataParam("teiCoordinates") List coordinates, + @FormDataParam("typedAreas") String typedAreas) throws Exception { return processStatelessFulltextAssetHelper( inputStream, flavor, 
consolidateHeader, consolidateCitations, consolidateFunders, includeRawAffiliations, includeRawCitations, includeRawCopyrights, - startPage, endPage, generateIDs, segmentSentences, coordinates + startPage, endPage, generateIDs, segmentSentences, coordinates, typedAreas ); } @@ -426,11 +543,12 @@ public Response processStatelessFulltextAssetDocument( @DefaultValue("-1") @FormDataParam("end") int endPage, @FormDataParam("generateIDs") String generateIDs, @FormDataParam("segmentSentences") String segmentSentences, - @FormDataParam("teiCoordinates") List coordinates) throws Exception { + @FormDataParam("teiCoordinates") List coordinates, + @FormDataParam("typedAreas") String typedAreas) throws Exception { return processStatelessFulltextAssetHelper( inputStream, flavor, consolidateHeader, consolidateCitations, consolidateFunders, includeRawAffiliations, includeRawCitations, includeRawCopyrights, - startPage, endPage, generateIDs, segmentSentences, coordinates + startPage, endPage, generateIDs, segmentSentences, coordinates, typedAreas ); } @@ -446,7 +564,8 @@ private Response processStatelessFulltextAssetHelper(InputStream inputStream, int endPage, String generateIDs, String segmentSentences, - List coordinates) throws Exception { + List coordinates, + String typedAreas) throws Exception { int consolHeader = validateConsolidationParam(consolidateHeader); int consolCitations = validateConsolidationParam(consolidateCitations); int consolFunders = validateConsolidationParam(consolidateFunders); @@ -454,13 +573,14 @@ private Response processStatelessFulltextAssetHelper(InputStream inputStream, boolean generate = validateGenerateIdParam(generateIDs); boolean segment = validateGenerateIdParam(segmentSentences); List teiCoordinates = collectCoordinates(coordinates); + List typedAreasList = parseTypedAreas(typedAreas); GrobidModels.Flavor validatedModelFlavor = validateModelFlavor(flavor); return restProcessFiles.processStatelessFulltextAssetDocument( inputStream, 
validatedModelFlavor, consolHeader, consolCitations, consolFunders, validateIncludeRawParam(includeRawAffiliations), includeRaw, validateIncludeRawParam(includeRawCopyrights), - startPage, endPage, generate, segment, teiCoordinates + startPage, endPage, generate, segment, teiCoordinates, typedAreasList ); } diff --git a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java index e402ebcf86..ed3b90967c 100644 --- a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java +++ b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java @@ -76,6 +76,7 @@ public Response processStatelessHeaderDocument( false, 0, 2, + null, expectedResponseType ); } @@ -97,6 +98,7 @@ public Response processStatelessHeaderDocument( false, startPage, endPage, + null, expectedResponseType ); } @@ -109,6 +111,7 @@ public Response processStatelessHeaderDocument( final boolean includeDiscardedText, int startPage, int endPage, + final List typedAreas, ExpectedResponseType expectedResponseType ) { LOGGER.debug(methodLogIn()); @@ -141,15 +144,20 @@ public Response processStatelessHeaderDocument( BiblioItem result = new BiblioItem(); // starts conversion process + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .consolidateHeader(consolidate) + .includeRawAffiliations(includeRawAffiliations) + .includeRawCopyrights(includeRawCopyrights) + .includeDiscardedText(includeDiscardedText) + .startPage(startPage) + .endPage(endPage) + .typedAreas(typedAreas) + .build(); + retVal = engine.processHeader( originFile.getAbsolutePath(), md5Str, - consolidate, - includeRawAffiliations, - includeRawCopyrights, - includeDiscardedText, - startPage, - endPage, + config, result ); @@ -201,7 +209,8 @@ public Response processStatelessHeaderFundingDocument( final int consolidateFunders, final boolean includeRawAffiliations, final boolean 
includeRawCopyrights, - final boolean includeDiscardedText + final boolean includeDiscardedText, + final List typedAreas ) { LOGGER.debug(methodLogIn()); String retVal = null; @@ -231,14 +240,19 @@ public Response processStatelessHeaderFundingDocument( String md5Str = DatatypeConverter.printHexBinary(digest).toUpperCase(); // starts conversion process + GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() + .consolidateHeader(consolidateHeader) + .consolidateFunders(consolidateFunders) + .includeRawAffiliations(includeRawAffiliations) + .includeRawCopyrights(includeRawCopyrights) + .includeDiscardedText(includeDiscardedText) + .typedAreas(typedAreas) + .build(); + retVal = engine.processHeaderFunding( originFile, md5Str, - consolidateHeader, - consolidateFunders, - includeRawAffiliations, - includeRawCopyrights, - includeDiscardedText + config ); if (GrobidRestUtils.isResultNullOrEmpty(retVal)) { @@ -300,7 +314,8 @@ public Response processFulltextDocument(final InputStream inputStream, final int endPage, final boolean generateIDs, final boolean segmentSentences, - final List teiCoordinates) throws Exception { + final List teiCoordinates, + final List typedAreas) throws Exception { LOGGER.debug(methodLogIn()); String retVal = null; @@ -344,6 +359,7 @@ public Response processFulltextDocument(final InputStream inputStream, .generateTeiCoordinates(teiCoordinates) .withSentenceSegmentation(segmentSentences) .flavor(flavor) + .typedAreas(typedAreas) .build(); retVal = engine.fullTextToTEI(originFile, flavor, md5Str, config); @@ -479,7 +495,8 @@ public Response processStatelessFulltextAssetDocument( final int endPage, final boolean generateIDs, final boolean segmentSentences, - final List teiCoordinates + final List teiCoordinates, + final List typedAreas ) throws Exception { LOGGER.debug(methodLogIn()); @@ -528,6 +545,7 @@ public Response processStatelessFulltextAssetDocument( .pdfAssetPath(new File(assetPath)) .withSentenceSegmentation(segmentSentences) 
.flavor(flavor) + .typedAreas(typedAreas) .build(); retVal = engine.fullTextToTEI(originFile, flavor, md5Str, config); From 52d3272612c1f5bc25d29dcd49fbe8ce26a8fee1 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 4 Mar 2026 00:55:21 +0100 Subject: [PATCH 2/4] feat: implement typed areas filtering for layout tokens and remove legacy ignore areas support --- doc/Typed-Areas-API.md | 16 +- .../org/grobid/core/document/Document.java | 61 +-- .../grobid/core/engines/FullTextParser.java | 489 ++++++++++++++++-- .../org/grobid/core/engines/HeaderParser.java | 5 +- .../engines/config/GrobidAnalysisConfig.java | 12 - .../org/grobid/service/GrobidRestService.java | 54 +- 6 files changed, 488 insertions(+), 149 deletions(-) diff --git a/doc/Typed-Areas-API.md b/doc/Typed-Areas-API.md index 9320a2f69e..057ac6a63d 100644 --- a/doc/Typed-Areas-API.md +++ b/doc/Typed-Areas-API.md @@ -378,16 +378,9 @@ Processed typed areas are integrated into the standard TEI output: ## Migration from Legacy ignoreAreas -The new `typedAreas` parameter replaces the legacy `ignoreAreas` parameter: +The legacy `ignoreAreas` parameter has been **removed**. The `typedAreas` parameter is now the only supported way to define areas for processing. -**Old format (deprecated):** -```json -[ - {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "name": "figure"} -] -``` - -**New format (required):** +**Required format:** ```json [ {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"}, @@ -395,10 +388,7 @@ The new `typedAreas` parameter replaces the legacy `ignoreAreas` parameter: ] ``` -**Key Changes:** -- `name` field replaced with required `type` field -- Support for figure and table processing (not just ignoring) -- Type-safe area classification with enum validation +If you were previously using `ignoreAreas`, replace it with `typedAreas` and set `"type": "ignore"` for each area. 
## Troubleshooting diff --git a/grobid-core/src/main/java/org/grobid/core/document/Document.java b/grobid-core/src/main/java/org/grobid/core/document/Document.java index 81babc9f1f..6a8eee9ba9 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/Document.java +++ b/grobid-core/src/main/java/org/grobid/core/document/Document.java @@ -65,9 +65,11 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.IdentityHashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.SortedSet; import java.util.regex.Matcher; import java.util.stream.Collectors; @@ -168,6 +170,9 @@ public void setImages(List images) { protected transient List tableTokens = new ArrayList<>(); protected transient List ignoredTokens = new ArrayList<>(); + // tokens that fall within typed areas and should be excluded from body processing + protected transient Set excludedTokens = Collections.newSetFromMap(new IdentityHashMap<>()); + // the analyzer/tokenizer used for processing this document protected transient Analyzer analyzer = GrobidAnalyzer.getInstance(); @@ -791,7 +796,13 @@ public void setLabeledBlocks(SortedSetMultimap labeledBlo // helper public List getDocumentPieceTokenization(DocumentPiece dp) { - return tokenizations.subList(dp.getLeft().getTokenDocPos(), dp.getRight().getTokenDocPos() + 1); + List subList = tokenizations.subList(dp.getLeft().getTokenDocPos(), dp.getRight().getTokenDocPos() + 1); + if (!excludedTokens.isEmpty()) { + return subList.stream() + .filter(t -> !excludedTokens.contains(t)) + .collect(Collectors.toList()); + } + return subList; } public String getDocumentPieceText(DocumentPiece dp) { @@ -1766,9 +1777,13 @@ public void setIgnoredTokens(List ignoredTokens) { this.ignoredTokens = ignoredTokens != null ? 
ignoredTokens : new ArrayList<>(); } + public boolean isTokenExcluded(LayoutToken token) { + return excludedTokens.contains(token); + } + /** - * Filters out layout tokens that fall within the specified ignore areas and categorizes them by type. - * This replaces the old filterLayoutTokensByIgnoreAreas method to support typed areas. + * Filters out layout tokens that fall within the specified typed areas and categorizes them by type. + * Tokens in figure/table areas are collected for ML-based processing; tokens in ignore areas are discarded. * * @param typedAreas list of typed areas for specialized processing */ @@ -1806,14 +1821,12 @@ public void filterLayoutTokensByTypedAreas(List typedAreas) { } } - List filteredTokens = new ArrayList<>(); + excludedTokens.clear(); int figureTokenCount = 0; int tableTokenCount = 0; int ignoredTokenCount = 0; for (LayoutToken token : tokenizations) { - boolean tokenProcessed = false; - // Check if token intersects with any typed area for (IgnoreArea area : typedAreas) { if (area.contains(token)) { @@ -1821,52 +1834,24 @@ public void filterLayoutTokensByTypedAreas(List typedAreas) { case FIGURE: figureTokens.add(token); figureTokenCount++; - tokenProcessed = true; break; case TABLE: tableTokens.add(token); tableTokenCount++; - tokenProcessed = true; break; case IGNORE: ignoredTokens.add(token); ignoredTokenCount++; - tokenProcessed = true; break; } - if (tokenProcessed) { - break; - } + excludedTokens.add(token); + break; } } - - // Keep token only if it wasn't processed by any typed area - if (!tokenProcessed) { - filteredTokens.add(token); - } } - tokenizations = filteredTokens; - LOGGER.debug("Processed typed areas: {} figure tokens, {} table tokens, {} ignored tokens, {} main tokens remaining", - figureTokenCount, tableTokenCount, ignoredTokenCount, tokenizations.size()); + LOGGER.debug("Processed typed areas: {} figure tokens, {} table tokens, {} ignored tokens, {} excluded total", + figureTokenCount, tableTokenCount, 
ignoredTokenCount, excludedTokens.size()); } - /** - * Legacy method for backward compatibility. - * @deprecated Use {@link #filterLayoutTokensByTypedAreas(List)} instead. - */ - @Deprecated - public void filterLayoutTokensByIgnoreAreas(List ignoreAreas) { - // Convert all ignore areas to IGNORE type and use new method - List typedAreas = new ArrayList<>(); - if (ignoreAreas != null) { - for (IgnoreArea area : ignoreAreas) { - // Create a new area with IGNORE type - IgnoreArea ignoreArea = new IgnoreArea(area.getPage(), area.getX(), area.getY(), - area.getWidth(), area.getHeight(), AreaType.IGNORE); - typedAreas.add(ignoreArea); - } - } - filterLayoutTokensByTypedAreas(typedAreas); - } } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 3cb8d4451f..4fcae50d63 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -156,14 +156,11 @@ public Document processing(DocumentSource documentSource, // general segmentation Document doc = parsers.getSegmentationParser(flavor).processing(documentSource, config); - // Apply typed areas filtering if configured (takes precedence over legacy ignoreAreas) + // Apply typed areas filtering if configured if (config.getTypedAreas() != null && !config.getTypedAreas().isEmpty()) { doc.filterLayoutTokensByTypedAreas(config.getTypedAreas()); // Apply specialized processing for figures and tables processTypedAreas(doc); - } else if (config.getIgnoreAreas() != null && !config.getIgnoreAreas().isEmpty()) { - // Legacy support for old ignoreAreas - doc.filterLayoutTokensByIgnoreAreas(config.getIgnoreAreas()); } SortedSet documentBodyParts = doc.getDocumentPart(SegmentationLabels.BODY); @@ -288,7 +285,11 @@ else if (config.getConsolidateCitations() == 2) bodyFigures = processFigures(bodyResults, bodyTokenization.getTokenization()); 
doc.setFigures(bodyFigures); - bodyResults = fixFiguresLabellingResults(doc, bodyResults); + // Skip graphic object reassignment when user provided figure areas, + // since those areas already account for their graphic objects + if (doc.getFigureAreas().isEmpty()) { + bodyResults = fixFiguresLabellingResults(doc, bodyResults); + } // Figures @@ -494,7 +495,7 @@ private void postProcessFigureCaptions(List
figures, Document doc) { } private static String fixFiguresLabellingResults(Document doc, String bodyResults) { - List>>> updatedFigures = doc.assignGraphicObjectsToFigures(); + List>>> updatedFigures = doc. assignGraphicObjectsToFigures(); for(Triple>> update: updatedFigures) { List> difference = update.getRight(); @@ -735,14 +736,11 @@ public Document processingHeaderFunding(DocumentSource documentSource, // general segmentation Document doc = parsers.getSegmentationParser().processing(documentSource, config); - // Apply typed areas filtering if configured (takes precedence over legacy ignoreAreas) + // Apply typed areas filtering if configured if (config.getTypedAreas() != null && !config.getTypedAreas().isEmpty()) { doc.filterLayoutTokensByTypedAreas(config.getTypedAreas()); // Apply specialized processing for figures and tables processTypedAreas(doc); - } else if (config.getIgnoreAreas() != null && !config.getIgnoreAreas().isEmpty()) { - // Legacy support for old ignoreAreas - doc.filterLayoutTokensByIgnoreAreas(config.getIgnoreAreas()); } // header processing @@ -1084,6 +1082,13 @@ else if (nbAuthorType > (bibDataSets.size() / 2)) } LayoutToken token = tokens.get(n); + + // Skip tokens that fall within typed areas (figure, table, ignore) + if (doc.isTokenExcluded(token)) { + n++; + continue; + } + layoutTokens.add(token); features = new FeaturesVectorFulltext(); @@ -3564,6 +3569,383 @@ public static List getDocumentFullTextTokens(List> generateFeaturesForTokens(List tokens, Document doc) { + if (CollectionUtils.isEmpty(tokens)) { + return null; + } + + FeatureFactory featureFactory = FeatureFactory.getInstance(); + StringBuilder fulltext = new StringBuilder(); + String currentFont = null; + int currentFontSize = -1; + + List blocks = doc.getBlocks(); + if (CollectionUtils.isEmpty(blocks)) { + return null; + } + + FeaturesVectorFulltext features; + FeaturesVectorFulltext previousFeatures = null; + + List filteredTokens = new ArrayList<>(); + + int mm = 0; // 
page position + int nn = 0; // document position + double lineStartX = Double.NaN; + boolean indented = false; + boolean previousNewline = false; + boolean newline; + + // Compute total text length for relative position + int fulltextLength = 0; + for (LayoutToken t : tokens) { + String text = t.getText(); + if (text != null) { + String cleaned = text.replace(" ", ""); + if (!cleaned.isEmpty() && !cleaned.equals("\n")) { + fulltextLength += cleaned.length(); + } + } + } + + // Track block boundaries and graphics per block + int previousBlockPtr = -1; + boolean graphicVector = false; + boolean graphicBitmap = false; + double density = 0.0; + double spacingPreviousBlock = 0.0; + double lowestPos = 0.0; + int currentPage = -1; + + for (int i = 0; i < tokens.size(); i++) { + LayoutToken token = tokens.get(i); + + // Detect block boundary changes + int blockPtr = token.getBlockPtr(); + boolean isNewBlock = (blockPtr != previousBlockPtr); + + if (isNewBlock && blockPtr >= 0 && blockPtr < blocks.size()) { + Block block = blocks.get(blockPtr); + graphicVector = false; + graphicBitmap = false; + + double pageHeight = block.getPage().getHeight(); + int localPage = block.getPage().getNumber(); + if (localPage != currentPage) { + currentPage = localPage; + mm = 0; + lowestPos = 0.0; + spacingPreviousBlock = 0.0; + } + + if (lowestPos > block.getY()) { + spacingPreviousBlock = doc.getMaxBlockSpacing() / 5.0; + } else { + spacingPreviousBlock = block.getY() - lowestPos; + } + + String localText = block.getText(); + if (localText != null && !localText.contains("@PAGE") && !localText.contains("@IMAGE")) { + if (block.getHeight() != 0.0 && block.getWidth() != 0.0) { + density = (double) localText.length() / (block.getHeight() * block.getWidth()); + } + } + + List localImages = Document.getConnectedGraphics(block, doc); + if (localImages != null) { + for (GraphicObject localImage : localImages) { + if (localImage.getType() == GraphicObjectType.BITMAP) + graphicBitmap = true; + if 
(localImage.getType() == GraphicObjectType.VECTOR || localImage.getType() == GraphicObjectType.VECTOR_BOX) + graphicVector = true; + } + } + + previousBlockPtr = blockPtr; + } + + features = new FeaturesVectorFulltext(); + features.token = token; + + double coordinateLineY = token.getY(); + + String text = token.getText(); + if (text == null || text.isEmpty()) { + continue; + } + text = text.replace(" ", ""); + if (text.isEmpty()) { + mm++; + nn++; + continue; + } + if (text.equals("\n")) { + previousNewline = true; + mm++; + nn++; + continue; + } + newline = false; + + // final sanitisation and filtering + text = text.replaceAll("[ \n]", ""); + if (TextUtilities.filterLine(text)) { + continue; + } + + if (previousNewline) { + newline = true; + previousNewline = false; + if (previousFeatures != null) { + double previousLineStartX = lineStartX; + lineStartX = token.getX(); + double characterWidth = token.width / text.length(); + if (!Double.isNaN(previousLineStartX)) { + if (previousLineStartX - lineStartX > characterWidth) + indented = false; + else if (lineStartX - previousLineStartX > characterWidth) + indented = true; + } + } + } + + filteredTokens.add(token); + features.string = text; + + if (graphicBitmap) { + features.bitmapAround = true; + } + if (graphicVector) { + features.vectorAround = true; + } + + if (newline) { + features.lineStatus = "LINESTART"; + lineStartX = token.getX(); + if (previousFeatures != null) { + if (!"LINESTART".equals(previousFeatures.lineStatus)) + previousFeatures.lineStatus = "LINEEND"; + } + } + + Matcher m0 = featureFactory.isPunct.matcher(text); + if (m0.find()) { + features.punctType = "PUNCT"; + } + if (text.equals("(") || text.equals("[")) { + features.punctType = "OPENBRACKET"; + } else if (text.equals(")") || text.equals("]")) { + features.punctType = "ENDBRACKET"; + } else if (text.equals(".")) { + features.punctType = "DOT"; + } else if (text.equals(",")) { + features.punctType = "COMMA"; + } else if (text.equals("-")) { 
+ features.punctType = "HYPHEN"; + } else if (text.equals("\"") || text.equals("\'") || text.equals("`")) { + features.punctType = "QUOTE"; + } + + if (indented) { + features.alignmentStatus = "LINEINDENT"; + } else { + features.alignmentStatus = "ALIGNEDLEFT"; + } + + if (isNewBlock) { + features.lineStatus = "LINESTART"; + if (previousFeatures != null) { + if (!"LINESTART".equals(previousFeatures.lineStatus)) + previousFeatures.lineStatus = "LINEEND"; + } + lineStartX = token.getX(); + features.blockStatus = "BLOCKSTART"; + } else { + // Look ahead for end of line + boolean endline = false; + boolean endblock = false; + int ii = 1; + boolean endloop = false; + while ((i + ii < tokens.size()) && (!endloop)) { + LayoutToken tok = tokens.get(i + ii); + if (tok != null) { + String toto = tok.getText(); + if (toto != null) { + if (toto.equals("\n")) { + endline = true; + endloop = true; + } else { + if (toto.length() != 0 + && !toto.startsWith("@IMAGE") + && !toto.startsWith("@PAGE") + && !text.contains(".pbm") + && !text.contains(".svg") + && !text.contains(".png") + && !text.contains(".jpg")) { + endloop = true; + } + } + } + } + // Check if we're switching blocks + if (tok.getBlockPtr() != token.getBlockPtr()) { + endblock = true; + endline = true; + endloop = true; + } + if (i + ii == tokens.size() - 1) { + endblock = true; + endline = true; + } + ii++; + } + + if (!endline && !newline) { + features.lineStatus = "LINEIN"; + } else if (!newline) { + features.lineStatus = "LINEEND"; + previousNewline = true; + } + + if (!endblock && features.blockStatus == null) + features.blockStatus = "BLOCKIN"; + else if (features.blockStatus == null) { + features.blockStatus = "BLOCKEND"; + } + } + + if (text.length() == 1) { + features.singleChar = true; + } + + if (Character.isUpperCase(text.charAt(0))) { + features.capitalisation = "INITCAP"; + } + + if (featureFactory.test_all_capital(text)) { + features.capitalisation = "ALLCAP"; + } + + if (featureFactory.test_digit(text)) 
{ + features.digit = "CONTAINSDIGITS"; + } + + Matcher m = featureFactory.isDigit.matcher(text); + if (m.find()) { + features.digit = "ALLDIGIT"; + } + + if (currentFont == null) { + currentFont = token.getFont(); + features.fontStatus = "NEWFONT"; + } else if (!currentFont.equals(token.getFont())) { + currentFont = token.getFont(); + features.fontStatus = "NEWFONT"; + } else { + features.fontStatus = "SAMEFONT"; + } + + int newFontSize = (int) token.getFontSize(); + if (currentFontSize == -1) { + currentFontSize = newFontSize; + features.fontSize = "HIGHERFONT"; + } else if (currentFontSize == newFontSize) { + features.fontSize = "SAMEFONTSIZE"; + } else if (currentFontSize < newFontSize) { + features.fontSize = "HIGHERFONT"; + currentFontSize = newFontSize; + } else { + features.fontSize = "LOWERFONT"; + currentFontSize = newFontSize; + } + + if (token.isBold()) + features.bold = true; + + if (token.isItalic()) + features.italic = true; + + if (features.capitalisation == null) + features.capitalisation = "NOCAPS"; + + if (features.digit == null) + features.digit = "NODIGIT"; + + if (features.punctType == null) + features.punctType = "NOPUNCT"; + + features.relativeDocumentPosition = featureFactory + .linearScaling(nn, fulltextLength, NBBINS_POSITION); + + features.relativePagePositionChar = featureFactory + .linearScaling(mm, 0, NBBINS_POSITION); + + double pageHeight = 1.0; + if (token.getPage() >= 0 && doc.getPages() != null && token.getPage() < doc.getPages().size()) { + Page page = doc.getPages().get(token.getPage()); + if (page != null) { + pageHeight = page.getHeight(); + } + } + int pagePos = featureFactory.linearScaling(coordinateLineY, pageHeight, NBBINS_POSITION); + if (pagePos > NBBINS_POSITION) + pagePos = NBBINS_POSITION; + features.relativePagePosition = pagePos; + + if (spacingPreviousBlock != 0.0) { + features.spacingWithPreviousBlock = featureFactory + .linearScaling(spacingPreviousBlock - doc.getMinBlockSpacing(), + doc.getMaxBlockSpacing() - 
doc.getMinBlockSpacing(), NBBINS_SPACE); + } + + if (density != -1.0) { + features.characterDensity = featureFactory + .linearScaling(density - doc.getMinCharacterDensity(), + doc.getMaxCharacterDensity() - doc.getMinCharacterDensity(), NBBINS_DENSITY); + } + + features.calloutType = "UNKNOWN"; + features.calloutKnown = false; + + if (token.isSuperscript()) { + features.superscript = true; + } + + // Deferred print pattern: print previous features before overwriting + if (previousFeatures != null) { + if (features.blockStatus.equals("BLOCKSTART") && + previousFeatures.blockStatus.equals("BLOCKIN")) { + previousFeatures.blockStatus = "BLOCKEND"; + previousFeatures.lineStatus = "LINEEND"; + } + fulltext.append(previousFeatures.printVector()); + } + + mm += text.length(); + nn += text.length(); + previousFeatures = features; + } + + // Flush last feature + if (previousFeatures != null) { + fulltext.append(previousFeatures.printVector()); + } + + if (fulltext.length() == 0) { + return null; + } + + return Pair.of(fulltext.toString(), filteredTokens); + } + /** * Process typed areas (figures, tables) using specialized models. * This method applies the appropriate figure and table parsers to pre-identified areas. 
@@ -3576,42 +3958,83 @@ protected void processTypedAreas(Document doc) { LOGGER.debug("Processing typed areas: {} figures, {} tables", doc.getFigureAreas().size(), doc.getTableAreas().size()); - // Process figure areas + // Process figure areas using the figure ML model if (!doc.getFigureAreas().isEmpty() && !doc.getFigureTokens().isEmpty()) { - try { - Figure processedFigure = parsers.getFigureParser() - .processing(doc.getFigureTokens(), null); + if (doc.getAnnexFigures() == null) { + doc.setAnnexFigures(new ArrayList<>()); + } - if (processedFigure != null) { - // Add processed figure to document's annex figures - if (doc.getAnnexFigures() == null) { - doc.setAnnexFigures(new ArrayList<>()); - } - doc.getAnnexFigures().add(processedFigure); - LOGGER.debug("Processed figure from typed areas"); + Figure figure = null; + try { + Pair> featurePair = + generateFeaturesForTokens(doc.getFigureTokens(), doc); + if (featurePair != null && isNotBlank(featurePair.getLeft())) { + figure = parsers.getFigureParser().processing( + featurePair.getRight(), featurePair.getLeft()); } } catch (Exception e) { - LOGGER.warn("Error processing figure areas: " + e.getMessage(), e); + LOGGER.warn("Figure ML processing failed, falling back to direct construction", e); + } + + if (figure == null) { + // Fallback: create Figure directly from tokens + figure = new Figure(); + figure.setContent(new StringBuilder(LayoutTokensUtil.toText(doc.getFigureTokens()))); } + figure.setLayoutTokens(doc.getFigureTokens()); + for (LayoutToken lt : doc.getFigureTokens()) { + if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) { + figure.setPage(lt.getPage()); + break; + } + } + doc.getAnnexFigures().add(figure); + LOGGER.debug("Created figure from typed areas via ML processing"); } - // Process table areas + // Process table areas using the table ML model if (!doc.getTableAreas().isEmpty() && !doc.getTableTokens().isEmpty()) { + if (doc.getAnnexTables() == null) { + 
doc.setAnnexTables(new ArrayList<>()); + } + + List<Table>
tables = null; try { - List<Table>
processedTables = parsers.getTableParser() - .processing(doc.getTableTokens(), null); + Pair> featurePair = + generateFeaturesForTokens(doc.getTableTokens(), doc); + if (featurePair != null && isNotBlank(featurePair.getLeft())) { + tables = parsers.getTableParser().processing( + featurePair.getRight(), featurePair.getLeft()); + } + } catch (Exception e) { + LOGGER.warn("Table ML processing failed, falling back to direct construction", e); + } - if (processedTables != null && !processedTables.isEmpty()) { - // Add processed tables to document's annex tables - if (doc.getAnnexTables() == null) { - doc.setAnnexTables(new ArrayList<>()); + if (CollectionUtils.isNotEmpty(tables)) { + for (Table table : tables) { + table.setLayoutTokens(doc.getTableTokens()); + for (LayoutToken lt : doc.getTableTokens()) { + if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) { + table.setPage(lt.getPage()); + break; + } } - doc.getAnnexTables().addAll(processedTables); - LOGGER.debug("Processed {} tables from typed areas", processedTables.size()); + doc.getAnnexTables().add(table); } - } catch (Exception e) { - LOGGER.warn("Error processing table areas: " + e.getMessage(), e); + } else { + // Fallback: create Table directly from tokens + Table table = new Table(); + table.setLayoutTokens(doc.getTableTokens()); + table.setContent(new StringBuilder(LayoutTokensUtil.toText(doc.getTableTokens()))); + for (LayoutToken lt : doc.getTableTokens()) { + if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) { + table.setPage(lt.getPage()); + break; + } + } + doc.getAnnexTables().add(table); } + LOGGER.debug("Created table(s) from typed areas via ML processing"); } // Note: ignored areas are intentionally discarded and no further processing is performed diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java index acac33e47e..fa9370feb8 
100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java @@ -95,12 +95,9 @@ public Pair processing(File input, String md5Str, BiblioItem r documentSource.setMD5(md5Str); Document doc = parsers.getSegmentationParser().processing(documentSource, config); - // Apply typed areas filtering if configured (takes precedence over legacy ignoreAreas) + // Apply typed areas filtering if configured if (config.getTypedAreas() != null && !config.getTypedAreas().isEmpty()) { doc.filterLayoutTokensByTypedAreas(config.getTypedAreas()); - } else if (config.getIgnoreAreas() != null && !config.getIgnoreAreas().isEmpty()) { - // Legacy support for old ignoreAreas - doc.filterLayoutTokensByIgnoreAreas(config.getIgnoreAreas()); } String tei = processingHeaderSection(config, doc, resHeader, true); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java b/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java index bfde090d5f..1ea85b37ec 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/config/GrobidAnalysisConfig.java @@ -96,9 +96,6 @@ private GrobidAnalysisConfig() { // if true, the TEI text will be segmented into sentences private boolean withSentenceSegmentation = false; - // list of areas to ignore during processing (legacy) - private List ignoreAreas = null; - // list of typed areas for specialized processing private List typedAreas = null; @@ -223,11 +220,6 @@ public GrobidAnalysisConfigBuilder flavor(GrobidModels.Flavor a) { return this; } - public GrobidAnalysisConfigBuilder ignoreAreas(List areas) { - config.ignoreAreas = areas; - return this; - } - public GrobidAnalysisConfigBuilder typedAreas(List areas) { config.typedAreas = areas; return this; @@ -360,10 +352,6 @@ public String getFlavor() { return flavor; } - 
public List getIgnoreAreas() { - return ignoreAreas; - } - public List getTypedAreas() { return typedAreas; } diff --git a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java index 446e3905df..91c73ed8ee 100755 --- a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java +++ b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java @@ -382,17 +382,17 @@ private List collectCoordinates(List coordinates) { return teiCoordinates; } - private List parseTypedAreas(String ignoreAreasJson) { + private List parseTypedAreas(String typedAreasJson) { List typedAreasList = new ArrayList<>(); - if (ignoreAreasJson == null || ignoreAreasJson.trim().isEmpty()) { + if (typedAreasJson == null || typedAreasJson.trim().isEmpty()) { return typedAreasList; } try { // Parse JSON array of typed areas ObjectMapper mapper = new ObjectMapper(); - JsonNode rootNode = mapper.readTree(ignoreAreasJson); + JsonNode rootNode = mapper.readTree(typedAreasJson); if (rootNode.isArray()) { for (JsonNode node : rootNode) { @@ -421,60 +421,16 @@ private List parseTypedAreas(String ignoreAre } } } else { - LOGGER.warn("typedAreas should be a JSON array, but received: " + ignoreAreasJson); + LOGGER.warn("typedAreas should be a JSON array, but received: " + typedAreasJson); } } catch (Exception e) { - LOGGER.error("Failed to parse typed areas JSON: " + ignoreAreasJson, e); + LOGGER.error("Failed to parse typed areas JSON: " + typedAreasJson, e); } return typedAreasList; } - /** - * Legacy method for backward compatibility. - * @deprecated Use {@link #parseTypedAreas(String)} instead. 
- */ - @Deprecated - private List parseIgnoreAreas(String ignoreAreasJson) { - // Convert legacy ignore areas to typed areas with IGNORE type - List typedAreasList = new ArrayList<>(); - - if (ignoreAreasJson == null || ignoreAreasJson.trim().isEmpty()) { - return typedAreasList; - } - - try { - // Parse JSON array of ignore areas - ObjectMapper mapper = new ObjectMapper(); - JsonNode rootNode = mapper.readTree(ignoreAreasJson); - - if (rootNode.isArray()) { - for (JsonNode node : rootNode) { - try { - int page = node.get("page").asInt(); - double x = node.get("x").asDouble(); - double y = node.get("y").asDouble(); - double width = node.get("width").asDouble(); - double height = node.get("height").asDouble(); - // Legacy name field is ignored, all areas are treated as IGNORE type - org.grobid.core.layout.IgnoreArea area = - new org.grobid.core.layout.IgnoreArea(page, x, y, width, height, - org.grobid.core.layout.AreaType.IGNORE); - typedAreasList.add(area); - } catch (Exception e) { - LOGGER.warn("Failed to parse ignore area from JSON: " + node.toString(), e); - } - } - } else { - LOGGER.warn("typedAreas should be a JSON array, but received: " + ignoreAreasJson); - } - } catch (Exception e) { - LOGGER.error("Failed to parse ignore areas JSON: " + ignoreAreasJson, e); - } - - return typedAreasList; - } private boolean validateGenerateIdParam(String generateIDs) { boolean generate = false; From 627e6749d909bd0e525fcd66586643363fa652ed Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 4 Mar 2026 13:09:44 +0100 Subject: [PATCH 3/4] feat: update Typed Areas API documentation and enhance token filtering logic --- doc/Typed-Areas-API.md | 138 ++---------------- .../org/grobid/core/document/Document.java | 70 ++++++++- .../grobid/core/engines/FullTextParser.java | 10 +- .../org/grobid/service/GrobidRestService.java | 10 ++ 4 files changed, 89 insertions(+), 139 deletions(-) diff --git a/doc/Typed-Areas-API.md b/doc/Typed-Areas-API.md index 057ac6a63d..9ff1ec18c5 
100644 --- a/doc/Typed-Areas-API.md +++ b/doc/Typed-Areas-API.md @@ -148,135 +148,25 @@ curl -v -H "Accept: application/xml" \ localhost:8070/api/processFulltextDocument ``` -### Python Examples +### Using a JSON File -**Using requests library:** -```python -import requests -import json +For complex area definitions, store them in a JSON file and pass it to curl: -# Define typed areas -typed_areas = [ - {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"}, - {"page": 1, "x": 450, "y": 200, "width": 250, "height": 200, "type": "table"}, - {"page": 1, "x": 50, "y": 750, "width": 500, "height": 30, "type": "ignore"} -] - -# Process document -with open('document.pdf', 'rb') as f: - files = {'input': f} - data = { - 'typedAreas': json.dumps(typed_areas), - 'consolidateHeader': '1', - 'segmentSentences': '1' - } - - response = requests.post( - 'http://localhost:8070/api/processFulltextDocument', - files=files, - data=data, - headers={'Accept': 'application/xml'} - ) - - if response.status_code == 200: - print(response.text) - else: - print(f"Error: {response.status_code} - {response.text}") -``` - -**Complex processing with multiple parameters:** -```python -import requests -import json - -def process_with_typed_areas(pdf_path, typed_areas, endpoint="processFulltextDocument"): - """Process a PDF with typed areas and additional parameters.""" - - url = f"http://localhost:8070/api/{endpoint}" - - with open(pdf_path, 'rb') as f: - files = {'input': f} - data = { - 'typedAreas': json.dumps(typed_areas), - 'consolidateHeader': '1', - 'consolidateCitations': '1', - 'segmentSentences': '1', - 'generateIDs': '1', - 'includeRawCitations': '1' - } - - response = requests.post( - url, - files=files, - data=data, - headers={'Accept': 'application/xml'} - ) - - return response - -# Example usage -figure_areas = [ - {"page": 1, "x": 85, "y": 120, "width": 440, "height": 280, "type": "figure"}, - {"page": 2, "x": 85, "y": 200, "width": 300, "height": 200, 
"type": "table"} +**Create `typed_areas.json`:** +```json +[ + {"page": 1, "x": 100, "y": 200, "width": 300, "height": 150, "type": "figure"}, + {"page": 1, "x": 450, "y": 200, "width": 250, "height": 200, "type": "table"}, + {"page": 1, "x": 50, "y": 750, "width": 500, "height": 30, "type": "ignore"} ] - -response = process_with_typed_areas("research_paper.pdf", figure_areas) -print(response.status_code) ``` -### JavaScript Examples - -**Using fetch API:** -```javascript -async function processWithTypedAreas(pdfFile, typedAreas) { - const formData = new FormData(); - formData.append('input', pdfFile); - formData.append('typedAreas', JSON.stringify(typedAreas)); - formData.append('consolidateHeader', '1'); - formData.append('segmentSentences', '1'); - - try { - const response = await fetch( - 'http://localhost:8070/api/processFulltextDocument', - { - method: 'POST', - body: formData, - headers: { - 'Accept': 'application/xml' - } - } - ); - - if (response.ok) { - const result = await response.text(); - return result; - } else { - throw new Error(`HTTP error! 
status: ${response.status}`); - } - } catch (error) { - console.error('Error processing document:', error); - throw error; - } -} - -// Usage example -const typedAreas = [ - {page: 1, x: 100, y: 200, width: 300, height: 150, type: "figure"}, - {page: 1, x: 450, y: 200, width: 250, height: 200, type: "table"} -]; - -const fileInput = document.getElementById('pdf-input'); -fileInput.addEventListener('change', async (event) => { - const file = event.target.files[0]; - if (file) { - try { - const result = await processWithTypedAreas(file, typedAreas); - console.log('Processing result:', result); - } catch (error) { - console.error('Processing failed:', error); - } - } -}); +**Pass the file content as the form field value:** +```bash +curl -v -H "Accept: application/xml" \ + --form input=@./document.pdf \ + --form "typedAreas=$(cat typed_areas.json)" \ + localhost:8070/api/processFulltextDocument ``` ## Error Handling diff --git a/grobid-core/src/main/java/org/grobid/core/document/Document.java b/grobid-core/src/main/java/org/grobid/core/document/Document.java index 6a8eee9ba9..9d8dae3b2e 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/Document.java +++ b/grobid-core/src/main/java/org/grobid/core/document/Document.java @@ -71,6 +71,7 @@ import java.util.Map; import java.util.Set; import java.util.SortedSet; +import java.util.TreeSet; import java.util.regex.Matcher; import java.util.stream.Collectors; @@ -796,13 +797,7 @@ public void setLabeledBlocks(SortedSetMultimap labeledBlo // helper public List getDocumentPieceTokenization(DocumentPiece dp) { - List subList = tokenizations.subList(dp.getLeft().getTokenDocPos(), dp.getRight().getTokenDocPos() + 1); - if (!excludedTokens.isEmpty()) { - return subList.stream() - .filter(t -> !excludedTokens.contains(t)) - .collect(Collectors.toList()); - } - return subList; + return tokenizations.subList(dp.getLeft().getTokenDocPos(), dp.getRight().getTokenDocPos() + 1); } public String 
getDocumentPieceText(DocumentPiece dp) { @@ -1777,10 +1772,49 @@ public void setIgnoredTokens(List ignoredTokens) { this.ignoredTokens = ignoredTokens != null ? ignoredTokens : new ArrayList<>(); } - public boolean isTokenExcluded(LayoutToken token) { + private boolean isTokenExcluded(LayoutToken token) { return excludedTokens.contains(token); } + /** + * Filters document pieces by splitting them around excluded token runs. + * Returns new pieces that skip over any tokens in the excludedTokens set. + */ + public SortedSet filterDocumentPiecesByExcludedTokens(SortedSet pieces) { + if (excludedTokens.isEmpty() || pieces == null || pieces.isEmpty()) { + return pieces; + } + SortedSet filtered = new TreeSet<>(); + for (DocumentPiece piece : pieces) { + int startPos = piece.getLeft().getTokenDocPos(); + int endPos = piece.getRight().getTokenDocPos(); + int runStart = -1; + for (int i = startPos; i <= endPos; i++) { + LayoutToken token = tokenizations.get(i); + if (!excludedTokens.contains(token)) { + if (runStart == -1) runStart = i; + } else { + if (runStart != -1) { + filtered.add(createPiece(runStart, i - 1)); + runStart = -1; + } + } + } + if (runStart != -1) { + filtered.add(createPiece(runStart, endPos)); + } + } + return filtered; + } + + private DocumentPiece createPiece(int startTokenDocPos, int endTokenDocPos) { + int startBlock = tokenizations.get(startTokenDocPos).getBlockPtr(); + int endBlock = tokenizations.get(endTokenDocPos).getBlockPtr(); + return new DocumentPiece( + new DocumentPointer(this, startBlock, startTokenDocPos), + new DocumentPointer(this, endBlock, endTokenDocPos)); + } + /** * Filters out layout tokens that fall within the specified typed areas and categorizes them by type. * Tokens in figure/table areas are collected for ML-based processing; tokens in ignore areas are discarded. 
@@ -1850,8 +1884,28 @@ public void filterLayoutTokensByTypedAreas(List typedAreas) { } } + recalculateBlockPointers(); + LOGGER.debug("Processed typed areas: {} figure tokens, {} table tokens, {} ignored tokens, {} excluded total", figureTokenCount, tableTokenCount, ignoredTokenCount, excludedTokens.size()); } + /** + * Recalculate blockPtr for all tokens based on the current blocks list. + * Ensures that each token's blockPtr correctly points to the block + * whose startToken <= tokenDocPos < nextBlock.startToken. + */ + private void recalculateBlockPointers() { + if (blocks == null || blocks.isEmpty() || tokenizations == null || tokenizations.isEmpty()) { + return; + } + int blockIdx = 0; + for (int i = 0; i < tokenizations.size(); i++) { + while (blockIdx < blocks.size() - 1 + && blocks.get(blockIdx + 1).getStartToken() <= i) { + blockIdx++; + } + tokenizations.get(i).setBlockPtr(blockIdx); + } + } } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 4fcae50d63..1ce9833338 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -164,6 +164,8 @@ public Document processing(DocumentSource documentSource, } SortedSet documentBodyParts = doc.getDocumentPart(SegmentationLabels.BODY); + // Filter body pieces to exclude typed area regions + documentBodyParts = doc.filterDocumentPiecesByExcludedTokens(documentBodyParts); // header processing BiblioItem headerResults = new BiblioItem(); @@ -495,7 +497,7 @@ private void postProcessFigureCaptions(List
figures, Document doc) { } private static String fixFiguresLabellingResults(Document doc, String bodyResults) { - List>>> updatedFigures = doc. assignGraphicObjectsToFigures(); + List>>> updatedFigures = doc.assignGraphicObjectsToFigures(); for(Triple>> update: updatedFigures) { List> difference = update.getRight(); @@ -1083,12 +1085,6 @@ else if (nbAuthorType > (bibDataSets.size() / 2)) LayoutToken token = tokens.get(n); - // Skip tokens that fall within typed areas (figure, table, ignore) - if (doc.isTokenExcluded(token)) { - n++; - continue; - } - layoutTokens.add(token); features = new FeaturesVectorFulltext(); diff --git a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java index 91c73ed8ee..8b0ba0b0c6 100755 --- a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java +++ b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java @@ -15,6 +15,7 @@ import org.grobid.core.engines.config.GrobidAnalysisConfig; import org.grobid.core.factory.AbstractEngineFactory; import org.grobid.core.factory.GrobidPoolingFactory; +import org.grobid.core.layout.AreaType; import org.grobid.core.utilities.GrobidProperties; import org.grobid.service.data.ServiceInfo; import org.grobid.service.process.GrobidRestProcessFiles; @@ -34,6 +35,7 @@ import java.io.InputStream; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.NoSuchElementException; import static org.grobid.core.GrobidModels.Flavor.BLANK; @@ -427,6 +429,14 @@ private List parseTypedAreas(String typedArea LOGGER.error("Failed to parse typed areas JSON: " + typedAreasJson, e); } + if (!typedAreasList.isEmpty()) { + Map countsByType = typedAreasList.stream() + .collect(java.util.stream.Collectors.groupingBy( + org.grobid.core.layout.IgnoreArea::getType, + java.util.stream.Collectors.counting())); + LOGGER.info("Received {} typed areas: {}", typedAreasList.size(), 
countsByType); + } + return typedAreasList; } From f91f218dc2cdb947724c64ebb436df80def3bb53 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 4 Mar 2026 21:04:44 +0100 Subject: [PATCH 4/4] feat: enhance table processing by grouping tokens by ignore areas --- .../org/grobid/core/document/Document.java | 8 ++ .../grobid/core/engines/FullTextParser.java | 83 ++++++++++++------- 2 files changed, 62 insertions(+), 29 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/Document.java b/grobid-core/src/main/java/org/grobid/core/document/Document.java index 9d8dae3b2e..0053982525 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/Document.java +++ b/grobid-core/src/main/java/org/grobid/core/document/Document.java @@ -66,6 +66,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.IdentityHashMap; +import java.util.LinkedHashMap; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -169,6 +170,7 @@ public void setImages(List images) { // tokens extracted from typed areas for specialized processing protected transient List figureTokens = new ArrayList<>(); protected transient List tableTokens = new ArrayList<>(); + protected transient Map> tableTokensByArea = new LinkedHashMap<>(); protected transient List ignoredTokens = new ArrayList<>(); // tokens that fall within typed areas and should be excluded from body processing @@ -1764,6 +1766,10 @@ public void setTableTokens(List tableTokens) { this.tableTokens = tableTokens != null ? 
tableTokens : new ArrayList<>(); } + public Map> getTableTokensByArea() { + return tableTokensByArea; + } + public List getIgnoredTokens() { return ignoredTokens; } @@ -1831,6 +1837,7 @@ public void filterLayoutTokensByTypedAreas(List typedAreas) { // Clear previous token lists figureTokens.clear(); tableTokens.clear(); + tableTokensByArea.clear(); ignoredTokens.clear(); figureAreas.clear(); tableAreas.clear(); @@ -1871,6 +1878,7 @@ public void filterLayoutTokensByTypedAreas(List typedAreas) { break; case TABLE: tableTokens.add(token); + tableTokensByArea.computeIfAbsent(area, k -> new ArrayList<>()).add(token); tableTokenCount++; break; case IGNORE: diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 1ce9833338..64df73f9f7 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -347,6 +347,10 @@ else if (config.getConsolidateCitations() == 2) LOGGER.debug("Fulltext model: The featured body is empty"); } + // Save typed area tables before annex processing (which overwrites doc.annexTables) + List
typedAreaTables = doc.getAnnexTables() != null + ? new ArrayList<>(doc.getAnnexTables()) : new ArrayList<>(); + // possible annexes (view as a piece of full text similar to the body) documentBodyParts = doc.getDocumentPart(SegmentationLabels.ANNEX); featSeg = getBodyTextFeatured(doc, documentBodyParts); @@ -421,6 +425,15 @@ else if (config.getConsolidateCitations() == 2) doc.setAnnexEquations(annexEquations); } + // Merge typed area tables back (they were overwritten by annex processing) + if (!typedAreaTables.isEmpty()) { + if (annexTables == null) { + annexTables = new ArrayList<>(); + } + annexTables.addAll(typedAreaTables); + doc.setAnnexTables(annexTables); + } + // post-process reference and footnote callout to keep them consistent (e.g. for example avoid that a footnote // callout in superscript is by error labeled as a numerical reference callout) List markerTypes = null; @@ -3988,49 +4001,61 @@ protected void processTypedAreas(Document doc) { LOGGER.debug("Created figure from typed areas via ML processing"); } - // Process table areas using the table ML model - if (!doc.getTableAreas().isEmpty() && !doc.getTableTokens().isEmpty()) { + // Process table areas using the table ML model - each area separately + if (!doc.getTableAreas().isEmpty() && !doc.getTableTokensByArea().isEmpty()) { if (doc.getAnnexTables() == null) { doc.setAnnexTables(new ArrayList<>()); } - List
tables = null; - try { - Pair> featurePair = - generateFeaturesForTokens(doc.getTableTokens(), doc); - if (featurePair != null && isNotBlank(featurePair.getLeft())) { - tables = parsers.getTableParser().processing( - featurePair.getRight(), featurePair.getLeft()); + for (Map.Entry> entry : doc.getTableTokensByArea().entrySet()) { + IgnoreArea area = entry.getKey(); + List areaTokens = entry.getValue(); + if (areaTokens.isEmpty()) { + continue; } - } catch (Exception e) { - LOGGER.warn("Table ML processing failed, falling back to direct construction", e); - } - if (CollectionUtils.isNotEmpty(tables)) { - for (Table table : tables) { - table.setLayoutTokens(doc.getTableTokens()); - for (LayoutToken lt : doc.getTableTokens()) { + List
tables = null; + try { + Pair> featurePair = + generateFeaturesForTokens(areaTokens, doc); + if (featurePair != null && isNotBlank(featurePair.getLeft())) { + tables = parsers.getTableParser().processing( + featurePair.getRight(), featurePair.getLeft()); + } + } catch (Exception e) { + LOGGER.warn("Table ML processing failed for area {}, falling back to direct construction", area, e); + } + + if (CollectionUtils.isNotEmpty(tables)) { + for (Table table : tables) { + for (LayoutToken lt : areaTokens) { + if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) { + table.setPage(lt.getPage()); + break; + } + } + LOGGER.info("Typed area table from {}: hasContent={}, tokenCount={}", + area, table.getContent() != null && table.getContent().length() > 0, + table.getLayoutTokens() != null ? table.getLayoutTokens().size() : 0); + doc.getAnnexTables().add(table); + } + } else { + // Fallback: create Table directly from tokens + Table table = new Table(); + table.setLayoutTokens(areaTokens); + table.setContent(new StringBuilder(LayoutTokensUtil.toText(areaTokens))); + for (LayoutToken lt : areaTokens) { if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) { table.setPage(lt.getPage()); break; } } + LOGGER.info("Typed area table (fallback) from {}: tokenCount={}", area, areaTokens.size()); doc.getAnnexTables().add(table); } - } else { - // Fallback: create Table directly from tokens - Table table = new Table(); - table.setLayoutTokens(doc.getTableTokens()); - table.setContent(new StringBuilder(LayoutTokensUtil.toText(doc.getTableTokens()))); - for (LayoutToken lt : doc.getTableTokens()) { - if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) { - table.setPage(lt.getPage()); - break; - } - } - doc.getAnnexTables().add(table); } - LOGGER.debug("Created table(s) from typed areas via ML processing"); + LOGGER.info("Created {} table(s) from {} typed areas", + doc.getAnnexTables().size(), 
doc.getTableTokensByArea().size()); } // Note: ignored areas are intentionally discarded and no further processing is performed