Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,8 @@ project("grobid-core") {
compile 'javax.xml.bind:jaxb-api:2.3.0'
compile 'black.ninia:jep:3.8.2'
compile 'org.slf4j:slf4j-log4j12:1.7.25'
compile "fr.opensagres.xdocreport:org.apache.poi.xwpf.converter.pdf:1.0.6"
//compile group: 'org.apache.tika', name: 'tika-core', version: '1.22'

shadedLib "org.apache.lucene:lucene-analyzers-common:4.5.1"

Expand Down
139 changes: 132 additions & 7 deletions grobid-core/src/main/java/org/grobid/core/document/DocumentSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,19 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.poi.xwpf.converter.pdf.PdfConverter;
import org.apache.poi.xwpf.converter.pdf.PdfOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

/**
* Input document to be processed, which could come from a PDF or directly be an XML file.
* If from a PDF document, this is the place where pdftoxml is called.
* Input document to be processed, which could come from a PDF, a doc/docx or directly be an XML file.
* If from a PDF document, this is the place where pdfalto is called.
* If from a doc/docx document, this is the place where the conversion to PDF with Apache POI is performed.
*/
public class DocumentSource {
private static final Logger LOGGER = LoggerFactory.getLogger(DocumentSource.class);
Expand All @@ -30,9 +35,10 @@ public class DocumentSource {
public static final int PDFTOXML_FILES_AMOUNT_LIMIT = 5000;

private File pdfFile;
private File docxFile;
private File xmlFile;
boolean cleanupXml = false;

boolean cleanupPdf = false;

private DocumentSource() {
}
Expand All @@ -41,6 +47,10 @@ public static DocumentSource fromPdf(File pdfFile) {
return fromPdf(pdfFile, -1, -1);
}

/**
 * Builds a document source from a doc/docx file, passing -1 as both the start
 * page and the end page to the page-range overload.
 *
 * @param docxFile the doc/docx file to load
 * @return the document source built by the page-range overload
 */
public static DocumentSource fromDocx(File docxFile) {
    final int startPage = -1;
    final int endPage = -1;
    return fromDocx(docxFile, startPage, endPage);
}

/**
* By default the XML extracted from the PDF is without images, to avoid flooding the grobid-home/tmp directory,
* but with the extra annotation file and with outline
Expand All @@ -49,6 +59,10 @@ public static DocumentSource fromPdf(File pdfFile, int startPage, int endPage) {
return fromPdf(pdfFile, startPage, endPage, false, true, false);
}

/**
 * Builds a document source from a doc/docx file for the given page range,
 * with images disabled, annotations enabled and outline disabled.
 *
 * @param docxFile  the doc/docx file to load
 * @param startPage start page forwarded to the full overload
 * @param endPage   end page forwarded to the full overload
 * @return the document source built by the full overload
 */
public static DocumentSource fromDocx(File docxFile, int startPage, int endPage) {
    final boolean withImages = false;
    final boolean withAnnotations = true;
    final boolean withOutline = false;
    return fromDocx(docxFile, startPage, endPage, withImages, withAnnotations, withOutline);
}

public static DocumentSource fromPdf(File pdfFile, int startPage, int endPage,
boolean withImages, boolean withAnnotations, boolean withOutline) {
if (!pdfFile.exists() || pdfFile.isDirectory()) {
Expand All @@ -71,6 +85,35 @@ public static DocumentSource fromPdf(File pdfFile, int startPage, int endPage,
return source;
}

/**
 * Builds a document source from a doc/docx file. The file is first converted into a
 * temporary PDF in the GROBID temp directory, and that PDF is then handed to pdf2xml;
 * the result of pdf2xml becomes the XML file of the returned source. The temporary PDF
 * is always removed before this method returns, whether pdf2xml succeeded or not.
 *
 * @param docxFile        the doc/docx file to load
 * @param startPage       start page forwarded to pdf2xml
 * @param endPage         end page forwarded to pdf2xml
 * @param withImages      forwarded to pdf2xml, and to close() if pdf2xml fails
 * @param withAnnotations forwarded to pdf2xml, and to close() if pdf2xml fails
 * @param withOutline     forwarded to pdf2xml, and to close() if pdf2xml fails
 * @return a document source holding the generated XML file and the input doc/docx file
 * @throws GrobidException with status BAD_INPUT_DATA if the input does not exist, is a
 *                         directory, or could not be converted into PDF
 */
public static DocumentSource fromDocx(File docxFile, int startPage, int endPage,
                                      boolean withImages, boolean withAnnotations, boolean withOutline) {
    if (!docxFile.exists() || docxFile.isDirectory()) {
        throw new GrobidException("Input doc/docx file " + docxFile + " does not exist or a directory",
            GrobidExceptionStatus.BAD_INPUT_DATA);
    }

    DocumentSource source = new DocumentSource();
    source.cleanupXml = true;
    source.cleanupPdf = true;
    source.docxFile = docxFile;

    // preliminary convert doc/docx file into PDF
    File pdfFile = source.docxToPdf(docxFile, GrobidProperties.getTempPath());
    if (pdfFile == null) {
        // docxToPdf has already logged the cause; fail here instead of handing back a
        // source without any XML file, which would only break later in the pipeline
        throw new GrobidException("Conversion of doc/docx file " + docxFile + " into PDF failed",
            GrobidExceptionStatus.BAD_INPUT_DATA);
    }

    // create an ALTO representation
    try {
        source.xmlFile = source.pdf2xml(null, false, startPage, endPage, pdfFile,
            GrobidProperties.getTempPath(), withImages, withAnnotations, withOutline);
    } catch (Exception e) {
        source.close(withImages, withAnnotations, withOutline);
        throw e;
    } finally {
        // the intermediate PDF is no longer needed; a failed deletion must neither hide
        // the exception being propagated nor fail an otherwise successful conversion
        try {
            source.cleanPdfFile(pdfFile);
        } catch (GrobidResourceException cleanupFailure) {
            LOGGER.warn("Could not delete temporary PDF file " + pdfFile, cleanupFailure);
        }
    }
    return source;
}

private String getPdfToXmlCommand(boolean withImage, boolean withAnnotations, boolean withOutline) {
StringBuilder pdfToXml = new StringBuilder();
pdfToXml.append(GrobidProperties.getPdfToXMLPath().getAbsolutePath());
Expand Down Expand Up @@ -351,11 +394,85 @@ private boolean cleanXmlFile(File pathToXml, boolean cleanImages, boolean cleanA
return success;
}

/**
 * Deletes the given PDF file if it exists.
 *
 * @param pathToPdf the PDF file to delete, may be null
 * @return true if the file existed and was deleted, false if the argument is null
 *         or the file does not exist
 * @throws GrobidResourceException if the deletion fails or any other exception is
 *                                 raised while deleting
 */
private boolean cleanPdfFile(File pathToPdf) {
    if (pathToPdf == null) {
        return false;
    }
    try {
        if (!pathToPdf.exists()) {
            return false;
        }
        final boolean deleted = pathToPdf.delete();
        if (!deleted) {
            throw new GrobidResourceException("Deletion of a temporary PDF file failed for file '" + pathToPdf.getAbsolutePath() + "'");
        }
        return true;
    } catch (GrobidResourceException deletionFailure) {
        // our own failure signal: propagate untouched
        throw deletionFailure;
    } catch (Exception unexpected) {
        throw new GrobidResourceException("An exception occurred while deleting an PDF file '" + pathToPdf + "'.", unexpected);
    }
}

/**
 * Convert a doc/docx file to PDF format using Apache POI (via the opensagres converter).
 * The current thread is used for the execution.
 *
 * @param docxFile docx/doc file to convert
 * @param tmpPath  directory in which the converted PDF file is created
 * @return the converted PDF file, or null if the input file is missing or the
 *         conversion failed (in which case no partial PDF file is left behind)
 */
private File docxToPdf(File docxFile, File tmpPath) {
    if (docxFile == null || !docxFile.exists()) {
        LOGGER.error("Invalid doc/docx file for PDF conversion");
        return null;
    }

    // target PDF file
    File pdfFile = new File(tmpPath, KeyGen.getKey() + ".pdf");
    try (
        InputStream is = new FileInputStream(docxFile);
        OutputStream out = new FileOutputStream(pdfFile)
    ) {
        long start = System.currentTimeMillis();
        // load the docx file into XWPFDocument
        XWPFDocument document = new XWPFDocument(is);
        // PDF options
        PdfOptions options = PdfOptions.create();

        // note: the default font encoding will be unicode, but it does not always work given the docx fonts,
        // it is possible to set explicitly a font encoding like this:
        // options = PdfOptions.create().fontEncoding("windows-1250");

        // ensure PDF/A conformance level, for safer PDF processing by pdfalto
        /*options.setConfiguration( new IPdfWriterConfiguration() {
            public void configure( PdfWriter writer ) {
                writer.setPDFXConformance( PdfWriter.PDFA1A );
            }
        });*/

        // converting XWPFDocument to PDF
        PdfConverter.getInstance().convert(document, out, options);
        LOGGER.info("docx file converted to PDF in : {} milli seconds", System.currentTimeMillis() - start);

        // TBD: for using the more recent version 2.0.2 of fr.opensagres.poi.xwpf.converter.core, see
        // https://stackoverflow.com/questions/51330192/trying-to-make-simple-pdf-document-with-apache-poi
    } catch (Throwable e) {
        LOGGER.error("converting doc/docx into PDF failed", e);
        // Opening the output stream may already have created the target file. Both streams
        // are closed by now, so remove the incomplete PDF instead of leaking it in the
        // temp directory.
        if (pdfFile.exists() && !pdfFile.delete()) {
            LOGGER.warn("Could not delete incomplete PDF file " + pdfFile.getAbsolutePath());
        }
        return null;
    }
    return pdfFile;
}

public void close(boolean cleanImages, boolean cleanAnnotations, boolean cleanOutline) {
try {
if (cleanupXml) {
cleanXmlFile(xmlFile, cleanImages, cleanAnnotations, cleanOutline);
}
if (cleanupPdf) {
cleanPdfFile(pdfFile);
}
} catch (Exception e) {
LOGGER.error("Cannot cleanup resources (just printing exception):", e);
Expand All @@ -369,21 +486,29 @@ public static void close(DocumentSource source, boolean cleanImages, boolean cle
}

/**
 * @return the PDF file held by this document source, or null if none has been set
 */
public File getPdfFile() {
    return this.pdfFile;
}

/**
 * Replaces the PDF file held by this document source.
 *
 * @param pdfFile the PDF file to store
 */
public void setPdfFile(final File pdfFile) {
    this.pdfFile = pdfFile;
}

/**
 * @return the XML file held by this document source, or null if none has been set
 */
public File getXmlFile() {
    return this.xmlFile;
}

public void setXmlFile(File xmlFile) {
public void setXmlFile(File docxFile) {
this.xmlFile = xmlFile;
}

/**
 * @return the doc/docx file held by this document source, or null if none has been set
 */
public File getDocxFile() {
    return this.docxFile;
}

/**
 * Replaces the doc/docx file held by this document source.
 *
 * @param docxFile the doc/docx file to store
 */
public void setDocxFile(final File docxFile) {
    this.docxFile = docxFile;
}

}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;

import java.nio.charset.StandardCharsets;

Expand Down Expand Up @@ -105,11 +106,18 @@ public FullTextParser(EngineParsers parsers) {
tmpPath = GrobidProperties.getTempPath();
}

public Document processing(File inputPdf,
public Document processing(File input,
GrobidAnalysisConfig config) throws Exception {
DocumentSource documentSource =
DocumentSource.fromPdf(inputPdf, config.getStartPage(), config.getEndPage(),
config.getPdfAssetPath() != null, true, false);
DocumentSource documentSource = null;
String extension = FilenameUtils.getExtension(input.getName());
if ( extension != null && (extension.toLowerCase().equals("docx")) ) {
documentSource = DocumentSource.fromDocx(input, config.getStartPage(), config.getEndPage(),
config.getPdfAssetPath() != null, true, false);
} else {
documentSource = DocumentSource.fromPdf(input, config.getStartPage(), config.getEndPage(),
config.getPdfAssetPath() != null, true, false);
}

return processing(documentSource, config);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
package org.grobid.core.engines;

import com.google.common.base.Splitter;

import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.io.FilenameUtils;

import org.grobid.core.GrobidModels;
import org.grobid.core.data.BiblioItem;
import org.grobid.core.data.Date;
Expand Down Expand Up @@ -75,7 +78,13 @@ public HeaderParser(EngineParsers parsers) {
public Pair<String, Document> processing(File input, BiblioItem resHeader, GrobidAnalysisConfig config) {
DocumentSource documentSource = null;
try {
documentSource = DocumentSource.fromPdf(input, config.getStartPage(), config.getEndPage());
String extension = FilenameUtils.getExtension(input.getName());
if ( extension != null && (extension.toLowerCase().equals("docx")) ) {
documentSource = DocumentSource.fromDocx(input, config.getStartPage(), config.getEndPage());
} else {
documentSource = DocumentSource.fromPdf(input, config.getStartPage(), config.getEndPage());
}

Document doc = parsers.getSegmentationParser().processing(documentSource, config);

String tei = processingHeaderSection(config.getConsolidateHeader(), doc, resHeader);
Expand All @@ -91,10 +100,17 @@ public Pair<String, Document> processing(File input, BiblioItem resHeader, Grobi
* Processing without application of the segmentation model, regex are used to identify the header
* zone.
*/
public Pair<String, Document> processing2(String pdfInput, BiblioItem resHeader, GrobidAnalysisConfig config) {
public Pair<String, Document> processing2(String input, BiblioItem resHeader, GrobidAnalysisConfig config) {
DocumentSource documentSource = null;
try {
documentSource = DocumentSource.fromPdf(new File(pdfInput), config.getStartPage(), config.getEndPage());
String extension = FilenameUtils.getExtension(input);
if ( extension != null && (extension.toLowerCase().equals("docx")) ) {
documentSource = DocumentSource.fromDocx(new File(input), config.getStartPage(), config.getEndPage());
}
else {
documentSource = DocumentSource.fromPdf(new File(input), config.getStartPage(), config.getEndPage());
}

Document doc = new Document(documentSource);
doc.addTokenizedDocument(config);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,19 @@ public static String readFile(String pPathToFile) throws IOException {
}

/**
 * Write an input stream to a file in the temp directory, using the "pdf" extension.
 *
 * @param inputStream the stream to write
 * @return the file returned by the two-argument overload for the "pdf" extension
 */
public static File writeInputFile(InputStream inputStream) {
    final String defaultExtension = "pdf";
    return writeInputFile(inputStream, defaultExtension);
}

public static File writeInputFile(InputStream inputStream, String extension) {
LOGGER.debug(">> set origin document for stateless service'...");

File originFile = null;
OutputStream out = null;
try {
originFile = newTempFile("origin", ".pdf");
originFile = newTempFile("origin", extension);

out = new FileOutputStream(originFile);

Expand Down Expand Up @@ -99,6 +103,8 @@ public static File writeInputFile(InputStream inputStream) {
*/
public static File newTempFile(String fileName, String extension) {
try {
if (!extension.startsWith("."))
extension = "." + extension;
return File.createTempFile(fileName, extension, GrobidProperties.getTempPath());
} catch (IOException e) {
throw new GrobidResourceException(
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading