grobidOrg · kermitt2 · Oct 21, 2019 · Oct 21, 2019 · Dec 27, 2019 · Dec 27, 2019
diff --git a/build.gradle b/build.gradle
@@ -226,6 +226,8 @@ project("grobid-core") {
         compile 'javax.xml.bind:jaxb-api:2.3.0'
         compile 'black.ninia:jep:3.8.2'
         compile 'org.slf4j:slf4j-log4j12:1.7.25'
+        compile "fr.opensagres.xdocreport:org.apache.poi.xwpf.converter.pdf:1.0.6"
+        //compile group: 'org.apache.tika', name: 'tika-core', version: '1.22'
 
         shadedLib "org.apache.lucene:lucene-analyzers-common:4.5.1"
 

diff --git a/grobid-core/src/main/java/org/grobid/core/document/DocumentSource.java b/grobid-core/src/main/java/org/grobid/core/document/DocumentSource.java
@@ -12,14 +12,19 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
+import java.io.*;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
+import org.apache.poi.xwpf.converter.pdf.PdfConverter;
+import org.apache.poi.xwpf.converter.pdf.PdfOptions;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+
 /**
- * Input document to be processed, which could come from a PDF or directly be an XML file. 
- * If from a PDF document, this is the place where pdftoxml is called.
+ * Input document to be processed, which could come from a PDF, a doc/docx or directly be an XML file. 
+ * If from a PDF document, this is the place where pdfalto is called.
+ * If from a doc/docx document, this is the place where it is first converted to PDF with Apache POI.
  */
 public class DocumentSource {
     private static final Logger LOGGER = LoggerFactory.getLogger(DocumentSource.class);
@@ -30,9 +35,10 @@ public class DocumentSource {
     public static final int PDFTOXML_FILES_AMOUNT_LIMIT = 5000;
 
     private File pdfFile;
+    private File docxFile;
     private File xmlFile;
     boolean cleanupXml = false;
-
+    boolean cleanupPdf = false;
 
     private DocumentSource() {
     }
@@ -41,6 +47,10 @@ public static DocumentSource fromPdf(File pdfFile) {
         return fromPdf(pdfFile, -1, -1);
     }
 
+    public static DocumentSource fromDocx(File docxFile) {
+        return fromDocx(docxFile, -1, -1); // same -1/-1 start/end page arguments that fromPdf(File) passes
+    }
+
     /**
      * By default the XML extracted from the PDF is without images, to avoid flooding the grobid-home/tmp directory,
 	 * but with the extra annotation file and with outline	
@@ -49,6 +59,10 @@ public static DocumentSource fromPdf(File pdfFile, int startPage, int endPage) {
         return fromPdf(pdfFile, startPage, endPage, false, true, false);
     }
 
+    public static DocumentSource fromDocx(File docxFile, int startPage, int endPage) {
+        return fromDocx(docxFile, startPage, endPage, false, true, false); // withImages=false, withAnnotations=true, withOutline=false
+    }
+
     public static DocumentSource fromPdf(File pdfFile, int startPage, int endPage, 
 										 boolean withImages, boolean withAnnotations, boolean withOutline) {
         if (!pdfFile.exists() || pdfFile.isDirectory()) {
@@ -71,6 +85,35 @@ public static DocumentSource fromPdf(File pdfFile, int startPage, int endPage,
         return source;
     }
 
+    public static DocumentSource fromDocx(File docxFile, int startPage, int endPage, 
+                                         boolean withImages, boolean withAnnotations, boolean withOutline) {
+        if (!docxFile.exists() || docxFile.isDirectory()) {
+            throw new GrobidException("Input doc/docx file " + docxFile + " does not exist or a directory", 
+                GrobidExceptionStatus.BAD_INPUT_DATA);
+        }
+        // NOTE(review): source.pdfFile is never assigned below, so cleanupPdf gives close() nothing to delete
+        DocumentSource source = new DocumentSource();
+        source.cleanupXml = true;
+        source.cleanupPdf = true;
+        // preliminary convert doc/docx file into a PDF written to the temp path;
+        // docxToPdf logs the error and returns null when the conversion fails
+        File pdfFile = source.docxToPdf(docxFile, GrobidProperties.getTempPath());
+        // create an ALTO representation; NOTE(review): after a failed conversion xmlFile stays null and nothing is thrown
+        if (pdfFile != null) {
+            try {
+                source.xmlFile = source.pdf2xml(null, false, startPage, endPage, pdfFile, 
+                    GrobidProperties.getTempPath(), withImages, withAnnotations, withOutline);
+            } catch (Exception e) {
+                source.close(withImages, withAnnotations, withOutline);
+                throw e;
+            } finally {
+                source.cleanPdfFile(pdfFile); // the intermediate PDF is removed whether pdf2xml succeeded or not
+            }
+        }
+        source.docxFile = docxFile;
+        return source;
+    }
+
     private String getPdfToXmlCommand(boolean withImage, boolean withAnnotations, boolean withOutline) {
         StringBuilder pdfToXml = new StringBuilder();
         pdfToXml.append(GrobidProperties.getPdfToXMLPath().getAbsolutePath());
@@ -351,11 +394,85 @@ private boolean cleanXmlFile(File pathToXml, boolean cleanImages, boolean cleanA
         return success;
     }
 
+    private boolean cleanPdfFile(File pathToPdf) {
+        // nothing to delete: report "not deleted" without raising an error
+        if (pathToPdf == null) {
+            return false;
+        }
+        try {
+            if (!pathToPdf.exists()) {
+                return false;
+            }
+            if (!pathToPdf.delete()) {
+                throw new GrobidResourceException("Deletion of a temporary PDF file failed for file '" + pathToPdf.getAbsolutePath() + "'");
+            }
+            // the file existed and has been removed
+            return true;
+        } catch (GrobidResourceException deletionFailure) {
+            // our own failure report: propagate it untouched
+            throw deletionFailure;
+        } catch (Exception e) {
+            throw new GrobidResourceException("An exception occurred while deleting an PDF file '" + pathToPdf + "'.", e);
+        }
+    }
+
+    /**
+     * Convert a docx file to PDF using Apache POI (via the opensagres converter).
+     * The conversion runs synchronously in the calling thread. A failed conversion is logged,
+     * not thrown, and any partially written PDF is removed from the temp directory.
+     *
+     * @param docxFile  docx file to convert
+     * @param tmpPath   temp directory in which the converted file is created
+     * @return the converted file or null if conversion was impossible/failed
+     */
+    private File docxToPdf(File docxFile, File tmpPath) {
+        if (docxFile == null || !docxFile.exists()) {
+            LOGGER.error("Invalid doc/docx file for PDF conversion");
+            return null;
+        }
+
+        // target PDF file, created under a generated name in the temp directory
+        File pdfFile = new File(tmpPath, KeyGen.getKey() + ".pdf");
+        try (
+            InputStream is = new FileInputStream(docxFile);
+            OutputStream out = new FileOutputStream(pdfFile);
+        ) {
+            long start = System.currentTimeMillis();
+            // load the docx file into XWPFDocument
+            XWPFDocument document = new XWPFDocument(is);
+
+            // note: the default font encoding will be unicode, but it does not always work given the docx fonts,
+            // it is possible to set explicitly a font encoding like this:
+            // PdfOptions.create().fontEncoding("windows-1250");
+            // TBD: enforce a PDF/A conformance level (through an IPdfWriterConfiguration set on the options)
+            // for safer PDF processing by pdfalto
+            PdfOptions options = PdfOptions.create();
+
+            // converting XWPFDocument to PDF
+            PdfConverter.getInstance().convert(document, out, options);
+            LOGGER.info("docx file converted to PDF in : " + (System.currentTimeMillis() - start) + " milli seconds");
+
+            // TBD: for using the more recent version 2.0.2 of fr.opensagres.poi.xwpf.converter.core, see
+            // https://stackoverflow.com/questions/51330192/trying-to-make-simple-pdf-document-with-apache-poi
+        } catch (Throwable e) {
+            LOGGER.error("converting doc/docx into PDF failed", e);
+            // both streams are already closed at this point (try-with-resources), so a partially
+            // written PDF can be removed instead of being left behind in the temp directory
+            if (pdfFile.exists() && !pdfFile.delete()) {
+                LOGGER.warn("Could not delete partial PDF file '" + pdfFile.getAbsolutePath() + "'");
+            }
+            pdfFile = null;
+        }
+        return pdfFile;
+    }
 
     public void close(boolean cleanImages, boolean cleanAnnotations, boolean cleanOutline) {
         try {
             if (cleanupXml) {
                 cleanXmlFile(xmlFile, cleanImages, cleanAnnotations, cleanOutline);
+            } 
+            if (cleanupPdf) {
+                cleanPdfFile(pdfFile);
             }
         } catch (Exception e) {
             LOGGER.error("Cannot cleanup resources (just printing exception):", e);
@@ -369,21 +486,29 @@ public static void close(DocumentSource source, boolean cleanImages, boolean cle
     }
 
     public File getPdfFile() {
-        return pdfFile;
+        return this.pdfFile; // NOTE(review): fromDocx never assigns this field, so it is null there unless setPdfFile was called
     }
 
     public void setPdfFile(File pdfFile) {
         this.pdfFile = pdfFile;
     }
 
     public File getXmlFile() {
-        return xmlFile;
+        return this.xmlFile; // left unset (null) by fromDocx when the docx-to-PDF conversion failed
     }
 
-    public void setXmlFile(File xmlFile) {
+    public void setXmlFile(File docxFile) {
         this.xmlFile = xmlFile;
     }
 
+    public File getDocxFile() {
+        return this.docxFile; // the docx input itself, not the XML file derived from it
+    }
+
+    public void setDocxFile(File docxFile) {
+        this.docxFile = docxFile; // only stores the reference; no conversion is triggered here
+    }
+
 }
 
 

diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
@@ -5,6 +5,7 @@
 import org.apache.commons.collections4.CollectionUtils;
 import org.apache.commons.lang3.tuple.Pair;
 import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.FilenameUtils;
 
 import java.nio.charset.StandardCharsets;
 
@@ -105,11 +106,18 @@ public FullTextParser(EngineParsers parsers) {
         tmpPath = GrobidProperties.getTempPath();
     }
 
-	public Document processing(File inputPdf,
+	public Document processing(File input,
 							   GrobidAnalysisConfig config) throws Exception {
-		DocumentSource documentSource = 
-			DocumentSource.fromPdf(inputPdf, config.getStartPage(), config.getEndPage(), 
-				config.getPdfAssetPath() != null, true, false);
+        // a "docx" extension (any letter case) is routed to DocumentSource.fromDocx; anything else is handled as PDF
+        boolean withImages = config.getPdfAssetPath() != null;
+        String extension = FilenameUtils.getExtension(input.getName());
+        DocumentSource documentSource;
+        if ("docx".equalsIgnoreCase(extension)) {
+            documentSource = DocumentSource.fromDocx(input, config.getStartPage(), config.getEndPage(), withImages, true, false);
+        } else {
+            documentSource = DocumentSource.fromPdf(input, config.getStartPage(), config.getEndPage(), withImages, true, false);
+        }
+
 		return processing(documentSource, config);
 	}
 

diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java
@@ -1,8 +1,11 @@
 package org.grobid.core.engines;
 
 import com.google.common.base.Splitter;
+
 import org.apache.commons.lang3.tuple.ImmutablePair;
 import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.io.FilenameUtils;
+
 import org.grobid.core.GrobidModels;
 import org.grobid.core.data.BiblioItem;
 import org.grobid.core.data.Date;
@@ -75,7 +78,13 @@ public HeaderParser(EngineParsers parsers) {
     public Pair<String, Document> processing(File input, BiblioItem resHeader, GrobidAnalysisConfig config) {
         DocumentSource documentSource = null;
         try {
-            documentSource = DocumentSource.fromPdf(input, config.getStartPage(), config.getEndPage());
+            String extension = FilenameUtils.getExtension(input.getName());
+            if ( extension != null && (extension.toLowerCase().equals("docx")) ) {
+                documentSource = DocumentSource.fromDocx(input, config.getStartPage(), config.getEndPage());
+            } else {
+                documentSource = DocumentSource.fromPdf(input, config.getStartPage(), config.getEndPage());
+            }
+
             Document doc = parsers.getSegmentationParser().processing(documentSource, config);
 
             String tei = processingHeaderSection(config.getConsolidateHeader(), doc, resHeader);
@@ -91,10 +100,17 @@ public Pair<String, Document> processing(File input, BiblioItem resHeader, Grobi
      * Processing without application of the segmentation model, regex are used to identify the header
      * zone.
      */
-    public Pair<String, Document> processing2(String pdfInput, BiblioItem resHeader, GrobidAnalysisConfig config) {
+    public Pair<String, Document> processing2(String input, BiblioItem resHeader, GrobidAnalysisConfig config) {
         DocumentSource documentSource = null;
         try {
-            documentSource = DocumentSource.fromPdf(new File(pdfInput), config.getStartPage(), config.getEndPage());
+            String extension = FilenameUtils.getExtension(input);
+            if ( extension != null && (extension.toLowerCase().equals("docx")) ) {
+                documentSource = DocumentSource.fromDocx(new File(input), config.getStartPage(), config.getEndPage());
+            }
+            else {
+                documentSource = DocumentSource.fromPdf(new File(input), config.getStartPage(), config.getEndPage());
+            }
+
             Document doc = new Document(documentSource);
             doc.addTokenizedDocument(config);
 

diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/IOUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/IOUtilities.java
@@ -57,15 +57,19 @@ public static String readFile(String pPathToFile) throws IOException {
     }
 
     /**
-     * Write an input stream in temp directory.
+     * Write an input stream to a new file in the temp directory; the file extension defaults to ".pdf".
      */
     public static File writeInputFile(InputStream inputStream) {
+        return writeInputFile(inputStream, "pdf"); // newTempFile prepends the missing "." to the extension
+    }
+
+    public static File writeInputFile(InputStream inputStream, String extension) {
         LOGGER.debug(">> set origin document for stateless service'...");
 
         File originFile = null;
         OutputStream out = null;
         try {
-            originFile = newTempFile("origin", ".pdf");
+            originFile = newTempFile("origin", extension);
 
             out = new FileOutputStream(originFile);
 
@@ -99,6 +103,8 @@ public static File writeInputFile(InputStream inputStream) {
      */
     public static File newTempFile(String fileName, String extension) {
         try {
+            if (!extension.startsWith("."))
+                extension = "." + extension;
             return File.createTempFile(fileName, extension, GrobidProperties.getTempPath());
         } catch (IOException e) {
             throw new GrobidResourceException(

diff --git a/grobid-core/src/test/resources/docx/Hindawi_template.docx b/grobid-core/src/test/resources/docx/Hindawi_template.docx
diff --git a/grobid-core/src/test/resources/docx/IAP_Template.docx b/grobid-core/src/test/resources/docx/IAP_Template.docx
diff --git a/grobid-core/src/test/resources/docx/IJSRP-paper-submission-format-double-column.docx b/grobid-core/src/test/resources/docx/IJSRP-paper-submission-format-double-column.docx
diff --git a/grobid-core/src/test/resources/docx/IJSRP-paper-submission-format-single-column.docx b/grobid-core/src/test/resources/docx/IJSRP-paper-submission-format-single-column.docx
diff --git a/grobid-core/src/test/resources/docx/JIM_paper_Template.docx b/grobid-core/src/test/resources/docx/JIM_paper_Template.docx
diff --git a/grobid-core/src/test/resources/docx/JPCSExampleWordDocument.docx b/grobid-core/src/test/resources/docx/JPCSExampleWordDocument.docx
diff --git a/grobid-core/src/test/resources/docx/Journal-Manuscript-Format-MS-Office-2007.docx b/grobid-core/src/test/resources/docx/Journal-Manuscript-Format-MS-Office-2007.docx
diff --git a/grobid-core/src/test/resources/docx/Microsoft-Word-template.docx b/grobid-core/src/test/resources/docx/Microsoft-Word-template.docx
diff --git a/grobid-core/src/test/resources/docx/aebj_template_201403.docx b/grobid-core/src/test/resources/docx/aebj_template_201403.docx
diff --git a/grobid-core/src/test/resources/docx/conference_article_two_columns.docx b/grobid-core/src/test/resources/docx/conference_article_two_columns.docx
diff --git a/grobid-core/src/test/resources/docx/interim-layout.docx b/grobid-core/src/test/resources/docx/interim-layout.docx
diff --git a/grobid-core/src/test/resources/docx/journal_Word_template.docx b/grobid-core/src/test/resources/docx/journal_Word_template.docx
diff --git a/grobid-core/src/test/resources/docx/pg4-sample-word-template.docx b/grobid-core/src/test/resources/docx/pg4-sample-word-template.docx
diff --git a/grobid-core/src/test/resources/docx/science_manuscript_word_template.docx b/grobid-core/src/test/resources/docx/science_manuscript_word_template.docx