diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 75c8eba0d5..6d9c82a281 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -755,8 +755,8 @@ - - + + diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index cb37992bcc..e3f2366220 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -140,9 +140,15 @@ public void testFontNameExtraction() throws Exception { assertContains("ABCDEE+Calibri", r.metadata.get(Font.FONT_NAME)); } + @Test + public void testGarbageBeforeHeader() throws Exception { + Metadata metadata = getXML("testPDF_garbageBeforeHeader.pdf").metadata; + assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); + } + @Test public void testPdfParsingMetadataOnly() throws Exception { - + Metadata metadata = getXML("testPDF.pdf").metadata; assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Bertrand Delacr\u00e9taz", metadata.get(TikaCoreProperties.CREATOR)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_garbageBeforeHeader.pdf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_garbageBeforeHeader.pdf new file mode 100644 index 0000000000..41e88afcd1 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_garbageBeforeHeader.pdf differ