Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion python/test/reader/reader2doc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def runTest(self):
self.assertTrue(result_df.select("document").count() > 0)

@pytest.mark.fast
class Reader2DocTestHierarchy(unittest.TestCase):
class Reader2DocTestHTMLHierarchy(unittest.TestCase):

def setUp(self):
spark = SparkContextForTest.spark
Expand All @@ -159,6 +159,39 @@ def runTest(self):

all_sentences = [elem for row in rows for elem in row.sentence]

# Check for required metadata keys
for s in all_sentences:
metadata = s.metadata
assert (
"element_id" in metadata or "parent_id" in metadata
), f"❌ Missing 'element_id' or 'parent_id' in metadata: {metadata}"


@pytest.mark.fast
class Reader2DocTestPDFHierarchy(unittest.TestCase):

def setUp(self):
spark = SparkContextForTest.spark
self.empty_df = spark.createDataFrame([], "string").toDF("text")

def runTest(self):
reader2doc = Reader2Doc() \
.setContentType("application/pdf") \
.setContentPath(f"file:///{os.getcwd()}/../src/test/resources/reader/pdf/hierarchy_test.pdf") \
.setOutputCol("document")

sentence_detector = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence")

pipeline = Pipeline(stages=[reader2doc, sentence_detector])
model = pipeline.fit(self.empty_df)

result_df = model.transform(self.empty_df)
rows = result_df.select("sentence").collect()

all_sentences = [elem for row in rows for elem in row.sentence]

# Check for required metadata keys
for s in all_sentences:
metadata = s.metadata
Expand Down
30 changes: 22 additions & 8 deletions src/main/scala/com/johnsnowlabs/reader/PdfReader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}

import java.io.ByteArrayOutputStream
import java.util.UUID
import javax.imageio.ImageIO
import scala.collection.JavaConverters._
import scala.collection.mutable
Expand Down Expand Up @@ -117,15 +118,24 @@ class PdfReader(

private def extractElementsFromPdf(pdfDoc: PDDocument): Seq[HTMLElement] = {
val collectedElements = mutable.ListBuffer[HTMLElement]()
var currentParentId: Option[String] = None

val textStripper = new PDFTextStripper() {
override def writeString(
text: String,
textPositions: java.util.List[TextPosition]): Unit = {
val lineGroups = groupTextPositionsByLine(textPositions)
val lineElements = lineGroups.flatMap { case (_, linePositions) =>
classifyLineElement(linePositions, getCurrentPageNo)
classifyLineElement(linePositions, getCurrentPageNo, currentParentId)
}

// Update parentId when encountering titles
lineElements.foreach { elem =>
collectedElements += elem
if (elem.elementType == ElementType.TITLE)
currentParentId = Some(elem.metadata("element_id"))
}
collectedElements ++= lineElements

}
}
textStripper.setSortByPosition(true)
Expand All @@ -143,18 +153,21 @@ class PdfReader(

private def classifyLineElement(
linePositions: Seq[TextPosition],
pageNumber: Int): Option[HTMLElement] = {
pageNumber: Int,
currentParentId: Option[String]): Option[HTMLElement] = {
val lineText = linePositions.map(_.getUnicode).mkString.trim
if (lineText.isEmpty) return None

val averageFontSize = linePositions.map(_.getFontSize).sum / linePositions.size
val mostCommonFontName = linePositions.groupBy(_.getFont.getName).maxBy(_._2.size)._1

val isTitleLine = isTitle(averageFontSize, mostCommonFontName)
val elementType =
if (isTitle(averageFontSize, mostCommonFontName)) ElementType.TITLE
else ElementType.NARRATIVE_TEXT
if (isTitleLine) ElementType.TITLE else ElementType.NARRATIVE_TEXT

val metadata = mutable.Map("pageNumber" -> pageNumber.toString)
val metadata =
mutable.Map("pageNumber" -> pageNumber.toString, "element_id" -> UUID.randomUUID().toString)
// Assign parent_id only for narrative text or non-titles
if (!isTitleLine) currentParentId.foreach(pid => metadata("parent_id") = pid)
Some(HTMLElement(elementType, lineText, metadata))
}

Expand All @@ -176,7 +189,8 @@ class PdfReader(
"pageNumber" -> pageIndex.toString,
"format" -> "jpg",
"width" -> bufferedImage.getWidth.toString,
"height" -> bufferedImage.getHeight.toString)
"height" -> bufferedImage.getHeight.toString,
"element_id" -> UUID.randomUUID().toString)

Some(
HTMLElement(
Expand Down
101 changes: 82 additions & 19 deletions src/main/scala/com/johnsnowlabs/reader/WordReader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}

import java.io.{ByteArrayInputStream, IOException}
import java.util.UUID
import scala.collection.JavaConverters._
import scala.collection.mutable

Expand Down Expand Up @@ -127,6 +128,8 @@ class WordReader(
Array(0xd0.toByte, 0xcf.toByte, 0x11.toByte, 0xe0.toByte) // Bytes indicating .doc

private var pageBreak = 0
private var currentParentId: Option[String] = None
private def newUUID(): String = UUID.randomUUID().toString

private def isDocxFile(content: Array[Byte]): Boolean = {
content.length > 1 && content(0) == ZipMagicNumberFirstByte && content(
Expand Down Expand Up @@ -184,14 +187,16 @@ class WordReader(
elements
}

private def processParagraph(
private def processParagraphOld(
paragraph: XWPFParagraph,
source: String,
tableLocation: mutable.Map[String, String] = mutable.Map()): Option[HTMLElement] = {
val text = paragraph.getText.trim
if (text.isEmpty) None
else {
val metadata = mutable.Map[String, String]()
val elementId = newUUID()
metadata("element_id") = elementId

if (includePageBreaks) {
val isBreak = paragraph.isCustomPageBreak || paragraph.isSectionBreak
Expand All @@ -206,41 +211,93 @@ class WordReader(
}

val elementType = paragraph match {
case p if p.isTitle => ElementType.TITLE
case p if p.isTitle =>
currentParentId = Some(elementId)
ElementType.TITLE
case p if p.isListItem => ElementType.LIST_ITEM
case _ => if (source == "table") ElementType.TABLE else ElementType.NARRATIVE_TEXT
}
Some(HTMLElement(elementType, text, metadata))
}
}

/**
 * Converts one DOCX paragraph into an [[HTMLElement]] carrying hierarchy metadata.
 *
 * Every non-empty paragraph gets its own freshly generated `element_id`. A paragraph whose
 * style id starts with "heading" or "title" is emitted as a TITLE and becomes the current
 * parent; list items and all other paragraphs receive a `parent_id`.
 *
 * @param paragraph     paragraph to convert
 * @param source        origin of the paragraph; `"table"` turns paragraphs that are neither
 *                      headings nor list items into TABLE elements
 * @param tableLocation extra metadata supplied by the caller (used for table cells); it is
 *                      copied into the element's metadata, except for `element_id`
 * @return `None` when the trimmed paragraph text is empty, otherwise the classified element
 */
private def processParagraph(
    paragraph: XWPFParagraph,
    source: String,
    tableLocation: mutable.Map[String, String] = mutable.Map()): Option[HTMLElement] = {

  val text = paragraph.getText.trim
  if (text.isEmpty) None
  else {
    val elementId = newUUID()
    val metadata = mutable.Map[String, String]("element_id" -> elementId)

    // getStyleID may be null; Locale.ROOT keeps the check independent of the JVM default locale.
    val style =
      Option(paragraph.getStyleID).getOrElse("").toLowerCase(java.util.Locale.ROOT)
    val isHeading = style.startsWith("heading") || style.startsWith("title")

    // Handle page breaks if needed; a page break also resets the current parent.
    if (includePageBreaks) {
      val isBreak = paragraph.isCustomPageBreak || paragraph.isSectionBreak
      if (isBreak) {
        pageBreak += 1
        metadata += ("pageBreak" -> pageBreak.toString)
        currentParentId = None
      }
    }

    // Copy the caller's metadata but never let it replace this paragraph's own element_id:
    // the same tableLocation map is reused for every paragraph of a cell, so taking its id
    // would produce duplicate ids and break the link to currentParentId set for headings.
    metadata ++= tableLocation.iterator.filter { case (key, _) => key != "element_id" }

    // Fall back to the current heading only when the caller did not already supply a
    // parent (table cells arrive with the enclosing table's id as parent_id).
    def inheritParent(): Unit =
      if (!metadata.contains("parent_id"))
        currentParentId.foreach(pid => metadata("parent_id") = pid)

    val elementType = paragraph match {
      case _ if isHeading =>
        // A heading becomes the parent of the elements that follow it.
        currentParentId = Some(elementId)
        ElementType.TITLE
      case p if p.isListItem =>
        inheritParent()
        ElementType.LIST_ITEM
      case _ =>
        inheritParent()
        if (source == "table") ElementType.TABLE else ElementType.NARRATIVE_TEXT
    }

    Some(HTMLElement(elementType, text, metadata))
  }
}

private def processTable(table: XWPFTable): Seq[HTMLElement] = {
val tableHtml = if (inferTableStructure) Some(table.processAsHtml) else None

val tableId = newUUID()
val tableElements: Seq[HTMLElement] = table.getRows.asScala.zipWithIndex.flatMap {
case (row, rowIndex) =>
row.getTableCells.asScala.zipWithIndex.flatMap { case (cell, cellIndex) =>
val tableLocation = mutable.Map("tableLocation" -> s"($rowIndex, $cellIndex)")
val tableLocation = mutable.Map(
"tableLocation" -> s"($rowIndex, $cellIndex)",
"element_id" -> newUUID(),
"parent_id" -> tableId)
cell.getParagraphs.asScala.flatMap { paragraph =>
processParagraph(paragraph, "table", tableLocation)
}
}
}

if (tableHtml.isDefined) {
if (outputFormat == "html-table") {
val htmlElement =
HTMLElement(ElementType.HTML, tableHtml.get, mutable.Map.empty[String, String])
tableElements :+ htmlElement
} else if (outputFormat == "json-table") {
val tableElement = HTMLParser.parseFirstTableElement(tableHtml.get)
val jsonString = HTMLParser.tableElementToJson(tableElement)
val jsonElement =
HTMLElement(ElementType.JSON, jsonString, mutable.Map.empty[String, String])
tableElements :+ jsonElement
} else tableElements
} else tableElements
val tableMetadata = mutable.Map[String, String]("element_id" -> tableId)
currentParentId.foreach(pid => tableMetadata("parent_id") = pid)

val tableElement: Option[HTMLElement] = tableHtml.map { html =>
outputFormat match {
case "html-table" =>
HTMLElement(ElementType.HTML, html, tableMetadata)
case "json-table" =>
val tableElem = HTMLParser.parseFirstTableElement(html)
val jsonString = HTMLParser.tableElementToJson(tableElem)
HTMLElement(ElementType.JSON, jsonString, tableMetadata)
case _ =>
HTMLElement(ElementType.TABLE, table.getText.trim, tableMetadata)
}
}

tableElements ++ tableElement.toSeq
}

private def parseDocToElements(document: HWPFDocument): Seq[HTMLElement] = {
Expand All @@ -252,6 +309,10 @@ class WordReader(
if (text.isEmpty) None
else {
val metadata = mutable.Map[String, String]()
val elementId = newUUID()
metadata("element_id") = elementId
currentParentId.foreach(pid => metadata("parent_id") = pid)

paragraph match {
case p if p.isInTable(paragraphs) =>
val tableText = p.tableText(paragraphs).getOrElse("")
Expand All @@ -271,7 +332,8 @@ class WordReader(
document.getAllPictures.asScala.map { pic =>
val metadata = mutable.Map(
"format" -> pic.suggestFileExtension,
"imageType" -> pic.getPictureType.toString)
"imageType" -> pic.getPictureType.toString,
"element_id" -> newUUID())
HTMLElement(
elementType = ElementType.IMAGE,
content = "", // leave textual content empty
Expand All @@ -282,7 +344,8 @@ class WordReader(

private def extractImages(document: HWPFDocument): Seq[HTMLElement] = {
document.getPicturesTable.getAllPictures.asScala.map { pic =>
val metadata = mutable.Map("format" -> pic.suggestFileExtension)
val metadata = mutable.Map("format" -> pic.suggestFileExtension, "element_id" -> newUUID())
currentParentId.foreach(pid => metadata("parent_id") = pid)
HTMLElement(
elementType = ElementType.IMAGE,
content = "",
Expand Down
Binary file added src/test/resources/reader/doc/hierarchy_test.docx
Binary file not shown.
27 changes: 27 additions & 0 deletions src/test/resources/reader/md/simple-book.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Simple Book

## Index
- [Chapter 1: Beginnings](#chapter-1-beginnings)
- [Chapter 2: Middle Path](#chapter-2-middle-path)
- [Chapter 3: Finishing Touch](#chapter-3-finishing-touch)

---

## Chapter 1: Beginnings
Every project starts with a simple idea and a clear intention. In this chapter, we set the stage and outline the basic goals.
Small steps help build momentum and reduce uncertainty. With a plan in place, moving forward becomes much easier.
[Back to top](#simple-book)

---

## Chapter 2: Middle Path
Progress is rarely a straight line, and that is perfectly fine. Here we adjust our approach based on what we learn.
Iteration helps refine ideas and improves the final outcome. Staying flexible keeps the project healthy and on track.
[Back to top](#simple-book)

---

## Chapter 3: Finishing Touch
The final phase focuses on clarity and polish. We review the work, remove distractions, and keep what matters.
A simple, tidy result is easier to use and maintain. With that, the project is ready to share.
[Back to top](#simple-book)
Binary file added src/test/resources/reader/pdf/hierarchy_test.pdf
Binary file not shown.
Loading