Utkarsh212 · JavierBagatoli · Sep 8, 2025 · Nov 21, 2025 · Nov 21, 2025 · Nov 21, 2025
diff --git a/.github/workflows/npm-publish.yml b/.github/workflows/npm-publish.yml
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2024 Utkarsh Pancholi
+Copyright (c) 2025 Javier Bagatoli
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/README.md b/README.md
@@ -1,13 +1,15 @@
-# react-pdftotext
+# react-pdftotext-advanced
 
-Light-weight memory-safe client library for extracting plain text from pdf files.
+This is a library based on "react-pdftotext" that aims to format text for readability without requiring extensive coding.
+
+This version separates paragraph and page endings, taking into account expected spacing and page breaks.
 
 ## Installing
 
 Using npm:
 
 ```js
-npm install react-pdftotext
+npm install react-pdftotext-advanced
 ```
 
 ## Example
@@ -22,35 +24,48 @@ Now add a input tag with type="file" to take file input.
 
 Import the pdf2text function from package
 
-```js
-import pdfToText from "react-pdftotext";
+```ts
+//simple mode
+//input Base text
+//Good morning everyone.
+//
+//How are you all?
+//
+//I hope you're well.
+import selectModeToExtract from "react-pdftotext-advanced";
 
 function extractText(event) {
   const file = event.target.files[0];
-  pdfToText(file)
+  selectModeToExtract(file, 'simple')
     .then((text) => console.log(text))
     .catch((error) => console.error("Failed to extract text from pdf"));
 }
+//output Base text
+// Good morning everyone.How are you all?I hope you're well.
 ```
 
-**Remote PDF File Input**
-
-For Pdf files stored at remote locations
-
-```js
-import pdfToText from 'react-pdftotext'
-
-const pdf_url = "REMOTE_PDF_URL"
+```ts
+//Advanced mode
+//input text
+//Good morning everyone.
+//
+//How are you all?
+//
+//I hope you're well.
+import selectModeToExtract from "react-pdftotext-advanced";
 
-function extractText() {
-    const file = await fetch(pdf_url)
-        .then(res => res.blob())
-        .catch(error => console.error(error))
-
-    pdfToText(file)
-        .then(text => console.log(text))
-        .catch(error => console.error("Failed to extract text from pdf"))
+function extractText(event) {
+  const file = event.target.files[0];
+  selectModeToExtract(file, 'advanced')
+    .then((text) => console.log(text))
+    .catch((error) => console.error("Failed to extract text from pdf"));
 }
+//output text
+//Good morning everyone.
+//
+//How are you all?
+//
+//I hope you're well.
 ```
 
 ## Contributing

diff --git a/aux-funcitons.ts b/aux-funcitons.ts
@@ -0,0 +1,17 @@
+/**
+ * Returns the most frequent X position recorded in `controlPosX`, plus 2.
+ * Keys are X positions as strings, values are occurrence counts; ties keep the first key and an empty map yields 2.
+ */
+const calculateStartLine = (controlPosX: {[key: string]: number}): number => {
+  let topKey = '';
+  let topCount = 0;
+  for (const key of Object.keys(controlPosX)) {
+    if (controlPosX[key] > topCount) {
+      topKey = key;
+      topCount = controlPosX[key] as number;
+    }
+  }
+  return Number(topKey) + 2;
+}
+
+export default calculateStartLine;
diff --git a/index.ts b/index.ts
@@ -1,4 +1,5 @@
 import { pdfjs } from "react-pdf";
+import calculateStartLine from "./aux-funcitons";
 
 // Path to the pdf.worker.js file
 pdfjs.GlobalWorkerOptions.workerSrc = `//cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.mjs`;
@@ -42,4 +43,123 @@ const pdfToText = async (file: File | Blob | MediaSource): Promise<string> => {
   return extractedText;
 };
 
-export default pdfToText;
+/**
+ * Extracts text from a PDF file, inserting line breaks inferred from the position of each text item.
+ * @param {File | Blob | MediaSource} file - The PDF file to extract text from.
+ * @param lineSpacing - Multiplier applied to the average text height when detecting line and paragraph breaks.
+ * @returns {Promise<string>} A promise that resolves with the extracted text content.
+ * @throws {Error} If loading or reading the PDF fails; the message wraps the underlying error.
+ */
+const pdfToTextLikePDF = async (file: File | Blob | MediaSource, lineSpacing: number = 1): Promise<string> => {
+  // Create a blob URL for the PDF file
+  const blobUrl = URL.createObjectURL(file);
+  // Load the PDF file
+  const loadingTask = pdfjs.getDocument(blobUrl);
+
+  let extractedText = "";
+  try {
+    const pdf = await loadingTask.promise;
+    const numPages = pdf.numPages;
+
+    // Tracks page progress: a newline is emitted (and this is incremented) while it is below the current page number
+    let lastPage = 1
+    // Average text height; keeps this default when no usable item is sampled
+    let sizeFontProm = 10;
+    let totalFontSize = 0;
+    let totalTokens = 0;
+    // First pass (at most 5 pages): sample text heights and count occurrences of each integer X position
+    let controlPosX: {[key: string]: number} = {}
+    for (let pageNumber = 1; pageNumber <= numPages && pageNumber <= 5; pageNumber++) {
+      const page = await pdf.getPage(pageNumber);
+      const textContent = await page.getTextContent();
+
+      textContent.items.forEach((item) => {
+        if("height" in item){
+          if(item.height > 3 && item.str){
+            totalFontSize = totalFontSize + item.height;
+            totalTokens++
+          }
+
+          const name: string =(((item.transform[4]).toString()).split ('.'))[0]
+          controlPosX[name] = (controlPosX[name] || 0) + 1
+        }
+      })
+    }
+    // Guard against 0/0: a NaN average would make every height-based comparison below evaluate to false
+    if(totalTokens > 0) sizeFontProm = totalFontSize/totalTokens;
+    const startLine = calculateStartLine(controlPosX);
+
+    for (let pageNumber = 1; pageNumber <= numPages; pageNumber++) {
+      const page = await pdf.getPage(pageNumber);
+      const textContent = await page.getTextContent();
+
+      let lastToken = ''
+      let lastLastPositionY = 0
+      let lastPositionY = 0
+
+      const pageText = textContent.items
+        .map((item, index) => {
+          if("str" in item){
+            lastToken = ''
+
+            lastLastPositionY = lastPositionY
+            lastPositionY = item.transform[5]
+
+            // Header: the previous item was at Y >= 790 (fixed threshold; the page height is not checked)
+            if(lastLastPositionY >= 790){
+              lastToken = '\n'
+            }
+            // Page change: first text item seen after moving past page `lastPage`
+            if(lastPage < pageNumber && pageNumber !== 1){
+              lastToken = '\n';
+              lastPage++;
+            }
+
+            // End of line: Y dropped by more than 1.6 x average height x lineSpacing (skipped for the first two items)
+            if(index > 1 && (lastLastPositionY-(sizeFontProm*1.6*lineSpacing) > item.transform[5])){
+              lastToken = '\n'
+            }
+
+            // New paragraph: Y dropped by more than average height x lineSpacing and X lies right of startLine
+            if(lastLastPositionY-(sizeFontProm*lineSpacing) > item.transform[5]
+              && item.transform[4] > startLine
+            ){
+              lastToken = '\n'
+            }
+
+            // Page footer: Y increased relative to the previous item
+            if((index > 0 && lastLastPositionY < item.transform[5] )){
+              lastToken = '\n'
+            }
+
+            // Jump: the current item sits at Y === 0
+            if(lastPositionY === 0){
+              lastToken = '\n'
+            }
+
+            return lastToken + (item.str === ''? ' ': '') + item.str
+          }})
+        .join('');
+      extractedText += pageText;
+    }
+  } catch (error) {
+    throw new Error(`Failed to extract text from PDF: ${error}`);
+  } finally {
+    // Clean up the blob URL
+    URL.revokeObjectURL(blobUrl);
+
+    // Free memory from loading task
+    loadingTask.destroy();
+  }
+  return extractedText;
+};
+
+/** Extracts PDF text with `pdfToText` when `mode` is 'simple', otherwise with `pdfToTextLikePDF` and `lineSpacing`. */
+const selectModeToExtract = async (file: File | Blob | MediaSource, mode: 'simple' | 'advanced', lineSpacing: number = 1): Promise<string> => {
+  // `lineSpacing` is only forwarded to the layout-aware extractor; the simple path ignores it.
+  return mode === 'simple'
+    ? pdfToText(file)
+    : pdfToTextLikePDF(file, lineSpacing);
+}
+
+export default selectModeToExtract;
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
-  "name": "react-pdftotext",
-  "version": "1.3.4",
+  "name": "react-pdftotext-advanced",
+  "version": "1.3.5",
   "description": "A simple light weight react package to extract plain text from a pdf file.",
   "main": "dist/index.js",
   "module": "dist/index.js",
@@ -12,7 +12,7 @@
   },
   "repository": {
     "type": "git",
-    "url": "https://github.com/Utkarsh212/react-pdftotext.git"
+    "url": "https://github.com/JavierBagatoli/react-pdftotext.git"
   },
   "keywords": [
     "react-pdf",
@@ -22,9 +22,10 @@
     "react",
     "pdf2text",
     "pdfjs",
-    "pdf-to-text"
+    "pdf-to-text",
+    "typescript"
   ],
-  "author": "Utkarsh Pancholi",
+  "author": "Javier Bagatoli",
   "license": "MIT",
   "dependencies": {
     "pdfjs-dist": "^4.6.82",