From 1a61aff87c4138e65d1be882ed6c349afa5f278a Mon Sep 17 00:00:00 2001 From: Javier Bagatoli Date: Mon, 8 Sep 2025 15:07:37 -0300 Subject: [PATCH 1/4] Add: advanced mode --- aux-funcitons.ts | 17 +++++++ index.ts | 117 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 aux-funcitons.ts diff --git a/aux-funcitons.ts b/aux-funcitons.ts new file mode 100644 index 0000000..a016c4a --- /dev/null +++ b/aux-funcitons.ts @@ -0,0 +1,17 @@ +const calculateStartLine = (controlPosX: {[key: string]: number}): number => { + let vectorKeys = Object.keys(controlPosX); + let moreUsualValue: {key:string, value: number} = {key:'', value: 0}; + + vectorKeys.forEach(key => { + if(controlPosX[key] > moreUsualValue.value){ + moreUsualValue = { + key, + value: controlPosX[key] as number + } + } + }) + + return (Number(moreUsualValue.key)+2) +} + +export default calculateStartLine; \ No newline at end of file diff --git a/index.ts b/index.ts index 095fd9e..ce04a0e 100644 --- a/index.ts +++ b/index.ts @@ -1,4 +1,5 @@ import { pdfjs } from "react-pdf"; +import calculateStartLine from "./aux-funcitons"; // Path to the pdf.worker.js file pdfjs.GlobalWorkerOptions.workerSrc = `//cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.mjs`; @@ -42,4 +43,118 @@ const pdfToText = async (file: File | Blob | MediaSource): Promise => { return extractedText; }; -export default pdfToText; +/** + * Extracts text content from a PDF file like the original PDF. + * @param {File | Blob | MediaSource} file - The PDF file to extract text from. + * @returns {Promise} A promise that resolves with the extracted text content. 
+ */ +const pdfToTextLikePDF = async (file: File | Blob | MediaSource): Promise => { + // Create a blob URL for the PDF file + const blobUrl = URL.createObjectURL(file); + + // Load the PDF file + const loadingTask = pdfjs.getDocument(blobUrl); + + let extractedText = ""; + try { + const pdf = await loadingTask.promise; + const numPages = pdf.numPages; + + let lastPage = 1 + + let sizeFontProm = 10; + let totalSize = 0; + let totalTokens = 0; + + let controlPosX: {[key: string]: number} = {} + + //Scan the first few pages to sample the beginning of a line of text + for (let pageNumber = 1; pageNumber <= numPages && pageNumber <= 5; pageNumber++) { + const page = await pdf.getPage(pageNumber); + const textContent = await page.getTextContent(); + + textContent.items.map((item) => { + if("transform" in item){ + if(item.height > 3 && item.str){ + totalSize = totalSize + item.height; + totalTokens++ + } + + const name: string =( ((item.transform[4]).toString()).split ('.'))[0] + controlPosX[name] = (controlPosX[name] || 0) + 1 + } + }) + } + sizeFontProm = totalSize/totalTokens; + + const startLine = calculateStartLine(controlPosX); + + // Iterate through each page and extract text + for (let pageNumber = 1; pageNumber <= numPages; pageNumber++) { + const page = await pdf.getPage(pageNumber); + const textContent = await page.getTextContent(); + + let lastToken = '' + let lastLastPositionY = 0 + let lastPositionY = 0 + + const pageText = textContent.items + .map((item, index) => { + if("str" in item){ + lastToken = '' + + lastLastPositionY = lastPositionY + lastPositionY = item.transform[5] + + //Is end of Page? 
+ if(lastPage < pageNumber && pageNumber !== 1){ + lastToken = '\n', + lastPage++ + } + + //Is end of line + if(index > 1 && (lastLastPositionY-(sizeFontProm*1.6) > item.transform[5] )){ + lastToken = '\n' + } + + //Is new Line + if(lastLastPositionY-(sizeFontProm*1) > item.transform[5] + && item.transform[4] > startLine){ + lastToken = '\n' + } + + //Is a foot of page + if((index > 0 && lastLastPositionY < item.transform[5] )){ + lastToken = '\n' + } + + //Is a jump + if(lastPositionY === 0){ + lastToken = '\n' + } + return lastToken + item.str + }}) + .join(''); + extractedText += pageText; + } + } catch (error) { + throw new Error(`Failed to extract text from PDF: ${error}`); + } finally { + // Clean up the blob URL + URL.revokeObjectURL(blobUrl); + + // Free memory from loading task + loadingTask.destroy(); + } + return extractedText; +}; + +const selectModeToExtract = async (file: File | Blob | MediaSource, mode: 'simple' | 'advanced'): Promise => { + if (mode === 'simple') { + return pdfToText(file); + } else { + return pdfToTextLikePDF(file); + } +} + +export default selectModeToExtract; From 18b5e19a19c22d071db4eabd2187e26a1c38c6f8 Mon Sep 17 00:00:00 2001 From: Javier Bagatoli Date: Fri, 21 Nov 2025 17:51:51 -0300 Subject: [PATCH 2/4] add: Detect linespacing --- index.ts | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/index.ts b/index.ts index ce04a0e..c564583 100644 --- a/index.ts +++ b/index.ts @@ -44,11 +44,12 @@ const pdfToText = async (file: File | Blob | MediaSource): Promise => { }; /** - * Extracts text content from a PDF file like the original PDF. + * Extracts text content from a PDF file. * @param {File | Blob | MediaSource} file - The PDF file to extract text from. + * @param lineSpacing - is space inter line an line standart * @returns {Promise} A promise that resolves with the extracted text content. 
*/ -const pdfToTextLikePDF = async (file: File | Blob | MediaSource): Promise => { +const pdfToTextLikePDF = async (file: File | Blob | MediaSource, lineSpacing: number = 1): Promise => { // Create a blob URL for the PDF file const blobUrl = URL.createObjectURL(file); @@ -60,36 +61,34 @@ const pdfToTextLikePDF = async (file: File | Blob | MediaSource): Promise { - if("transform" in item){ + if("height" in item){ if(item.height > 3 && item.str){ - totalSize = totalSize + item.height; + totalFontSize = totalFontSize + item.height; totalTokens++ } - const name: string =( ((item.transform[4]).toString()).split ('.'))[0] + const name: string =(((item.transform[4]).toString()).split ('.'))[0] controlPosX[name] = (controlPosX[name] || 0) + 1 } }) } - sizeFontProm = totalSize/totalTokens; + sizeFontProm = totalFontSize/totalTokens; const startLine = calculateStartLine(controlPosX); - - // Iterate through each page and extract text + for (let pageNumber = 1; pageNumber <= numPages; pageNumber++) { const page = await pdf.getPage(pageNumber); const textContent = await page.getTextContent(); @@ -106,6 +105,10 @@ const pdfToTextLikePDF = async (file: File | Blob | MediaSource): Promise= 790){ + lastToken = '\n' + } //Is end of Page? 
if(lastPage < pageNumber && pageNumber !== 1){ lastToken = '\n', @@ -113,13 +116,14 @@ const pdfToTextLikePDF = async (file: File | Blob | MediaSource): Promise 1 && (lastLastPositionY-(sizeFontProm*1.6) > item.transform[5] )){ + if(index > 1 && (lastLastPositionY-(sizeFontProm*1.6*lineSpacing) > item.transform[5])){ lastToken = '\n' } - //Is new Line - if(lastLastPositionY-(sizeFontProm*1) > item.transform[5] - && item.transform[4] > startLine){ + //Is new parrafo + if(lastLastPositionY-(sizeFontProm*lineSpacing) > item.transform[5] + && item.transform[4] > startLine + ){ lastToken = '\n' } @@ -132,7 +136,8 @@ const pdfToTextLikePDF = async (file: File | Blob | MediaSource): Promise => { +const selectModeToExtract = async (file: File | Blob | MediaSource, mode: 'simple' | 'advanced', lineSpacing): Promise => { if (mode === 'simple') { return pdfToText(file); } else { - return pdfToTextLikePDF(file); + return pdfToTextLikePDF(file,lineSpacing); } } From 20462a66c9d5533ce1341fdade8e5c4a4bb8d879 Mon Sep 17 00:00:00 2001 From: Javier Bagatoli Date: Fri, 21 Nov 2025 18:03:11 -0300 Subject: [PATCH 3/4] update data for NPM --- .github/workflows/npm-publish.yml | 67 ------------------------------- LICENSE | 2 +- README.md | 59 +++++++++++++++++---------- index.ts | 2 +- package.json | 9 +++-- 5 files changed, 44 insertions(+), 95 deletions(-) delete mode 100644 .github/workflows/npm-publish.yml diff --git a/.github/workflows/npm-publish.yml b/.github/workflows/npm-publish.yml deleted file mode 100644 index 6b0b23a..0000000 --- a/.github/workflows/npm-publish.yml +++ /dev/null @@ -1,67 +0,0 @@ -name: Publish NPM package - -on: - push: - branches: - - main - -jobs: - publish: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Setup Node.js - uses: actions/setup-node@v2 - with: - node-version: '20' - registry-url: 'https://registry.npmjs.org' - - - name: Install dependencies - run: npm install - - - name: Check the version - 
id: check - run: | - CURRENT_VERSION=$(jq -r .version package.json) - echo "Current version: $CURRENT_VERSION" - LATEST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") - echo "Latest tag: $LATEST_TAG" - - LATEST_VERSION=${LATEST_TAG#v} - - if [ "$LATEST_VERSION" != "$CURRENT_VERSION" ]; - then - echo "Version changed" - echo "version_changed=true" >> $GITHUB_OUTPUT - echo "new_version=$CURRENT_VERSION" >> $GITHUB_OUTPUT - else - echo "Version not changed" - echo "version_changed=false" >> $GITHUB_OUTPUT - fi - - - name: Build - run: npm run build - if: steps.check.outputs.version_changed == 'true' - - - name: Publish - if: steps.check.outputs.version_changed == 'true' - run: npm publish --access public --no-git-checks - env: - NODE_AUTH_TOKEN: ${{ secrets.NPM_ACCESS_TOKEN }} - - - name: Tag release - if: steps.check.outputs.version_changed == 'true' - run: | - git config --local user.email "github-actions[bot]@users.noreply.github.com" - git config --local user.name "github-actions[bot]" - git tag -a "v${{ steps.check.outputs.new_version }}" -m "v${{ steps.check.outputs.new_version }}" - git push origin "v${{ steps.check.outputs.new_version }}" - - - - - - \ No newline at end of file diff --git a/LICENSE b/LICENSE index 6a97962..23b44a3 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2024 Utkarsh Pancholi +Copyright (c) 2025 Javier Bagatoli Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 953748e..87d0e20 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,15 @@ -# react-pdftotext +# react-pdftotext-advanced -Light-weight memory-safe client library for extracting plain text from pdf files. +This is a library based on "react-pdftotext" that aims to format text for readability without requiring extensive coding. 
+ +This version separates paragraph and page endings, taking into account expected spacing and page breaks. ## Installing Using npm: ```js -npm install react-pdftotext +npm install react-pdftotext-advanced ``` ## Example @@ -22,35 +24,48 @@ Now add a input tag with type="file" to take file input. Import the pdf2text function from package -```js -import pdfToText from "react-pdftotext"; +```ts +//simple mode +//input Base text +//Good morning everyone. +// +//How are you all? +// +//I hope you're well. +import selectModeToExtract from "react-pdftotext-advanced"; function extractText(event) { const file = event.target.files[0]; - pdfToText(file) + selectModeToExtract(file, 'simple') .then((text) => console.log(text)) .catch((error) => console.error("Failed to extract text from pdf")); } +//output Base text +// Good morning everyone.How are you all?I hope you're well. ``` -**Remote PDF File Input** - -For Pdf files stored at remote locations - -```js -import pdfToText from 'react-pdftotext' - -const pdf_url = "REMOTE_PDF_URL" +```ts +//Advanced mode +//input text +//Good morning everyone. +// +//How are you all? +// +//I hope you're well. +import selectModeToExtract from "react-pdftotext-advanced"; -function extractText() { - const file = await fetch(pdf_url) - .then(res => res.blob()) - .catch(error => console.error(error)) - - pdfToText(file) - .then(text => console.log(text)) - .catch(error => console.error("Failed to extract text from pdf")) +function extractText(event) { + const file = event.target.files[0]; + selectModeToExtract(file, 'advanced') + .then((text) => console.log(text)) + .catch((error) => console.error("Failed to extract text from pdf")); } +//output text +//Good morning everyone. +// +//How are you all? +// +//I hope you're well. 
``` ## Contributing diff --git a/index.ts b/index.ts index c564583..210c70d 100644 --- a/index.ts +++ b/index.ts @@ -154,7 +154,7 @@ const pdfToTextLikePDF = async (file: File | Blob | MediaSource, lineSpacing: nu return extractedText; }; -const selectModeToExtract = async (file: File | Blob | MediaSource, mode: 'simple' | 'advanced', lineSpacing): Promise => { +const selectModeToExtract = async (file: File | Blob | MediaSource, mode: 'simple' | 'advanced', lineSpacing: number = 1): Promise => { if (mode === 'simple') { return pdfToText(file); } else { diff --git a/package.json b/package.json index 31eb6b4..7b94bf7 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,5 @@ { - "name": "react-pdftotext", + "name": "react-pdftotext-advanced", "version": "1.3.4", "description": "A simple light weight react package to extract plain text from a pdf file.", "main": "dist/index.js", @@ -12,7 +12,7 @@ }, "repository": { "type": "git", - "url": "https://github.com/Utkarsh212/react-pdftotext.git" + "url": "https://github.com/JavierBagatoli/react-pdftotext-advanced.git" }, "keywords": [ "react-pdf", @@ -22,9 +22,10 @@ "react", "pdf2text", "pdfjs", - "pdf-to-text" + "pdf-to-text", + "typescript" ], - "author": "Utkarsh Pancholi", + "author": "Javier Bagatoli", "license": "MIT", "dependencies": { "pdfjs-dist": "^4.6.82", From 8c23a011e31de502b8a32dfdbf106b4b7c292ab1 Mon Sep 17 00:00:00 2001 From: Javier Bagatoli Date: Fri, 21 Nov 2025 18:17:35 -0300 Subject: [PATCH 4/4] fix: url --- package.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index 7b94bf7..6376f60 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "react-pdftotext-advanced", - "version": "1.3.4", + "version": "1.3.5", "description": "A simple light weight react package to extract plain text from a pdf file.", "main": "dist/index.js", "module": "dist/index.js", @@ -12,7 +12,7 @@ }, "repository": { "type": "git", - "url": 
"https://github.com/JavierBagatoli/react-pdftotext-advanced.git" + "url": "https://github.com/JavierBagatoli/react-pdftotext.git" }, "keywords": [ "react-pdf",