Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 0 additions & 67 deletions .github/workflows/npm-publish.yml

This file was deleted.

2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2024 Utkarsh Pancholi
Copyright (c) 2025 Javier Bagatoli

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
59 changes: 37 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
# react-pdftotext
# react-pdftotext-advanced

Light-weight memory-safe client library for extracting plain text from pdf files.
This is a library based on "react-pdftotext" that aims to format text for readability without requiring extensive coding.

This version separates paragraph and page endings, taking into account expected spacing and page breaks.

## Installing

Using npm:

```js
npm install react-pdftotext
npm install react-pdftotext-advanced
```

## Example
Expand All @@ -22,35 +24,48 @@ Now add a input tag with type="file" to take file input.

Import the text-extraction function from the package:

```js
import pdfToText from "react-pdftotext";
```ts
//simple mode
//input Base text
//Good morning everyone.
//
//How are you all?
//
//I hope you're well.
import selectModeToExtract from "react-pdftotext-advanced";

function extractText(event) {
const file = event.target.files[0];
pdfToText(file)
selectModeToExtract(file, 'simple')
.then((text) => console.log(text))
.catch((error) => console.error("Failed to extract text from pdf"));
}
//output Base text
// Good morning everyone.How are you all?I hope you're well.
```

**Remote PDF File Input**

For Pdf files stored at remote locations

```js
import pdfToText from 'react-pdftotext'

const pdf_url = "REMOTE_PDF_URL"
```ts
//Advanced mode
//input text
//Good morning everyone.
//
//How are you all?
//
//I hope you're well.
import selectModeToExtract from "react-pdftotext-advanced";

function extractText() {
const file = await fetch(pdf_url)
.then(res => res.blob())
.catch(error => console.error(error))

pdfToText(file)
.then(text => console.log(text))
.catch(error => console.error("Failed to extract text from pdf"))
function extractText(event) {
const file = event.target.files[0];
selectModeToExtract(file, 'advanced')
.then((text) => console.log(text))
.catch((error) => console.error("Failed to extract text from pdf"));
}
//output text
//Good morning everyone.
//
//How are you all?
//
//I hope you're well.
```

## Contributing
Expand Down
17 changes: 17 additions & 0 deletions aux-funcitons.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/**
 * Picks the most frequent horizontal position from a histogram and returns it
 * shifted two units to the right.
 *
 * @param controlPosX - Histogram mapping a position (as a string key) to the
 *   number of times it was seen.
 * @returns The numeric value of the key with the highest count, plus 2. On a
 *   tie the first key in `Object.entries` order wins; an empty histogram
 *   yields 2.
 */
const calculateStartLine = (controlPosX: {[key: string]: number}): number => {
  let dominantKey = '';
  let dominantCount = 0;

  for (const [positionKey, occurrences] of Object.entries(controlPosX)) {
    // Strictly greater: an equal count never replaces the current leader.
    if (occurrences > dominantCount) {
      dominantKey = positionKey;
      dominantCount = occurrences;
    }
  }

  return Number(dominantKey) + 2;
}

export default calculateStartLine;
122 changes: 121 additions & 1 deletion index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { pdfjs } from "react-pdf";
import calculateStartLine from "./aux-funcitons";

// Path to the pdf.worker.js file
pdfjs.GlobalWorkerOptions.workerSrc = `//cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.mjs`;
Expand Down Expand Up @@ -42,4 +43,123 @@ const pdfToText = async (file: File | Blob | MediaSource): Promise<string> => {
return extractedText;
};

export default pdfToText;
/**
 * Extracts text from a PDF file, inserting line breaks where the position of
 * the text items suggests a new line, paragraph, page, header or footer.
 *
 * Works in two passes:
 *  1. Samples up to the first 5 pages to compute the average item height
 *     (used as the font size) and a histogram of integer x positions, from
 *     which the usual left margin is derived via `calculateStartLine`.
 *  2. Walks every page and prefixes each text item with `'\n'` when one of
 *     the layout heuristics below fires.
 *
 * @param file - The PDF file to extract text from.
 * @param lineSpacing - Multiplier applied to the vertical-gap thresholds used
 *   to detect line and paragraph breaks. Defaults to 1.
 * @returns A promise that resolves with the extracted text content.
 * @throws Error if the document cannot be loaded or its text cannot be read.
 */
const pdfToTextLikePDF = async (file: File | Blob | MediaSource, lineSpacing: number = 1): Promise<string> => {
  // Create a blob URL for the PDF file
  const blobUrl = URL.createObjectURL(file);

  // Load the PDF file
  const loadingTask = pdfjs.getDocument(blobUrl);

  let extractedText = "";
  try {
    const pdf = await loadingTask.promise;
    const numPages = pdf.numPages;

    // Last page number for which a page break has already been emitted.
    let lastPage = 1;

    // Average font size; stays at this default when no item can be measured.
    let sizeFontProm = 10;
    let totalFontSize = 0;
    let totalTokens = 0;

    // Histogram: integer part of an item's x position -> number of occurrences.
    const controlPosX: {[key: string]: number} = {};

    // Pass 1: sample at most the first 5 pages to gather layout statistics.
    for (let pageNumber = 1; pageNumber <= numPages && pageNumber <= 5; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const textContent = await page.getTextContent();

      textContent.items.forEach((item) => {
        if ("height" in item) {
          // Only non-empty items taller than 3 units count towards the average.
          if (item.height > 3 && item.str) {
            totalFontSize = totalFontSize + item.height;
            totalTokens++;
          }

          const name: string = (((item.transform[4]).toString()).split('.'))[0];
          controlPosX[name] = (controlPosX[name] || 0) + 1;
        }
      });
    }

    // Guard against 0/0: a NaN average would make every threshold comparison
    // below false and silently disable all line-break detection.
    if (totalTokens > 0) {
      sizeFontProm = totalFontSize / totalTokens;
    }

    const startLine = calculateStartLine(controlPosX);

    // Pass 2: build the text of every page.
    for (let pageNumber = 1; pageNumber <= numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const textContent = await page.getTextContent();

      // y position of the previous text item and of the current one.
      let lastLastPositionY = 0;
      let lastPositionY = 0;

      const pageText = textContent.items
        .map((item, index) => {
          if (!("str" in item)) {
            // Items without text contribute nothing to the output.
            return '';
          }

          let lastToken = '';

          lastLastPositionY = lastPositionY;
          lastPositionY = item.transform[5];

          // Header: the previous item sat at y >= 790
          // (presumably tuned for A4-sized pages - TODO confirm).
          if (lastLastPositionY >= 790) {
            lastToken = '\n';
          }

          // First text item of a new page. Assign rather than increment so
          // that pages without text items cannot leave the counter lagging
          // and trigger spurious breaks on later items.
          if (lastPage < pageNumber) {
            lastToken = '\n';
            lastPage = pageNumber;
          }

          // End of line: the item sits more than 1.6 line heights below the previous one.
          if (index > 1 && (lastLastPositionY - (sizeFontProm * 1.6 * lineSpacing) > item.transform[5])) {
            lastToken = '\n';
          }

          // New paragraph: the item is at least one line height lower and
          // starts to the right of the usual left margin.
          if (lastLastPositionY - (sizeFontProm * lineSpacing) > item.transform[5]
            && item.transform[4] > startLine
          ) {
            lastToken = '\n';
          }

          // Footer: the item sits higher on the page than the previous one.
          if (index > 0 && lastLastPositionY < item.transform[5]) {
            lastToken = '\n';
          }

          // Jump: the current item has a y position of exactly 0.
          if (lastPositionY === 0) {
            lastToken = '\n';
          }

          // An empty string item is emitted as a single space.
          return lastToken + (item.str === '' ? ' ' : '') + item.str;
        })
        .join('');
      extractedText += pageText;
    }
  } catch (error) {
    throw new Error(`Failed to extract text from PDF: ${error}`);
  } finally {
    // Clean up the blob URL
    URL.revokeObjectURL(blobUrl);

    // Free memory from loading task
    loadingTask.destroy();
  }
  return extractedText;
};

/**
 * Dispatches to one of the two extraction strategies.
 *
 * @param file - The PDF file to extract text from.
 * @param mode - `'simple'` delegates to `pdfToText`; any other value
 *   delegates to `pdfToTextLikePDF`.
 * @param lineSpacing - Forwarded to `pdfToTextLikePDF`; unused in simple mode.
 * @returns A promise that resolves with the extracted text content.
 */
const selectModeToExtract = async (file: File | Blob | MediaSource, mode: 'simple' | 'advanced', lineSpacing: number = 1): Promise<string> => {
  const extraction = mode === 'simple'
    ? pdfToText(file)
    : pdfToTextLikePDF(file, lineSpacing);
  return extraction;
}

export default selectModeToExtract;
11 changes: 6 additions & 5 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "react-pdftotext",
"version": "1.3.4",
"name": "react-pdftotext-advanced",
"version": "1.3.5",
"description": "A simple light weight react package to extract plain text from a pdf file.",
"main": "dist/index.js",
"module": "dist/index.js",
Expand All @@ -12,7 +12,7 @@
},
"repository": {
"type": "git",
"url": "https://github.com/Utkarsh212/react-pdftotext.git"
"url": "https://github.com/JavierBagatoli/react-pdftotext.git"
},
"keywords": [
"react-pdf",
Expand All @@ -22,9 +22,10 @@
"react",
"pdf2text",
"pdfjs",
"pdf-to-text"
"pdf-to-text",
"typescript"
],
"author": "Utkarsh Pancholi",
"author": "Javier Bagatoli",
"license": "MIT",
"dependencies": {
"pdfjs-dist": "^4.6.82",
Expand Down