diff --git a/.gitignore b/.gitignore index d2f206a..6ef5962 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ __pycache__ venv .env .vscode +.DS_Store \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/aws.xml b/.idea/aws.xml new file mode 100644 index 0000000..5cc5d17 --- /dev/null +++ b/.idea/aws.xml @@ -0,0 +1,11 @@ + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..0c8f81e --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..85ed81a --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index aea76bb..b6b13b8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,19 +1,39 @@ FROM python:3.8.13-slim-buster +# Set working directory WORKDIR /code +# Install dependencies & clean up cache +RUN apt-get update && apt-get install -y logrotate && rm -rf /var/lib/apt/lists/* + +# Create a virtual environment RUN python -m venv venv ENV PATH="venv/bin:$PATH" -ENV GOTENBERG_API_URL=http://host.docker.internal:3000 -COPY ./requirements.txt /code/requirements.txt +# Set environment variable for Gotenberg +ENV GOTENBERG_API_URL=http://gotenberg:3000 -RUN pip install --no-warn-script-location \ - --no-cache-dir --upgrade -r /code/requirements.txt +# Copy and install dependencies +COPY ./requirements.txt /code/requirements.txt +RUN pip install --no-warn-script-location --no-cache-dir --upgrade -r /code/requirements.txt +# Copy application files COPY 
. /code -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] +# Set up log rotation +RUN echo "/code/temp/*.log { \n\ + daily \n\ + rotate 7 \n\ + compress \n\ + missingok \n\ + notifempty \n\ + }" > /etc/logrotate.d/app_logs + +# Automatically clean up old temp files on container start +RUN echo '#!/bin/sh\nrm -rf /code/temp/*' > /cleanup.sh && chmod +x /cleanup.sh + +# Expose port +EXPOSE 4532 -# If running behind a proxy like Nginx or Traefik add --proxy-headers -# CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--proxy-headers"] +# Start the application +CMD ["sh", "-c", "/cleanup.sh && exec venv/bin/uvicorn main:app --host 0.0.0.0 --port 4532"] diff --git a/document-templating-service.iml b/document-templating-service.iml new file mode 100644 index 0000000..fe1e22b --- /dev/null +++ b/document-templating-service.iml @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/docx-template/DSP-loan-kit-template-with-variables.docx b/docx-template/DSP-loan-kit-template-with-variables.docx new file mode 100644 index 0000000..169ccca Binary files /dev/null and b/docx-template/DSP-loan-kit-template-with-variables.docx differ diff --git a/docx-template/Final_Agreement_09th_Sept.docx b/docx-template/Final_Agreement_09th_Sept.docx new file mode 100644 index 0000000..eb7350e Binary files /dev/null and b/docx-template/Final_Agreement_09th_Sept.docx differ diff --git a/docx-template/Final_Agreement_11th_Feb_Alpha.docx b/docx-template/Final_Agreement_11th_Feb_Alpha.docx new file mode 100644 index 0000000..ab924a2 Binary files /dev/null and b/docx-template/Final_Agreement_11th_Feb_Alpha.docx differ diff --git a/docx-template/Final_Agreement_11th_Feb_Prod.docx b/docx-template/Final_Agreement_11th_Feb_Prod.docx new file mode 100644 index 0000000..affdebb Binary files /dev/null and b/docx-template/Final_Agreement_11th_Feb_Prod.docx differ diff --git a/docx-template/Final_Agreement_12th_June_Alpha.docx 
b/docx-template/Final_Agreement_12th_June_Alpha.docx new file mode 100644 index 0000000..17c6ac1 Binary files /dev/null and b/docx-template/Final_Agreement_12th_June_Alpha.docx differ diff --git a/docx-template/Final_Agreement_15th_July_Alpha.docx b/docx-template/Final_Agreement_15th_July_Alpha.docx new file mode 100644 index 0000000..bd8a482 Binary files /dev/null and b/docx-template/Final_Agreement_15th_July_Alpha.docx differ diff --git a/docx-template/Final_Agreement_3rd_January.docx b/docx-template/Final_Agreement_3rd_January.docx new file mode 100644 index 0000000..1cfb952 Binary files /dev/null and b/docx-template/Final_Agreement_3rd_January.docx differ diff --git a/docx-template/Final_Agreement_7th_July_Alpha.docx b/docx-template/Final_Agreement_7th_July_Alpha.docx new file mode 100644 index 0000000..6dbdb0b Binary files /dev/null and b/docx-template/Final_Agreement_7th_July_Alpha.docx differ diff --git a/docx-template/Final_Agreement_9th_December.docx b/docx-template/Final_Agreement_9th_December.docx new file mode 100644 index 0000000..a376683 Binary files /dev/null and b/docx-template/Final_Agreement_9th_December.docx differ diff --git a/docx-template/Final_Agreement_9th_January.docx b/docx-template/Final_Agreement_9th_January.docx new file mode 100644 index 0000000..0f133bf Binary files /dev/null and b/docx-template/Final_Agreement_9th_January.docx differ diff --git a/docx-template/Final_Agreement_TermLoan_31_July_Alpha.docx b/docx-template/Final_Agreement_TermLoan_31_July_Alpha.docx new file mode 100644 index 0000000..b0f827a Binary files /dev/null and b/docx-template/Final_Agreement_TermLoan_31_July_Alpha.docx differ diff --git a/docx-template/Final_KFS_Template_09th_Sept.docx b/docx-template/Final_KFS_Template_09th_Sept.docx new file mode 100644 index 0000000..21ca412 Binary files /dev/null and b/docx-template/Final_KFS_Template_09th_Sept.docx differ diff --git a/docx-template/Final_KFS_Template_16th_July_Alpha.docx 
b/docx-template/Final_KFS_Template_16th_July_Alpha.docx new file mode 100644 index 0000000..54f9e43 Binary files /dev/null and b/docx-template/Final_KFS_Template_16th_July_Alpha.docx differ diff --git a/docx-template/KFS_TermLoan-test.docx b/docx-template/KFS_TermLoan-test.docx new file mode 100644 index 0000000..e275a36 Binary files /dev/null and b/docx-template/KFS_TermLoan-test.docx differ diff --git a/docx-template/KFS_TermLoan_31_July.docx b/docx-template/KFS_TermLoan_31_July.docx new file mode 100644 index 0000000..c1d08db Binary files /dev/null and b/docx-template/KFS_TermLoan_31_July.docx differ diff --git a/docx-template/KFS_TermLoan_31st_July.docx b/docx-template/KFS_TermLoan_31st_July.docx new file mode 100644 index 0000000..e275a36 Binary files /dev/null and b/docx-template/KFS_TermLoan_31st_July.docx differ diff --git a/docx-template/KFS_template_Document.docx b/docx-template/KFS_template_Document.docx new file mode 100644 index 0000000..de3981a Binary files /dev/null and b/docx-template/KFS_template_Document.docx differ diff --git a/docx-template/KFS_template_Document_12th_June.docx b/docx-template/KFS_template_Document_12th_June.docx new file mode 100644 index 0000000..374781e Binary files /dev/null and b/docx-template/KFS_template_Document_12th_June.docx differ diff --git a/docx-template/Sample_Testing.docx b/docx-template/Sample_Testing.docx new file mode 100644 index 0000000..8a91d0e Binary files /dev/null and b/docx-template/Sample_Testing.docx differ diff --git a/docx-template/Tata_Agreement_Template.docx b/docx-template/Tata_Agreement_Template.docx new file mode 100644 index 0000000..3cfef3c Binary files /dev/null and b/docx-template/Tata_Agreement_Template.docx differ diff --git a/docx-template/Tata_Agreement_Template_v2.docx b/docx-template/Tata_Agreement_Template_v2.docx new file mode 100644 index 0000000..6eae1c7 Binary files /dev/null and b/docx-template/Tata_Agreement_Template_v2.docx differ diff --git 
a/docx-template/Tata_App_Form_Template.docx b/docx-template/Tata_App_Form_Template.docx new file mode 100644 index 0000000..56c2545 Binary files /dev/null and b/docx-template/Tata_App_Form_Template.docx differ diff --git a/docx-template/Tata_Enhancement_Agreement_Template.docx b/docx-template/Tata_Enhancement_Agreement_Template.docx new file mode 100644 index 0000000..67cafb8 Binary files /dev/null and b/docx-template/Tata_Enhancement_Agreement_Template.docx differ diff --git a/docx-template/Tata_Enhancement_Agreement_Template_v2.docx b/docx-template/Tata_Enhancement_Agreement_Template_v2.docx new file mode 100644 index 0000000..1c07f06 Binary files /dev/null and b/docx-template/Tata_Enhancement_Agreement_Template_v2.docx differ diff --git a/docx-template/Tata_KFS_Review_Template.docx b/docx-template/Tata_KFS_Review_Template.docx new file mode 100644 index 0000000..17771c2 Binary files /dev/null and b/docx-template/Tata_KFS_Review_Template.docx differ diff --git a/docx-template/Tata_KFS_Review_Template_v2.docx b/docx-template/Tata_KFS_Review_Template_v2.docx new file mode 100644 index 0000000..aa8f27a Binary files /dev/null and b/docx-template/Tata_KFS_Review_Template_v2.docx differ diff --git a/docx-template/UpdatedAgreementTemplate.docx b/docx-template/UpdatedAgreementTemplate.docx new file mode 100644 index 0000000..56f4c09 Binary files /dev/null and b/docx-template/UpdatedAgreementTemplate.docx differ diff --git a/docx-template/tata_posidex_report.docx b/docx-template/tata_posidex_report.docx new file mode 100644 index 0000000..e0a4970 Binary files /dev/null and b/docx-template/tata_posidex_report.docx differ diff --git a/docx-template/tata_posidex_report_v2.docx b/docx-template/tata_posidex_report_v2.docx new file mode 100644 index 0000000..bb6b8d4 Binary files /dev/null and b/docx-template/tata_posidex_report_v2.docx differ diff --git a/main.py b/main.py index f068745..6313d91 100644 --- a/main.py +++ b/main.py @@ -1,10 +1,22 @@ +import base64 from fastapi 
import Body, FastAPI, File, UploadFile from fastapi.responses import FileResponse, JSONResponse from pydantic import Json -from docxtpl import DocxTemplate +from docxtpl import DocxTemplate, InlineImage +from docx.shared import Inches +from io import BytesIO +from typing import Any, Dict import aiofiles from utils import remove_temporary_files, get_env import requests +import uuid +import io +import os +import logging + +# Configure logging +logging.basicConfig(level=logging.INFO) # Set to DEBUG for more detailed logs +logger = logging.getLogger(__name__) app = FastAPI( title="Document Template Processing Service", @@ -45,4 +57,352 @@ async def process_document_template(data: Json = Body(...), file: UploadFile = F response = requests.post(url=resourceURL, files={'file': open(file_path, 'rb')}) async with aiofiles.open(pdf_file_path, 'wb') as out_file: await out_file.write(response.content) - return FileResponse(pdf_file_path, media_type='application/pdf') \ No newline at end of file + return FileResponse(pdf_file_path, media_type='application/pdf') + +# Define a custom temporary folder to store uploaded files +UPLOAD_FOLDER = 'temp/' +os.makedirs(UPLOAD_FOLDER, exist_ok=True) + +# Only allow .docx files +ALLOWED_EXTENSIONS = {'docx'} + +def allowed_file(filename): + return '.' 
# NOTE: reconstructed from a whitespace-collapsed diff of main.py.  Everything
# below relies on names set up earlier in main.py: app, logger, UPLOAD_FOLDER,
# ALLOWED_EXTENSIONS, get_env, aiofiles, requests, DocxTemplate, InlineImage,
# Inches, BytesIO, JSONResponse.

_SECTIONS_DIR = 'docx-sections'
_TEMPLATES_DIR = 'docx-template'
_DOCX_MIME_TYPE = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
# Upper bound (seconds) for one Gotenberg conversion; without it a stalled
# converter would hang the request forever.
_GOTENBERG_TIMEOUT_SECONDS = 120
# Cap on concurrent section conversions (same limit the original code used).
_MAX_SECTION_WORKERS = 6


class _SectionError(Exception):
    """Section-pipeline failure carrying the HTTP status the endpoint should return."""

    def __init__(self, message, status_code=500):
        super().__init__(message)
        self.status_code = status_code


def allowed_file(filename):
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS.

    The comparison is case-insensitive.  A missing or empty name (``None`` or
    ``''``) is rejected instead of raising ``TypeError``.
    """
    if not filename or '.' not in filename:
        return False
    return filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


def _resolve_within(base_dir, name):
    """Return the real path of *name* inside *base_dir*, or None if it escapes it.

    Rejects non-strings, empty names, embedded NULs, absolute paths and any
    ``..`` / symlink combination that resolves outside *base_dir*.
    """
    if not isinstance(name, str) or not name or '\x00' in name:
        return None
    base = os.path.realpath(base_dir)
    candidate = os.path.realpath(os.path.join(base, name))
    try:
        inside = os.path.commonpath([base, candidate]) == base
    except ValueError:  # e.g. paths on different drives
        return None
    return candidate if inside and candidate != base else None


def _section_sort_key(filename):
    """Return the leading number of ``"<n>_name.ext"`` as an int, or 0 when absent.

    Numeric ordering makes ``10_x`` sort after ``2_x``; names without a numeric
    prefix sort first.
    """
    prefix, separator, _ = filename.partition('_')
    return int(prefix) if separator and prefix.isdecimal() else 0


def _render_docx(template_path, data, include_image):
    """Render the DOCX template at *template_path* and return it as a BytesIO.

    ``data['data']`` supplies the template context.  When *include_image* is
    true and ``data['image']['content']`` holds a base64 image, it is exposed to
    the template as ``image_placeholder`` (width/height in inches, default 2).
    The returned stream is positioned at 0; the caller closes it.
    """
    document = DocxTemplate(template_path)
    # Copy the context: an InlineImage is bound to one document, so it must not
    # leak into a dict that other sections render from concurrently.
    context = dict(data['data'])

    image_info = data.get('image') if include_image else None
    if image_info:
        encoded_image = image_info.get('content')
        if encoded_image:
            context["image_placeholder"] = InlineImage(
                document,
                BytesIO(base64.b64decode(encoded_image)),
                width=Inches(image_info.get('width', 2)),
                height=Inches(image_info.get('height', 2)),
            )

    document.render(context)
    rendered = BytesIO()
    document.save(rendered)
    rendered.seek(0)  # Rewind so the upload reads from the start.
    return rendered


def _convert_docx_to_pdf(docx_stream, upload_name):
    """Send *docx_stream* to Gotenberg's LibreOffice route and return the response body.

    Raises ``requests.exceptions.RequestException`` on transport errors, on a
    timeout, or on a non-2xx response.  The body may be empty; callers check.
    """
    convert_url = f"{get_env('GOTENBERG_API_URL')}/forms/libreoffice/convert"
    response = requests.post(
        url=convert_url,
        files={'file': (upload_name, docx_stream, _DOCX_MIME_TYPE)},
        timeout=_GOTENBERG_TIMEOUT_SECONDS,
    )
    response.raise_for_status()
    return response.content


def _render_dynamic_sections(folder_path, dynamic_files, data):
    """Render and convert every dynamic section concurrently.

    Returns ``(pdfs, failed)``: *pdfs* maps file name to PDF bytes, *failed*
    lists the sections that raised or converted to empty output.  Results are
    collected in the calling thread, so no lock is needed.
    """
    from concurrent.futures import ThreadPoolExecutor

    def render_one(index, docx_file):
        # Only the first dynamic section receives the optional image.
        rendered = _render_docx(os.path.join(folder_path, docx_file), data, index == 0)
        try:
            return _convert_docx_to_pdf(rendered, f'dynamic_{index}.docx')
        finally:
            rendered.close()

    pdfs = {}
    failed = []
    if not dynamic_files:
        return pdfs, failed

    worker_count = min(len(dynamic_files), _MAX_SECTION_WORKERS)
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        futures = {
            docx_file: executor.submit(render_one, index, docx_file)
            for index, docx_file in enumerate(dynamic_files)
        }
        for docx_file, future in futures.items():
            try:
                content = future.result()
            except Exception:
                logger.exception("Failed to process dynamic section %s", docx_file)
                failed.append(docx_file)
                continue
            if content:
                pdfs[docx_file] = content
            else:
                logger.error("Dynamic section %s converted to an empty PDF", docx_file)
                failed.append(docx_file)
    return pdfs, failed


def _merge_pdfs(pdf_blobs):
    """Concatenate *pdf_blobs* (PDF byte strings, in order) into one PDF and return its bytes."""
    from PyPDF2 import PdfMerger

    merger = PdfMerger()
    streams = []
    try:
        for blob in pdf_blobs:
            stream = BytesIO(blob)
            streams.append(stream)
            merger.append(stream)
        with BytesIO() as merged:
            merger.write(merged)
            return merged.getvalue()
    finally:
        merger.close()
        for stream in streams:
            stream.close()


def _build_sections_pdf(folder_path, folder_name, data):
    """Blocking pipeline: render dynamic sections, load static PDFs, merge in order.

    A folder holds ``<n>_dynamic_*.docx`` templates (rendered with the request
    data) and ``<n>_static_*.pdf`` files (used as-is).  Sections are merged by
    their numeric prefix; on equal numbers dynamic sections come first.

    Raises ``_SectionError`` when the folder has no DOCX files, when any section
    fails, or when merging fails.  A partial document is never returned as a
    success, because a silently truncated agreement is worse than an error.
    """
    # Sort the listing first so ties on the numeric prefix resolve
    # deterministically (os.listdir order is arbitrary).
    entries = sorted(os.listdir(folder_path))
    docx_files = sorted((f for f in entries if f.endswith('.docx')), key=_section_sort_key)
    pdf_files = sorted((f for f in entries if f.endswith('.pdf')), key=_section_sort_key)

    if not docx_files:
        raise _SectionError(f'No DOCX files found in folder: {folder_name}', 404)

    dynamic_files = [f for f in docx_files if '_dynamic_' in f]
    static_files = [f for f in pdf_files if '_static_' in f]

    # sorted() is stable, so dynamic sections precede static ones on equal numbers.
    merge_order = sorted(
        [('dynamic', name) for name in dynamic_files]
        + [('static', name) for name in static_files],
        key=lambda item: _section_sort_key(item[1]),
    )

    dynamic_pdfs, failed = _render_dynamic_sections(folder_path, dynamic_files, data)

    static_pdfs = {}
    for name in static_files:
        try:
            with open(os.path.join(folder_path, name), 'rb') as pdf_file:
                static_pdfs[name] = pdf_file.read()
        except OSError:
            logger.exception("Failed to read static section %s", name)
            failed.append(name)

    if failed:
        raise _SectionError(f"Failed to process sections: {', '.join(failed)}")

    pdf_blobs = [
        dynamic_pdfs[name] if kind == 'dynamic' else static_pdfs[name]
        for kind, name in merge_order
    ]
    if not pdf_blobs:
        raise _SectionError('No PDFs were successfully processed')

    try:
        return _merge_pdfs(pdf_blobs)
    except Exception as exc:
        logger.exception("Failed to merge section PDFs for folder %s", folder_name)
        raise _SectionError(f"Error merging PDFs: {exc}") from exc


# NOTE: several route handlers in this module share the name
# process_document_template.  Each decorator receives its function before the
# name is rebound; the names are kept unchanged here.
@app.post('/api/v1/process-template-document/upload-file')
async def process_document_template(file: UploadFile = File(...)):
    """Store an uploaded ``.docx`` file under UPLOAD_FOLDER.

    Returns a JSON body with the stored path, or a 400 response when the name
    is missing, is not a ``.docx`` file, or is otherwise unusable.
    """
    # Keep only the final path component so a client-supplied name such as
    # "../../main.docx" cannot write outside UPLOAD_FOLDER.
    safe_name = os.path.basename((file.filename or '').replace('\\', '/'))
    if '\x00' in safe_name or not allowed_file(safe_name):
        return JSONResponse(content={"error": "Only .docx files are allowed"}, status_code=400)

    file_location = os.path.join(UPLOAD_FOLDER, safe_name)

    # aiofiles keeps the disk write off the event loop.
    async with aiofiles.open(file_location, "wb") as buffer:
        await buffer.write(await file.read())

    return {"message": "File uploaded successfully", "file_path": file_location}


async def process_parallel_sections(data: Dict[str, Any]):
    """Build one PDF from every section in ``docx-sections/<folderName>``.

    Expected keys in *data*: ``folderName`` (section folder), ``data`` (template
    context dict) and optionally ``image`` (``content`` as base64, plus
    ``width`` / ``height`` in inches).

    Returns a JSONResponse: ``{'status': 'success', 'pdf_base64': ...}`` on
    success, otherwise ``{'status': 'error', 'message': ...}`` with status 400
    (bad input), 404 (folder or DOCX files missing) or 500 (processing failure).
    """
    import asyncio

    for dir_path in ('temp', _SECTIONS_DIR):
        os.makedirs(dir_path, exist_ok=True)

    folder_name = data.get('folderName')
    folder_path = _resolve_within(_SECTIONS_DIR, folder_name)
    if folder_path is None:
        return JSONResponse({'status': 'error', 'message': 'Invalid folderName'}, status_code=400)
    if not isinstance(data.get('data'), dict):
        return JSONResponse({'status': 'error', 'message': 'data is required'}, status_code=400)
    if not os.path.isdir(folder_path):
        return JSONResponse({'status': 'error', 'message': f'Section folder not found: {folder_name}'}, status_code=404)

    try:
        # Rendering and HTTP conversion are blocking; run them off the event loop.
        merged_pdf = await asyncio.get_running_loop().run_in_executor(
            None, _build_sections_pdf, folder_path, folder_name, data
        )
    except _SectionError as exc:
        return JSONResponse({'status': 'error', 'message': str(exc)}, status_code=exc.status_code)
    except Exception as e:
        logger.exception("Unexpected error while processing folder %s", folder_name)
        return JSONResponse({'status': 'error', 'message': f"Error in parallel processing: {str(e)}"}, status_code=500)

    return JSONResponse({
        'status': 'success',
        'pdf_base64': base64.b64encode(merged_pdf).decode('utf-8')
    })


@app.post('/api/v1/process-template-document/docx-to-pdf')
async def process_document_template(data: Dict[str, Any] = Body(...)):
    """Render a DOCX template with the request data and return it as a base64 PDF.

    With ``folderName`` set and ``fileName`` empty or absent, the request is
    delegated to ``process_parallel_sections``.  Otherwise ``fileName`` names a
    template inside ``docx-template/`` and ``data`` is its context; ``image``
    is optional (``content`` as base64, ``width`` / ``height`` in inches).
    """
    import asyncio

    file_name = data.get('fileName') if data else None

    if data and data.get('folderName') and not file_name:
        return await process_parallel_sections(data)

    if not file_name or not isinstance(data.get('data'), dict):
        return JSONResponse({'status': 'error', 'message': 'fileName and data are required'}, status_code=400)

    temp_dir = 'temp'
    os.makedirs(temp_dir, exist_ok=True)
    logger.debug("Current contents of the temp directory: %s", os.listdir(temp_dir))

    # fileName comes from the request body; refuse anything that resolves
    # outside the template directory.
    file_path = _resolve_within(_TEMPLATES_DIR, file_name)
    if file_path is None:
        return JSONResponse({'status': 'error', 'message': 'Invalid fileName'}, status_code=400)
    if not os.path.isfile(file_path):
        return JSONResponse({'status': 'error', 'message': f'Template not found: {file_name}'}, status_code=404)

    loop = asyncio.get_running_loop()

    # Load and render the document (blocking, so run it in the default executor).
    try:
        output_stream = await loop.run_in_executor(None, _render_docx, file_path, data, True)
    except Exception as e:
        logger.exception("Failed to render template %s", file_name)
        return JSONResponse({'status': 'error', 'message': f"Error rendering or saving docx: {str(e)}"}, status_code=500)

    # Convert to PDF
    try:
        pdf_content = await loop.run_in_executor(
            None, _convert_docx_to_pdf, output_stream, 'modified.docx'
        )
    except requests.exceptions.RequestException as e:
        logger.exception("PDF conversion failed for template %s", file_name)
        return JSONResponse({'status': 'error', 'message': f"PDF conversion failed: {str(e)}"}, status_code=500)
    finally:
        output_stream.close()

    if not pdf_content:
        return JSONResponse({'status': 'error', 'message': 'PDF conversion returned empty content'}, status_code=500)

    pdf_base64 = base64.b64encode(pdf_content).decode('utf-8')
    return JSONResponse({'status': 'success', 'pdf_base64': pdf_base64})


# Diff metadata retained from the original paste:
# diff --git a/temp/__init__.py b/temp/__init__.py (old mode 100644, new mode 100755)