diff --git a/infra/scripts/Process-Sample-Data.ps1 b/infra/scripts/Process-Sample-Data.ps1
index 6597e344..64122878 100644
--- a/infra/scripts/Process-Sample-Data.ps1
+++ b/infra/scripts/Process-Sample-Data.ps1
@@ -119,10 +119,13 @@ if ($ResourceGroup) {
 }
 
 # Upload CSV files
-Write-Host "Uploading CSV files to blob storage..."
+Write-Host "Uploading CSV and JSON files to blob storage..."
 az storage blob upload-batch --account-name $StorageAccount --destination $BlobContainer --source "data/datasets" --auth-mode login --pattern "*.csv" --overwrite --output none
 if ($LASTEXITCODE -ne 0) { Write-Host "Error: Failed to upload CSV files."; exit 1 }
-Write-Host "CSV files uploaded successfully."
+# $LASTEXITCODE only reflects the most recent az call, so each upload gets its own check.
+az storage blob upload-batch --account-name $StorageAccount --destination $BlobContainer --source "data/datasets" --auth-mode login --pattern "*.json" --overwrite --output none
+if ($LASTEXITCODE -ne 0) { Write-Host "Error: Failed to upload JSON files."; exit 1 }
+Write-Host "CSV and JSON files uploaded successfully."
 
 # Upload PDF files
 Write-Host "Uploading PDF files from RFP_dataset to blob storage..."
@@ -180,21 +183,29 @@ Write-Host "Installing requirements"
 pip install --quiet -r infra/scripts/requirements.txt
 Write-Host "Requirements installed"
 
-# Run indexing scripts
-if ($hasCsv) {
-    Write-Host "Running the python script to index CSV data"
-    & $pythonCmd "infra/scripts/index_datasets.py" $StorageAccount $BlobContainer $AiSearch $AiSearchIndex
-    if ($LASTEXITCODE -ne 0) { Write-Host "Error: CSV indexing script failed."; exit 1 }
-}
-if ($hasPdf) {
-    Write-Host "Running the python script to index PDF data"
-    & $pythonCmd "infra/scripts/index_rfp_data.py" $StorageAccount $BlobContainer $AiSearch $AiSearchIndex
-    if ($LASTEXITCODE -ne 0) { Write-Host "Error: PDF indexing script failed."; exit 1 }
-}
-if (-not $hasCsv -and -not $hasPdf) {
-    Write-Host "No CSV or PDF files found to index."
+Write-Host "Running the python script to index data"
+$process = Start-Process -FilePath $pythonCmd -ArgumentList "infra/scripts/index_datasets.py", $StorageAccount, $BlobContainer, $AiSearch, $AiSearchIndex -Wait -NoNewWindow -PassThru
+
+if ($process.ExitCode -ne 0) {
+    Write-Host "Error: Indexing python script execution failed."
+    exit 1
 }
 
+# Run indexing scripts
+# if ($hasCsv) {
+#     Write-Host "Running the python script to index CSV data"
+#     & $pythonCmd "infra/scripts/index_datasets.py" $StorageAccount $BlobContainer $AiSearch $AiSearchIndex
+#     if ($LASTEXITCODE -ne 0) { Write-Host "Error: CSV indexing script failed."; exit 1 }
+# }
+# if ($hasPdf) {
+#     Write-Host "Running the python script to index PDF data"
+#     & $pythonCmd "infra/scripts/index_rfp_data.py" $StorageAccount $BlobContainer $AiSearch $AiSearchIndex
+#     if ($LASTEXITCODE -ne 0) { Write-Host "Error: PDF indexing script failed."; exit 1 }
+# }
+# if (-not $hasCsv -and -not $hasPdf) {
+#     Write-Host "No CSV or PDF files found to index."
+# }
+
 # Disable public access again
 if ($stIsPublicAccessDisabled) {
     Write-Host "Disabling public access for storage account: $StorageAccount"
diff --git a/infra/scripts/index_datasets.py b/infra/scripts/index_datasets.py
index 48040738..9510d155 100644
--- a/infra/scripts/index_datasets.py
+++ b/infra/scripts/index_datasets.py
@@ -5,6 +5,49 @@
 from azure.storage.blob import BlobServiceClient
 import sys
 
+# PDF text extraction function
+def extract_pdf_text(pdf_bytes):
+    """Extract text content from PDF bytes using PyPDF2"""
+    try:
+        import PyPDF2
+        import io
+
+        pdf_file = io.BytesIO(pdf_bytes)
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+
+        # Check if PDF is encrypted/protected
+        if pdf_reader.is_encrypted:
+            return "PDF_PROTECTED: This PDF document is password-protected or encrypted and cannot be processed."
+
+        text_content = []
+        for page in pdf_reader.pages:
+            try:
+                page_text = page.extract_text()
+                if page_text and page_text.strip():
+                    text_content.append(page_text)
+            except Exception:
+                continue
+
+        full_text = "\n".join(text_content).strip()
+
+        # Check for protection messages
+        protection_indicators = [
+            "protected by Microsoft Office",
+            "You'll need a different reader",
+            "Download a compatible PDF reader",
+            "This PDF Document has been protected"
+        ]
+
+        if any(indicator.lower() in full_text.lower() for indicator in protection_indicators):
+            return "PDF_PROTECTED: This PDF document appears to be protected or encrypted."
+
+        return full_text if full_text else "PDF_NO_TEXT: No readable text content found in PDF."
+
+    except ImportError:
+        return "PDF_ERROR: PyPDF2 library not available. Install with: pip install PyPDF2"
+    except Exception as e:
+        return f"PDF_ERROR: Error reading PDF content: {str(e)}"
+
 if len(sys.argv) < 4:
     print("Usage: python index_datasets.py []")
     sys.exit(1)
@@ -51,11 +94,24 @@
     #if blob.name.endswith(".csv"):
-    title = blob.name.replace(".csv", "")
-    title = blob.name.replace(".json", "")
+    # Strip the data-file extension from the title. Back-to-back
+    # reassignments from blob.name kept only the last replace(), so
+    # .csv and .json titles were never actually stripped.
+    title = blob.name
+    for ext in (".csv", ".json", ".pdf"):
+        if title.lower().endswith(ext):
+            title = title[:-len(ext)]
+            break
     data = container_client.download_blob(blob.name).readall()
 
     try:
         print(f"Reading data from blob: {blob.name}...")
-        text = data.decode('utf-8')
+
+        # Check if this is a PDF file and process accordingly
+        if blob.name.lower().endswith('.pdf'):
+            text = extract_pdf_text(data)
+        else:
+            # Original processing for non-PDF files
+            text = data.decode('utf-8')
+
         data_list.append({
             "content": text,
             "id": str(idx),
diff --git a/infra/scripts/process_sample_data.sh b/infra/scripts/process_sample_data.sh
index 2c498199..c6f9c3ad 100644
--- a/infra/scripts/process_sample_data.sh
+++ b/infra/scripts/process_sample_data.sh
@@ -123,13 +123,19 @@ fi
 
 
 #Upload sample CSV files to blob storage
-echo "Uploading CSV sample files to blob storage..."
+echo "Uploading CSV and JSON sample files to blob storage..."
 az storage blob upload-batch --account-name "$storageAccount" --destination "$blobContainer" --source "data/datasets" --auth-mode login --pattern '*.csv' --overwrite --output none
 if [ $? -ne 0 ]; then
     echo "Error: Failed to upload CSV files to blob storage."
     exit 1
 fi
-echo "CSV files uploaded successfully to blob storage."
+# $? only holds the status of the last command, so the JSON upload gets its own check.
+az storage blob upload-batch --account-name "$storageAccount" --destination "$blobContainer" --source "data/datasets" --auth-mode login --pattern '*.json' --overwrite --output none
+if [ $? -ne 0 ]; then
+    echo "Error: Failed to upload JSON files to blob storage."
+    exit 1
+fi
+echo "CSV and JSON files uploaded successfully to blob storage."
 
 #Upload PDF files from RFP_dataset to blob storage
 echo "Uploading PDF files from RFP_dataset to blob storage..."
@@ -194,14 +200,14 @@ if [ "$has_csv" = true ]; then
     fi
 fi
 
-if [ "$has_pdf" = true ]; then
-    echo "Running the python script to index PDF data"
-    $PYTHON_CMD infra/scripts/index_rfp_data.py "$storageAccount" "$blobContainer" "$aiSearch" "$aiSearchIndex"
-    if [ $? -ne 0 ]; then
-        echo "Error: PDF indexing python script execution failed."
-        exit 1
-    fi
-fi
+# if [ "$has_pdf" = true ]; then
+#     echo "Running the python script to index PDF data"
+#     $PYTHON_CMD infra/scripts/index_rfp_data.py "$storageAccount" "$blobContainer" "$aiSearch" "$aiSearchIndex"
+#     if [ $? -ne 0 ]; then
+#         echo "Error: PDF indexing python script execution failed."
+#         exit 1
+#     fi
+# fi
 
 if [ "$has_csv" = false ] && [ "$has_pdf" = false ]; then
     echo "No CSV or PDF files found to index."
diff --git a/infra/scripts/upload_team_config.py b/infra/scripts/upload_team_config.py
index 39036ee7..94e22adf 100644
--- a/infra/scripts/upload_team_config.py
+++ b/infra/scripts/upload_team_config.py
@@ -15,7 +15,7 @@
 print(f"Scanning directory: {directory_path}")
 
 files_to_process = [
-    ("RFP_Analysis_team", "00000000-0000-0000-0000-000000000001"),
+    ("RFP_Analysis_team.json", "00000000-0000-0000-0000-000000000001"),
     ("hr.json", "00000000-0000-0000-0000-000000000002"),
     ("marketing.json", "00000000-0000-0000-0000-000000000003"),
     ("retail.json", "00000000-0000-0000-0000-000000000004"),