From 85c0b82eb8fe4bcdc9bf5b7d9428830b6f0205e1 Mon Sep 17 00:00:00 2001 From: Charith Nuwan Bimsara <59943919+nuwangeek@users.noreply.github.com> Date: Wed, 12 Nov 2025 21:03:57 +0530 Subject: [PATCH] Merge pull request #383 from rootcodelabs/deployment-bug-fixes-Bimsara Bug fixes and create separate endpoint for generating presigned urls --- .env | 6 +- .../DSL/fetch_chunk_without_filter.yml | 5 - DSL/CronManager/DSL/fetch_multi_chunk.yml | 5 - .../DSL/mock_signed_url_generation.yml | 5 + DSL/CronManager/script/callback_format.sh | 46 ++--- DSL/CronManager/script/dataset_pipeline_s3.sh | 171 +++++++++++++--- DSL/CronManager/script/fetch_multi_chunk.sh | 120 ------------ DSL/CronManager/script/fetch_single_chunk.sh | 95 --------- .../script/presigned_url_generate.sh | 78 ++++++++ .../script/train_script_starter.sh | 166 +++++++++------- .../global-classifier/POST/get-agencies.sql | 2 +- .../global-classifier/POST/get-datasets.sql | 5 + .../POST/insert-agency-presigned-url.sql | 3 + .../POST/update-agency-presigned-url.sql | 6 + .../global-classifier/GET/datasets/list.yml | 5 + .../POST/ckb/agency-data-url.yml | 59 ++++++ .../global-classifier/POST/data/callback.yml | 4 +- .../FormElements/FormSelect/FormSelect.scss | 8 + .../FormElements/FormSelect/index.tsx | 4 +- .../FormTextarea/FormTextarea.scss | 6 +- .../FormElements/FormTextarea/index.tsx | 1 + .../molecules/DataModelForm/index.tsx | 5 + GUI/src/pages/DataModels/CreateDataModel.tsx | 3 +- GUI/src/pages/Datasets/index.tsx | 4 +- GUI/src/pages/TestModel/index.tsx | 16 +- GUI/src/pages/ViewDataset/index.tsx | 2 +- GUI/src/services/datasets.ts | 4 +- GUI/src/utils/commonUtilts.ts | 15 +- GUI/src/utils/queryKeys.ts | 4 +- GUI/translations/en/common.json | 8 +- GUI/translations/et/common.json | 6 +- docker-compose-dev.yml | 24 +-- src/s3_dataset_processor/constants.py | 1 + .../dataset_generation_callback_processor.py | 53 +++++ src/scripts/constants.py | 8 + src/scripts/generate_signed_urls.py | 185 ++++++++++++++++++ 36 files changed, 737 insertions(+), 401 deletions(-) delete mode 100644 DSL/CronManager/DSL/fetch_chunk_without_filter.yml delete mode 100644 DSL/CronManager/DSL/fetch_multi_chunk.yml create mode 100644 DSL/CronManager/DSL/mock_signed_url_generation.yml delete mode 100755 DSL/CronManager/script/fetch_multi_chunk.sh delete mode 100755 DSL/CronManager/script/fetch_single_chunk.sh create mode 100644 DSL/CronManager/script/presigned_url_generate.sh create mode 100644 DSL/Resql/global-classifier/POST/insert-agency-presigned-url.sql create mode 100644 DSL/Resql/global-classifier/POST/update-agency-presigned-url.sql create mode 100644 DSL/Ruuter.public/global-classifier/POST/ckb/agency-data-url.yml create mode 100644 src/scripts/constants.py create mode 100644 src/scripts/generate_signed_urls.py diff --git a/.env b/.env index a30f1b57..90abe763 100644 --- a/.env +++ b/.env @@ -1,7 +1,7 @@ AWS_ACCESS_KEY_ID=your_aws_access_key_id AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key BEDROCK_AWS_REGION=eu-west-1 -AZURE_OPENAI_API_KEY=your_azure_openai_api_key -AZURE_OPENAI_ENDPOINT=your_azure_openai_endpoint -AZURE_OPENAI_DEPLOYMENT_NAME=gpt-4o +AZURE_OPENAI_API_KEY=your_openai_api_key +AZURE_OPENAI_ENDPOINT=your_openai_endpoint +AZURE_OPENAI_DEPLOYMENT_NAME=gpt-4o-mini PROVIDER_NAME=azure-openai \ No newline at end of file diff --git a/DSL/CronManager/DSL/fetch_chunk_without_filter.yml b/DSL/CronManager/DSL/fetch_chunk_without_filter.yml deleted file mode 100644 index 6f12fb8e..00000000 --- a/DSL/CronManager/DSL/fetch_chunk_without_filter.yml +++ /dev/null @@ -1,5 +0,0 @@ -fetch_single_chunk: - trigger: off - type: exec - command: "../app/scripts/fetch_single_chunk.sh" - allowedEnvs: ['datasetId', 'pageNum'] \ No newline at end of file diff --git a/DSL/CronManager/DSL/fetch_multi_chunk.yml b/DSL/CronManager/DSL/fetch_multi_chunk.yml deleted file mode 100644 index f52a735e..00000000 --- a/DSL/CronManager/DSL/fetch_multi_chunk.yml +++ /dev/null @@ -1,5 +0,0 @@ -multi_chunk: - trigger: off - type: exec - command: "../app/scripts/fetch_multi_chunk.sh" - allowedEnvs: ['datasetId', 'chunkIds'] \ No newline at end of file diff --git a/DSL/CronManager/DSL/mock_signed_url_generation.yml b/DSL/CronManager/DSL/mock_signed_url_generation.yml new file mode 100644 index 00000000..88c665a1 --- /dev/null +++ b/DSL/CronManager/DSL/mock_signed_url_generation.yml @@ -0,0 +1,5 @@ +mock_signed_url_generate: + trigger: off + type: exec + command: "../app/scripts/presigned_url_generate.sh" + allowedEnvs: ['centopsAgencies'] \ No newline at end of file diff --git a/DSL/CronManager/script/callback_format.sh b/DSL/CronManager/script/callback_format.sh index 1eee8dda..62517c7a 100755 --- a/DSL/CronManager/script/callback_format.sh +++ b/DSL/CronManager/script/callback_format.sh @@ -14,16 +14,16 @@ log() { } PROGRESS_UPDATE_URL="http://ruuter-public:8086/global-classifier/datasets/progress/update" # Debug: Check Python environment -log "πŸ” Python version: $(python3 --version)" -log "πŸ” Python path: $(which python3)" +log "Python version: $(python3 --version)" +log "Python path: $(which python3)" # Install required packages -log "πŸ” Installing required Python packages..." +log "Installing required Python packages..." python3 -m pip install --quiet --no-cache-dir requests pydantic pandas || { - log "❌ Failed to install packages" + log "Failed to install packages" exit 1 } -log "βœ… Required packages installed" +log "Required packages installed" log "Dataset generation callback processing started" log "File path: $filePath" @@ -35,7 +35,7 @@ log "Extracted dataset ID: $dataset_id" # Direct Python script path for processing generation callback (inside container) CALLBACK_SCRIPT="/app/src/s3_dataset_processor/dataset_generation_callback_processor.py" -log "πŸ” Calling direct Python script to process generation callback..." +log "Calling direct Python script to process generation callback..." # Create temporary file for response temp_response="/tmp/callback_response.json" @@ -65,46 +65,46 @@ python3 "$CALLBACK_SCRIPT" \ > /tmp/callback_stdout.log 2> /tmp/callback_stderr.log exit_code=$? -log "πŸͺ΅ Python STDOUT:" +log "Python STDOUT:" cat /tmp/callback_stdout.log -log "πŸͺ΅ Python STDERR:" +log "Python STDERR:" cat /tmp/callback_stderr.log -log "πŸ” Python script exit code: $exit_code" +log "Python script exit code: $exit_code" if [ -f "$temp_response" ]; then - log "πŸ“„ Contents of output JSON:" + log "Contents of output JSON:" cat "$temp_response" else - log "⚠️ No output JSON file was generated." + log "No output JSON file was generated." fi # Check if script execution was successful if [ "$exit_code" -eq 0 ] && [ -f "$temp_response" ]; then - log "βœ… Python script execution successful" + log "Python script execution successful" response_body=$(cat "$temp_response") - log "πŸ” Response: $response_body" + log "Response: $response_body" # Parse the response to get status information if command -v jq >/dev/null 2>&1; then status=$(echo "$response_body" | jq -r '.status // "unknown"') message=$(echo "$response_body" | jq -r '.message // "unknown"') - log "πŸ“Š Callback Processing Status:" + log "Callback Processing Status:" log " - Status: $status" log " - Message: $message" log " - Dataset ID: $dataset_id" else # Fallback parsing without jq - log "⚠️ jq not available, using grep/sed for parsing" + log "jq not available, using grep/sed for parsing" status=$(echo "$response_body" | grep -o '"status":"[^"]*"' | sed 's/.*"status":"\([^"]*\)".*/\1/' || echo "unknown") message=$(echo "$response_body" | grep -o '"message":"[^"]*"' | sed 's/.*"message":"\([^"]*\)".*/\1/' || echo "unknown") - log "πŸ“Š Callback Processing Status:" + log "Callback Processing Status:" log " - Status: $status" log " - Message: $message" log " - Dataset ID: $dataset_id" @@ -112,22 +112,22 @@ if [ "$exit_code" -eq 0 ] && [ -f "$temp_response" ]; then # Check if callback processing was completed if [ "$status" = "completed" ]; then - log "βœ… Dataset generation callback processed successfully" - log "πŸ”„ Callback payload has been sent to status update endpoint" + log "Dataset generation callback processed successfully" + log "Callback payload has been sent to status update endpoint" log " - agencies: [{agencyId: X, syncStatus: Synced_with_CKB/Sync_with_CKB_Failed}, ...]" log " - datasetId: $dataset_id" log " - generationStatus: Generation_Success/Generation_Failed" else - log "⚠️ Unexpected status received: $status" - log "⚠️ Message: $message" + log "Unexpected status received: $status" + log "Message: $message" fi # Cleanup temp file rm -f "$temp_response" else - log "❌ Python script execution failed with exit code: $exit_code" + log "Python script execution failed with exit code: $exit_code" if [ -f "$temp_response" ]; then log "Error response: $(cat $temp_response)" rm -f "$temp_response" @@ -135,7 +135,7 @@ else exit 1 fi -log "βœ… Dataset generation callback processing completed successfully" -log "πŸ“‹ Summary: Dataset ID: $dataset_id, Request Status: $status" +log "Dataset generation callback processing completed successfully" +log "Summary: Dataset ID: $dataset_id, Request Status: $status" exit 0 \ No newline at end of file diff --git a/DSL/CronManager/script/dataset_pipeline_s3.sh b/DSL/CronManager/script/dataset_pipeline_s3.sh index 898b7619..01795e79 100755 --- a/DSL/CronManager/script/dataset_pipeline_s3.sh +++ b/DSL/CronManager/script/dataset_pipeline_s3.sh @@ -11,6 +11,77 @@ log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" } +# Failure notification function +send_failure_status_update() { + local failure_message="$1" + local dataset_id="$2" + local response_body="$3" + local failure_type="$4" # "extraction_failure" or "generation_failure" + + STATUS_UPDATE_URL="http://ruuter-public:8086/global-classifier/agencies/data/generation" + + agencies_array="[]" + + if [ -n "$response_body" ] && [ "$response_body" != "null" ]; then + if command -v jq >/dev/null 2>&1; then + if [ "$failure_type" = "extraction_failure" ]; then + # Only agencies with extraction_success = false + agencies_array=$(echo "$response_body" | jq -r '[.downloaded_files[]? | select(.extraction_success == false) | {"agencyId": .agency_id, "syncStatus": "Sync_with_CKB_Failed"}]' 2>/dev/null || echo "[]") + else + # All agencies failed + agencies_array=$(echo "$response_body" | jq -r '[.downloaded_files[]? | {"agencyId": .agency_id, "syncStatus": "Sync_with_CKB_Failed"}]' 2>/dev/null || echo "[]") + fi + else + # Fallback parsing + agencies_array="[" + first_agency=true + + if [ "$failure_type" = "extraction_failure" ]; then + # Only include agencies where extraction_success is false + echo "$response_body" | grep -o '"agency_id"[[:space:]]*:[[:space:]]*"[^"]*"[^}]*"extraction_success"[[:space:]]*:[[:space:]]*false' | sed 's/.*"agency_id"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/' | while read -r agency_id; do + if [ -n "$agency_id" ]; then + if [ "$first_agency" = false ]; then + agencies_array="$agencies_array," + fi + agencies_array="$agencies_array{\"agencyId\": \"$agency_id\", \"syncStatus\": \"Sync_with_CKB_Failed\"}" + first_agency=false + fi + done + else + # All agencies failed + echo "$response_body" | grep -o '"agency_id"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/.*"agency_id"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/' | while read -r agency_id; do + if [ -n "$agency_id" ]; then + if [ "$first_agency" = false ]; then + agencies_array="$agencies_array," + fi + agencies_array="$agencies_array{\"agencyId\": \"$agency_id\", \"syncStatus\": \"Sync_with_CKB_Failed\"}" + first_agency=false + fi + done + fi + agencies_array="$agencies_array]" + fi + fi + + failure_payload=$(cat <&2 -} - -log "Multi-chunk download request started" -log "Dataset ID: $datasetId" -log "Chunk IDs: $chunkIds" - -# Clean the parameters -DATASET_ID=$(echo "$datasetId" | tr -d '"') -CHUNK_IDS=$(echo "$chunkIds" | tr -d '"') - -log "Cleaned Dataset ID: $DATASET_ID" -log "Cleaned Chunk IDs: $CHUNK_IDS" - -# Validate chunk IDs format -if [[ ! "$CHUNK_IDS" =~ ^[0-9]+([[:space:]]+[0-9]+)*$ ]]; then - log "❌ Invalid chunk IDs format. Expected space-separated numbers." - error_response="{\"success\": false, \"dataset_id\": \"$DATASET_ID\", \"chunk_ids\": \"$CHUNK_IDS\", \"error\": \"Invalid chunk IDs format\", \"message\": \"Expected space-separated numbers like '1 2 3'\"}" - echo "$error_response" - exit 1 -fi - -# Create temp_chunks directory if it doesn't exist -mkdir -p /app/temp_chunks -log "Created/verified temp_chunks directory" - -# Install required Python packages if not present -log "πŸ” Installing required Python packages..." -python3 -m pip install --quiet --no-cache-dir requests pydantic || { - log "❌ Failed to install packages" - exit 1 -} -log "βœ… Required packages installed" - -# Direct Python script path for downloading multiple chunks (inside container) -DOWNLOAD_SCRIPT="/app/src/s3_dataset_processor/fetch_multi_chunk.py" - -log "πŸ” Calling Python script to download and aggregate chunks..." - -# Create temporary file for response -temp_response="/tmp/multi_chunk_response.json" - -# Call the Python script -python3 "$DOWNLOAD_SCRIPT" \ - --dataset-id "$DATASET_ID" \ - --chunk-ids "$CHUNK_IDS" \ - --output-json "$temp_response" - -exit_code=$? -log "πŸ” Python script exit code: $exit_code" - -if [ "$exit_code" -eq 0 ] && [ -f "$temp_response" ]; then - log "βœ… Multi-chunk processing successful" - - response_body=$(cat "$temp_response") - - # Check if aggregation was successful - success_check=$(echo "$response_body" | grep -o '"success"[[:space:]]*:[[:space:]]*true' | wc -l) - - if [ "$success_check" -gt 0 ]; then - log "βœ… Chunks aggregated successfully" - - # Extract summary information for logging - if command -v jq >/dev/null 2>&1; then - total_items=$(echo "$response_body" | jq -r '.download_summary.total_items_aggregated // 0' 2>/dev/null || echo "0") - successful_chunks=$(echo "$response_body" | jq -r '.download_summary.successful_downloads // 0' 2>/dev/null || echo "0") - failed_chunks=$(echo "$response_body" | jq -r '.download_summary.failed_downloads // 0' 2>/dev/null || echo "0") - - log "πŸ“Š Aggregation Summary:" - log " - Total items aggregated: $total_items" - log " - Successful chunk downloads: $successful_chunks" - log " - Failed chunk downloads: $failed_chunks" - else - log "πŸ“Š Multi-chunk aggregation completed (install jq for detailed summary)" - fi - - # Output the JSON response to stdout (this goes to CronManager caller) - cat "$temp_response" - - # Cleanup - rm -f "$temp_response" - - log "βœ… Multi-chunk aggregation completed successfully" - exit 0 - else - log "❌ Multi-chunk aggregation failed - check response for details" - - # Still output the response so caller can see the error - cat "$temp_response" - - # Cleanup - rm -f "$temp_response" - exit 1 - fi - -else - log "❌ Python script execution failed with exit code: $exit_code" - - # Create error response - error_response="{\"success\": false, \"dataset_id\": \"$DATASET_ID\", \"chunk_ids\": \"$CHUNK_IDS\", \"error\": \"Script execution failed\", \"message\": \"Python script failed with exit code $exit_code\"}" - echo "$error_response" - - if [ -f "$temp_response" ]; then - log "Error response: $(cat $temp_response)" - rm -f "$temp_response" - fi - exit 1 -fi \ No newline at end of file diff --git a/DSL/CronManager/script/fetch_single_chunk.sh b/DSL/CronManager/script/fetch_single_chunk.sh deleted file mode 100755 index 77df4869..00000000 --- a/DSL/CronManager/script/fetch_single_chunk.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash - -echo "Started Shell Script for Chunk Download" - -# Check if environment variables are set -if [ -z "$datasetId" ] || [ -z "$pageNum" ]; then - echo "Please set the datasetId and pageNum environment variables." - exit 1 -fi - -# Logging function -log() { - echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >&2 -} - -log "Chunk download request started" -log "Dataset ID: $datasetId" -log "Page Number: $pageNum" - -# Clean the parameters -DATASET_ID=$(echo "$datasetId" | tr -d '"') -PAGE_NUM=$(echo "$pageNum" | tr -d '"') - -log "Cleaned Dataset ID: $DATASET_ID" -log "Cleaned Page Number: $PAGE_NUM" - -# Install required Python packages if not present -log "πŸ” Installing required Python packages..." -python3 -m pip install --quiet --no-cache-dir requests pydantic || { - log "❌ Failed to install packages" - exit 1 -} -log "βœ… Required packages installed" - -# Direct Python script path for downloading chunk (inside container) -DOWNLOAD_SCRIPT="/app/src/s3_dataset_processor/fetch_chunk_without_filter.py" - -log "πŸ” Calling Python script to download chunk..." - -# Create temporary file for response -temp_response="/tmp/chunk_response.json" - -# Call the Python script -python3 "$DOWNLOAD_SCRIPT" \ - --dataset-id "$DATASET_ID" \ - --page-num "$PAGE_NUM" \ - --output-json "$temp_response" - -exit_code=$? -log "πŸ” Python script exit code: $exit_code" - -if [ "$exit_code" -eq 0 ] && [ -f "$temp_response" ]; then - log "βœ… Chunk download successful" - - response_body=$(cat "$temp_response") - log "πŸ” Response: $response_body" - - # Check if download was successful - success_check=$(echo "$response_body" | grep -o '"success"[[:space:]]*:[[:space:]]*true' | wc -l) - - if [ "$success_check" -gt 0 ]; then - log "βœ… Chunk downloaded successfully" - - # Output the JSON response to stdout (this goes to CronManager caller) - cat "$temp_response" - - # Cleanup - rm -f "$temp_response" - - log "βœ… Chunk download completed successfully" - exit 0 - else - log "❌ Chunk download failed - check response for details" - - # Still output the response so caller can see the error - cat "$temp_response" - - # Cleanup - rm -f "$temp_response" - exit 1 - fi - -else - log "❌ Python script execution failed with exit code: $exit_code" - - # Create error response - error_response="{\"success\": false, \"dataset_id\": \"$DATASET_ID\", \"page_num\": $PAGE_NUM, \"error\": \"Script execution failed\", \"message\": \"Python script failed with exit code $exit_code\"}" - echo "$error_response" - - if [ -f "$temp_response" ]; then - log "Error response: $(cat $temp_response)" - rm -f "$temp_response" - fi - exit 1 -fi \ No newline at end of file diff --git a/DSL/CronManager/script/presigned_url_generate.sh b/DSL/CronManager/script/presigned_url_generate.sh new file mode 100644 index 00000000..49fe4a31 --- /dev/null +++ b/DSL/CronManager/script/presigned_url_generate.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +echo "Starting presigned URL generation..." + +# Check if environment variable is set +if [ -z "$centopsAgencies" ]; then + echo "Error: centopsAgencies environment variable is not set" + exit 1 +fi + +echo "Received centopsAgencies: $centopsAgencies" + +# Decode the URL-encoded string for debugging +decoded_agencies=$(python3 -c "import urllib.parse, sys; print(urllib.parse.unquote(sys.argv[1]))" "$centopsAgencies" 2>/dev/null) +echo "Decoded agencies: $decoded_agencies" + +# Install uv if not found (using unmanaged installation for security) +UV_INSTALL_DIR="/app/tools/uv" +UV_BIN="$UV_INSTALL_DIR/uv" + +if [ ! -f "$UV_BIN" ]; then + echo "[UV] Installing uv to isolated directory..." + + # Create installation directory + mkdir -p "$UV_INSTALL_DIR" || { + echo "[ERROR] Failed to create UV installation directory" + exit 1 + } + + # Use unmanaged installation to avoid root directory modifications + curl -LsSf https://astral.sh/uv/install.sh | env UV_UNMANAGED_INSTALL="$UV_INSTALL_DIR" sh || { + echo "[ERROR] Failed to install uv" + exit 1 + } + + # Verify installation + if [ ! -x "$UV_BIN" ]; then + echo "[ERROR] UV installation failed or not executable" + exit 1 + fi + + # Verify functionality + "$UV_BIN" --version || { + echo "[ERROR] UV installation corrupted" + exit 1 + } + + echo "[UV] Successfully installed uv (unmanaged) to $UV_INSTALL_DIR" +fi + +# Activate Python virtual environment +VENV_PATH="/app/python_virtual_env" +echo "[VENV] Activating virtual environment at: $VENV_PATH" +source "$VENV_PATH/bin/activate" || { + echo "[ERROR] Failed to activate virtual environment" + exit 1 +} + +# Install required packages +echo "[PACKAGES] Installing required packages..." +"$UV_BIN" pip install --python "$VENV_PATH/bin/python3" "boto3>=1.35.0" || exit 1 +"$UV_BIN" pip install --python "$VENV_PATH/bin/python3" "botocore>=1.35.0" || exit 1 +"$UV_BIN" pip install --python "$VENV_PATH/bin/python3" "requests>=2.32.0" || exit 1 +echo "[PACKAGES] All packages installed successfully" + +export PYTHONPATH="/app:/app/src:$PYTHONPATH" + +# Call Python script with the agencies data +echo "Calling Python script..." +python3 "/app/src/scripts/generate_signed_urls.py" "$centopsAgencies" 2>&1 + +# Check if Python script execution was successful +if [ $? -eq 0 ]; then + echo "Presigned URL generation completed successfully" +else + echo "Error: Presigned URL generation failed" + exit 1 +fi diff --git a/DSL/CronManager/script/train_script_starter.sh b/DSL/CronManager/script/train_script_starter.sh index 790429e3..dc1e199b 100755 --- a/DSL/CronManager/script/train_script_starter.sh +++ b/DSL/CronManager/script/train_script_starter.sh @@ -7,34 +7,32 @@ GET_FIRST_COME_TRAINING_JOB_SQL="http://resql:8082/global-classifier/get-queued- GET_DATA_MODEL_BY_MODEL_ID_SQL="http://resql:8082/global-classifier/get-data-model-info-by-given-model-id" UPDATE_JOB_STATUS="http://resql:8082/global-classifier/update-training-job-status" -echo "πŸ”„ [START] Training script starter" +echo "[START] Training script starter" # Check if training is in progress -echo "πŸ” [CHECK] Checking if training is in progress..." +echo "[CHECK] Checking if training is in progress..." response_job_status_in_progres=$(curl -s -X POST "$CHECK_JOB_STATUS_IN_PROGRESS_SQL") -echo "πŸ” [DEBUG] Training status response: '$response_job_status_in_progres'" +echo "[DEBUG] Training status response: '$response_job_status_in_progres'" if [ $? -ne 0 ] || [ -z "$response_job_status_in_progres" ]; then - echo "❌ [ERROR] Failed to check training status" + echo "[ERROR] Failed to check training status" exit 1 fi if echo "$response_job_status_in_progres" | grep -q '"hasTrainingInProgress":true'; then - echo "⚠️ [INFO] Training is already in progress. Exiting..." + echo "[INFO] Training is already in progress. Exiting..." exit 0 fi -echo "βœ… [AVAILABLE] No training in progress." - +echo "[AVAILABLE] No training in progress." # Get first queued training job -echo "🎯 [QUEUE] Getting first queued training job..." +echo "[QUEUE] Getting first queued training job..." response_first_come_training_job=$(curl -s -X POST "$GET_FIRST_COME_TRAINING_JOB_SQL") -echo "πŸ” [DEBUG] First queued job response: '$response_first_come_training_job'" - +echo "[DEBUG] First queued job response: '$response_first_come_training_job'" # Handle empty response (no queued jobs) - this is normal, not an error if [ -z "$response_first_come_training_job" ]; then - echo "ℹ️ [INFO] No queued training jobs found. Nothing to process." - echo "βœ… [DONE] Training script starter completed - no work to do" + echo "[INFO] No queued training jobs found. Nothing to process." + echo "[DONE] Training script starter completed - no work to do" exit 0 fi @@ -45,8 +43,8 @@ if echo "$response_first_come_training_job" | grep -q '"hasQueuedJobs":false' || [ "$response_first_come_training_job" = "{}" ] || \ [ "$response_first_come_training_job" = "null" ] || \ [ "$response_first_come_training_job" = "[]" ]; then - echo "ℹ️ [INFO] No queued training jobs available. Queue is empty." - echo "βœ… [DONE] Training script starter completed - no work to do" + echo "[INFO] No queued training jobs available. Queue is empty." + echo "[DONE] Training script starter completed - no work to do" exit 0 fi @@ -59,74 +57,74 @@ minor_version=$(echo "$response_first_come_training_job" | sed -E 's/.*"minorVer latest=$(echo "$response_first_come_training_job" | sed -E 's/.*"latest":[[:space:]]*(true|false).*/\1/') deployment_environment=$(echo "$response_first_come_training_job" | sed -E 's/.*"deploymentEnvironment":"?([^",}]+)"?.*/\1/') -echo "πŸ” [----DEBUG----] Raw response: '$response_first_come_training_job'" +echo "[DEBUG] Raw response: '$response_first_come_training_job'" if [ -z "$model_id" ]; then - echo "❌ [ERROR] Model ID not found in response" - echo "πŸ” [DEBUG] Raw response: '$response_first_come_training_job'" + echo "[ERROR] Model ID not found in response" + echo "[DEBUG] Raw response: '$response_first_come_training_job'" exit 1 fi if [ -z "$job_id" ] || [ "$job_id" = "$response_first_come_training_job" ]; then - echo "❌ [ERROR] Job ID not found or invalid in response" - echo "πŸ” [DEBUG] Raw response: '$response_first_come_training_job'" + echo "[ERROR] Job ID not found or invalid in response" + echo "[DEBUG] Raw response: '$response_first_come_training_job'" exit 1 fi -echo "πŸ“¦ [MODEL] Model ID: $model_id" -echo "πŸ“¦ [JOB] Job ID: $job_id" -echo "πŸ“¦ [MODEL] Model Name: $model_name" -echo "πŸ“¦ [VERSION] Major Version: $major_version" -echo "πŸ“¦ [VERSION] Minor Version: $minor_version" -echo "πŸ“¦ [VERSION] Latest: $latest" -echo "πŸ“¦ [ENVIRONMENT] Deployment Environment: $deployment_environment" +echo "[MODEL] Model ID: $model_id" +echo "[JOB] Job ID: $job_id" +echo "[MODEL] Model Name: $model_name" +echo "[VERSION] Major Version: $major_version" +echo "[VERSION] Minor Version: $minor_version" +echo "[VERSION] Latest: $latest" +echo "[ENVIRONMENT] Deployment Environment: $deployment_environment" response_update_job_status=$(curl -s -X POST "$UPDATE_JOB_STATUS" \ -H "Content-Type: application/json" \ -d "{\"jobId\": $job_id, \"jobStatus\": \"training-in-progress\"}") -echo "πŸ” [DEBUG] Update job status response: '$response_update_job_status'" +echo "[DEBUG] Update job status response: '$response_update_job_status'" # Get dataset ID response_get_dataset_id=$(curl -s -X POST "$GET_DATA_MODEL_BY_MODEL_ID_SQL" \ -H "Content-Type: application/json" \ -d "{\"model_id\": $model_id}") -echo "πŸ” [DEBUG] Dataset ID response: '$response_get_dataset_id'" +echo "[DEBUG] Dataset ID response: '$response_get_dataset_id'" # Handle empty response if [ -z "$response_get_dataset_id" ] || [ "$response_get_dataset_id" = "[]" ]; then - echo "❌ [ERROR] No dataset information found for model ID: $model_id" + echo "[ERROR] No dataset information found for model ID: $model_id" exit 1 fi dataset_id=$(echo "$response_get_dataset_id" | sed -E 's/.*"connectedDsId":([0-9]+).*/\1/') if [ -z "$dataset_id" ] || [ "$dataset_id" = "$response_get_dataset_id" ]; then - echo "❌ [ERROR] Connected Dataset ID not found in response" - echo "πŸ” [DEBUG] Raw response: '$response_get_dataset_id'" + echo "[ERROR] Connected Dataset ID not found in response" + echo "[DEBUG] Raw response: '$response_get_dataset_id'" exit 1 fi -echo "πŸ“¦ [DATASET] Dataset ID: $dataset_id" +echo "[DATASET] Dataset ID: $dataset_id" base_models_json=$(echo "$response_get_dataset_id" | sed -nE 's/.*"value":"(\[[^]]+\])".*/\1/p' | sed 's/\\"/"/g') if [[ "$base_models_json" == "["* ]] && [[ "$base_models_json" == *"]" ]]; then model_types="$base_models_json" - echo "πŸ“¦ [MODELS] Model types extracted from DB: $model_types" + echo "[MODELS] Model types extracted from DB: $model_types" else - echo "❌ [ERROR] Failed to extract base models from response" - echo "❌ [ERROR] Raw response: $response_get_dataset_id" - echo "❌ [ERROR] Extracted base_models: $base_models_json" + echo "[ERROR] Failed to extract base models from response" + echo "[ERROR] Raw response: $response_get_dataset_id" + echo "[ERROR] Extracted base_models: $base_models_json" exit 1 fi # Activate existing virtualenv -echo "βœ… Activating existing virtualenv at /app/python_virtual_env" -source /app/python_virtual_env/bin/activate || { echo "❌ Failed to activate virtualenv"; exit 1; } +echo "[INFO] Activating existing virtualenv at /app/python_virtual_env" +source /app/python_virtual_env/bin/activate || { echo "[ERROR] Failed to activate virtualenv"; exit 1; } export PYTHONPATH="/app:/app/src:/app/src/training:/app/src/s3_dataset_processor:$PYTHONPATH" -echo "πŸ” [DEBUG] PYTHONPATH set to: $PYTHONPATH" +echo "[DEBUG] PYTHONPATH set to: $PYTHONPATH" # Add these debug commands -echo "πŸ” [DEBUG] Virtual environment debugging:" +echo "[DEBUG] Virtual environment debugging:" echo " - VIRTUAL_ENV: $VIRTUAL_ENV" echo " - Python path: $(which python)" echo " - Python version: $(python --version)" @@ -134,53 +132,80 @@ echo " - Pip path: $(which pip)" echo " - Site packages: $(python -c "import site; print(site.getsitepackages())")" # List installed packages -echo "πŸ“¦ [DEBUG] Installed packages in current environment:" +echo "[DEBUG] Installed packages in current environment:" pip list | head -20 # Show first 20 packages # Check required packages -echo "πŸ” [DEBUG] Testing individual package imports inside virtualenv..." +echo "[DEBUG] Testing individual package imports inside virtualenv..." missing_pkgs=() for pkg in torch transformers sklearn mlflow pandas numpy loguru; do - echo "πŸ” [DEBUG] Testing import for $pkg" + echo "[DEBUG] Testing import for $pkg" if ! python -c "import $pkg" &>/dev/null; then - echo "❌ [MISSING or failed import] Package '$pkg'" + echo "[ERROR] Package '$pkg' is missing or failed to import" missing_pkgs+=("$pkg") else - echo "βœ… [FOUND] Package '$pkg'" + echo "[INFO] Package '$pkg' found" fi done # Install if missing if [ ${#missing_pkgs[@]} -ne 0 ]; then - echo "⚑ [ACTION] Missing packages detected: ${missing_pkgs[*]}" + echo "[ACTION] Missing packages detected: ${missing_pkgs[*]}" - if ! command -v uv &>/dev/null; then - echo "⚑ Installing uv inside virtualenv..." - pip install uv || { echo "❌ Failed to install uv"; exit 1; } - else - echo "βœ… uv already installed." + # Install uv using secure unmanaged installation (same as presigned_url_generate.sh) + UV_INSTALL_DIR="/app/tools/uv" + UV_BIN="$UV_INSTALL_DIR/uv" + + if [ ! -f "$UV_BIN" ]; then + echo "[UV] Installing uv to isolated directory..." + + # Create installation directory + mkdir -p "$UV_INSTALL_DIR" || { + echo "[ERROR] Failed to create UV installation directory" + exit 1 + } + + # Use unmanaged installation to avoid root directory modifications + curl -LsSf https://astral.sh/uv/install.sh | env UV_UNMANAGED_INSTALL="$UV_INSTALL_DIR" sh || { + echo "[ERROR] Failed to install uv" + exit 1 + } + + # Verify installation + if [ ! -x "$UV_BIN" ]; then + echo "[ERROR] UV installation failed or not executable" + exit 1 + fi + + # Verify functionality + "$UV_BIN" --version || { + echo "[ERROR] UV installation corrupted" + exit 1 + } + + echo "[UV] Successfully installed uv (unmanaged) to $UV_INSTALL_DIR" fi if [ ! -f /app/src/training/requirements-gpu.txt ]; then - echo "❌ /app/src/training/requirements-gpu.txt not found!" + echo "/app/src/training/requirements-gpu.txt not found!" exit 1 fi - echo "πŸ“¦ [INSTALL] Installing from /app/src/training/requirements-gpu.txt using uv..." - uv pip install -r /app/src/training/requirements-gpu.txt || { - echo "⚠️ uv install failed β€” trying pip as fallback..." + echo "[INSTALL] Installing from /app/src/training/requirements-gpu.txt using secure uv..." + "$UV_BIN" pip install --python "$VIRTUAL_ENV/bin/python3" -r /app/src/training/requirements-gpu.txt || { + echo "[WARNING] uv install failed β€” trying pip as fallback..." pip install -r /app/src/training/requirements-gpu.txt || { - echo "❌ Both uv and pip install failed inside virtualenv" + echo "[ERROR] Both uv and pip install failed inside virtualenv" exit 1 } } - echo "πŸŽ‰ [SUCCESS] Required packages installed successfully inside virtualenv." + echo "[SUCCESS] Required packages installed successfully inside virtualenv." else - echo "πŸŽ‰ [SUCCESS] All required Python packages are already installed inside virtualenv." + echo "[SUCCESS] All required Python packages are already installed inside virtualenv." fi -echo "βœ… [VIRTUALENV] All checks passed, proceeding with training script..." -echo "πŸš€ [TRAINING] Starting training for Model ID: $model_id, Dataset ID: $dataset_id, Model Major Version: $major_version, Model Minor Version: $minor_version, Model Name: $model_name" +echo "[SUCCESS] All checks passed, proceeding with training script..." +echo "[INFO] Starting training for Model ID: $model_id, Dataset ID: $dataset_id, Model Major Version: $major_version, Model Minor Version: $minor_version, Model Name: $model_name" # Set up training parameters TRAINING_SCRIPT="/app/src/training/model_trainer.py" @@ -192,13 +217,7 @@ PROCESSED_DATA_DIR="/app/data/processed" training_output_dir="${TRAINING_OUTPUT_DIR}/model_${model_id}" mkdir -p "$training_output_dir" -# # Set default training parameters (can be made configurable) -# max_seq_length=128 -# num_epochs=3 -# batch_size=8 -# learning_rate=2e-5 - -echo "πŸ“‹ [PARAMS] Training parameters:" +echo "[INFO] Training parameters:" echo " - Dataset ID: $dataset_id" echo " - Model ID: $model_id" echo " - Model Type: $model_types" @@ -211,7 +230,7 @@ echo " - Is Latest: $latest" echo " - Deployment Environment: $deployment_environment" # Call the training script -echo "πŸŽ“ [EXECUTE] Calling training script..." +echo "[EXECUTE] Calling training script..." python3 "$TRAINING_SCRIPT" \ --model_types "$model_types" \ @@ -223,16 +242,13 @@ python3 "$TRAINING_SCRIPT" \ --minor_version "$minor_version" \ --latest "$latest" \ --deployment_environment "$deployment_environment" \ - # --data_dir "$PROCESSED_DATA_DIR" \ - # --output_dir "$training_output_dir" \ - # --mlflow_tracking_uri "$MLFLOW_TRACKING_URI" \ training_exit_code=$? # Check training result if [ $training_exit_code -eq 0 ]; then - echo "πŸŽ‰ [SUCCESS] Training completed successfully" - echo "πŸ“ [OUTPUT] Training outputs saved to: $training_output_dir" + echo "[SUCCESS] Training completed successfully" + echo "[OUTPUT] Training outputs saved to: $training_output_dir" # Update job status to trained echo "[UPDATE] Updating job status to trained..." @@ -240,7 +256,7 @@ if [ $training_exit_code -eq 0 ]; then -H "Content-Type: application/json" \ -d "{\"jobId\": $job_id, \"jobStatus\": \"trained\"}") - echo "πŸ” [DEBUG] Update job status to trained response: '$response_update_job_status_trained'" + echo "[DEBUG] Update job status to trained response: '$response_update_job_status_trained'" else echo "[FAILED] Training failed with exit code: $training_exit_code" @@ -252,4 +268,4 @@ else exit 1 fi -echo "βœ… [DONE] Training script starter completed" \ No newline at end of file +echo "[DONE] Training script starter completed" \ No newline at end of file diff --git a/DSL/Resql/global-classifier/POST/get-agencies.sql b/DSL/Resql/global-classifier/POST/get-agencies.sql index 514aed0b..b2bc9db0 100644 --- a/DSL/Resql/global-classifier/POST/get-agencies.sql +++ b/DSL/Resql/global-classifier/POST/get-agencies.sql @@ -16,7 +16,7 @@ SELECT FROM integrated_agencies WHERE - (:agency_name = 'all' OR agency_name ILIKE '%' || :agency_name || '%') + (:agency_name = 'all' OR agency_name ILIKE '%' || REPLACE(:agency_name,'_', '\_') || '%' ESCAPE '\') ORDER BY CASE WHEN :sort_by = 'agency_name' AND :sort_type = 'asc' THEN agency_name END ASC, CASE WHEN :sort_by = 'agency_name' AND :sort_type = 'desc' THEN agency_name END DESC, diff --git a/DSL/Resql/global-classifier/POST/get-datasets.sql b/DSL/Resql/global-classifier/POST/get-datasets.sql index bd2febd8..1245a0e5 100644 --- a/DSL/Resql/global-classifier/POST/get-datasets.sql +++ b/DSL/Resql/global-classifier/POST/get-datasets.sql @@ -11,6 +11,11 @@ FROM dataset_versions WHERE (:generation_status = 'all' OR generation_status ILIKE '%' || :generation_status || '%') + AND (:dataset_name = 'all' + OR POSITION(LOWER(:dataset_name) IN LOWER(CONCAT('v', major, '.', minor))) > 0 + OR POSITION(LOWER(:dataset_name) IN LOWER(CONCAT(major, '.', minor))) > 0 + OR POSITION(LOWER(:dataset_name) IN LOWER(major::text)) > 0 + OR POSITION(LOWER(:dataset_name) IN LOWER(minor::text)) > 0) ORDER BY CASE WHEN :sort_by = 'created_at' AND :sort_type = 'asc' THEN created_at END ASC, CASE WHEN :sort_by = 'created_at' AND :sort_type = 'desc' THEN created_at END DESC, diff --git a/DSL/Resql/global-classifier/POST/insert-agency-presigned-url.sql b/DSL/Resql/global-classifier/POST/insert-agency-presigned-url.sql new file mode 100644 index 00000000..34f96ad3 --- /dev/null +++ b/DSL/Resql/global-classifier/POST/insert-agency-presigned-url.sql @@ -0,0 +1,3 @@ +-- Insert agency presigned URL +INSERT INTO public.mock_ckb (agency_id, agency_data_hash, data_url) +VALUES (:agencyId, :agencyDataHash, :dataUrl); \ No newline at end of file diff --git a/DSL/Resql/global-classifier/POST/update-agency-presigned-url.sql b/DSL/Resql/global-classifier/POST/update-agency-presigned-url.sql new file mode 100644 index 00000000..69a640bf --- /dev/null +++ b/DSL/Resql/global-classifier/POST/update-agency-presigned-url.sql @@ -0,0 +1,6 @@ +-- Update agency presigned URL +UPDATE public.mock_ckb +SET + data_url = :dataUrl, + created_at = NOW() +WHERE agency_id = :agencyId; \ No newline at end of file diff --git a/DSL/Ruuter.private/global-classifier/GET/datasets/list.yml b/DSL/Ruuter.private/global-classifier/GET/datasets/list.yml index 3790fd8d..3696b583 100644 --- a/DSL/Ruuter.private/global-classifier/GET/datasets/list.yml +++ b/DSL/Ruuter.private/global-classifier/GET/datasets/list.yml @@ -23,6 +23,9 @@ declaration: - field: sortType type: string description: "Query parameter 'sortType' for sort direction (asc, desc)" + - field: datasetName + type: string + description: "Query parameter 'datasetName' for filtering datasets by name" extractRequestData: assign: @@ -31,6 +34,7 @@ extractRequestData: generationStatus: ${incoming.params.generationStatus || 'all'} sortBy: ${incoming.params.sortBy || ''} sortType: ${incoming.params.sortType || 'desc'} + datasetName: ${incoming.params.datasetName || 'all'} getAllDatasets: call: http.post @@ -42,6 +46,7 @@ getAllDatasets: generation_status: ${generationStatus} sort_by: ${sortBy} sort_type: ${sortType} + dataset_name: ${datasetName} result: datasets_res next: return_result diff --git a/DSL/Ruuter.public/global-classifier/POST/ckb/agency-data-url.yml b/DSL/Ruuter.public/global-classifier/POST/ckb/agency-data-url.yml new file mode 100644 index 00000000..2d8b03c0 --- /dev/null +++ b/DSL/Ruuter.public/global-classifier/POST/ckb/agency-data-url.yml @@ -0,0 +1,59 @@ +declaration: + call: declare + version: 0.1 + description: "Mock endpoint for generating agency data URLs" + method: post + accepts: json + returns: json + namespace: global-classifier + +get_centops_agencies: + call: http.post + args: + url: "[#GLOBAL_CLASSIFIER_RESQL]/mock-get-agencies-from-centops" + result: fetch_agencies_from_centops + next: log_result + +log_result: + log: ${fetch_agencies_from_centops.response.body} + next: assign_request_data + +assign_request_data: + assign: + centops_agencies: ${encodeURIComponent(JSON.stringify(fetch_agencies_from_centops.response.body))} + next: execute_cron_manager + +execute_cron_manager: + call: http.post + args: + url: "[#GLOBAL_CLASSIFIER_CRON_MANAGER]/execute/mock_signed_url_generation/mock_signed_url_generate" + query: + centopsAgencies: ${centops_agencies} + result: res + next: assign_success_response + +assign_success_response: + assign: + format_res: { + message: "Centops agency data URLs synchronized successfully", + operationSuccessful: true, + } + next: return_ok + +assign_fail_response: + assign: + format_res: { + message: "Centops agency data URLs synchronization failed", + operationSuccessful: false, + } + next: return_bad_request + +return_ok: + status: 200 + return: ${format_res} + next: end + +return_bad_request: + status: 400 + return: ${format_res} + next: end \ No newline at end of file diff --git a/DSL/Ruuter.public/global-classifier/POST/data/callback.yml b/DSL/Ruuter.public/global-classifier/POST/data/callback.yml index e069deab..9aec279c 100644 --- a/DSL/Ruuter.public/global-classifier/POST/data/callback.yml +++ b/DSL/Ruuter.public/global-classifier/POST/data/callback.yml @@ -25,7 +25,7 @@ declare: description: "List of agency IDs for which the dataset generation was completed" log_callback_received: - log: "πŸ“ž Dataset generation callback received - Task ID: ${incoming.body.task_id}, Status: ${incoming.body.status}, File Path: ${incoming.body.filePath}" + log: "Dataset generation callback received - Task ID: ${incoming.body.task_id}, Status: ${incoming.body.status}, File Path: ${incoming.body.filePath}" next: extract_callback_data extract_callback_data: @@ -38,7 +38,7 @@ extract_callback_data: next: log_detailed_info log_detailed_info: - log: "πŸ“‹ Callback Details - Task: ${task_id}, Status: ${status}, Message: ${message}, filePath: ${file_path}, results: ${results}" + log: "Callback Details - Task: ${task_id}, Status: ${status}, Message: ${message}, filePath: ${file_path}, results: ${results}" next: check_for_request_data check_for_request_data: diff --git a/GUI/src/components/FormElements/FormSelect/FormSelect.scss b/GUI/src/components/FormElements/FormSelect/FormSelect.scss index b6b4f434..6db2b3b7 100644 --- a/GUI/src/components/FormElements/FormSelect/FormSelect.scss +++ b/GUI/src/components/FormElements/FormSelect/FormSelect.scss @@ -124,5 +124,13 @@ &:focus { background-color: get-color(black-coral-0); } + &--disabled { + color: get-color(black-coral-6); + cursor: not-allowed; + pointer-events: none; + background-color: get-color(white); + } + } + } diff --git a/GUI/src/components/FormElements/FormSelect/index.tsx b/GUI/src/components/FormElements/FormSelect/index.tsx index e1187a49..e1f4bd42 100644 --- a/GUI/src/components/FormElements/FormSelect/index.tsx +++ b/GUI/src/components/FormElements/FormSelect/index.tsx @@ -18,6 +18,7 @@ import { ControllerRenderProps } from 'react-hook-form'; type FormSelectOption = { label: string; value: string | { name: string; id: string }; + disabled?: boolean; }; type FormSelectProps = Partial & @@ -130,9 +131,10 @@ const FormSelect = forwardRef(
  • {item.label}
  • diff --git a/GUI/src/components/FormElements/FormTextarea/FormTextarea.scss b/GUI/src/components/FormElements/FormTextarea/FormTextarea.scss index 51750b6d..17e45330 100644 --- a/GUI/src/components/FormElements/FormTextarea/FormTextarea.scss +++ b/GUI/src/components/FormElements/FormTextarea/FormTextarea.scss @@ -95,7 +95,11 @@ } } - &--disabled & { + &--disabled { + textarea { + cursor: not-allowed; + resize: none; + } input { background-color: get-color(black-coral-0); } diff --git a/GUI/src/components/FormElements/FormTextarea/index.tsx b/GUI/src/components/FormElements/FormTextarea/index.tsx index b1f23fe1..55ea5cd1 100644 --- a/GUI/src/components/FormElements/FormTextarea/index.tsx +++ b/GUI/src/components/FormElements/FormTextarea/index.tsx @@ -67,6 +67,7 @@ const FormTextarea = forwardRef(( defaultValue={defaultValue} className={textareaAutosizeClasses} aria-label={hideLabel ? label : undefined} + disabled={disabled} onChange={(e) => { if (onChange) onChange(e); handleOnChange(e); diff --git a/GUI/src/components/molecules/DataModelForm/index.tsx b/GUI/src/components/molecules/DataModelForm/index.tsx index c7e6b324..abf7263b 100644 --- a/GUI/src/components/molecules/DataModelForm/index.tsx +++ b/GUI/src/components/molecules/DataModelForm/index.tsx @@ -62,6 +62,11 @@ const DataModelForm: FC = ({ error={errors?.modelName} /> + {dataModel.modelName && dataModel.modelName.length > 256 && ( +
    + {t('dataModels.dataModelForm.errors.modelNameLength')} +
    + )}
    {t('dataModels.dataModelForm.modelVersion')}{' '} diff --git a/GUI/src/pages/DataModels/CreateDataModel.tsx b/GUI/src/pages/DataModels/CreateDataModel.tsx index 02e88a55..5e3a8d30 100644 --- a/GUI/src/pages/DataModels/CreateDataModel.tsx +++ b/GUI/src/pages/DataModels/CreateDataModel.tsx @@ -73,7 +73,7 @@ const CreateDataModel: FC = () => { open({ title: t('dataModels.createDataModel.successTitle'), content: t('dataModels.createDataModel.successDesc'), - footer: (
    ) + footer: (
    ) }); }, @@ -110,6 +110,7 @@ const CreateDataModel: FC = () => { const isCreateDisabled = () => { return ( !dataModel.modelName || + dataModel.modelName.length > 256 || !dataModel.datasetId || !dataModel.baseModels || (Array.isArray(dataModel.baseModels) && dataModel.baseModels.length === 0) || diff --git a/GUI/src/pages/Datasets/index.tsx b/GUI/src/pages/Datasets/index.tsx index e7ed28a8..213c93a6 100644 --- a/GUI/src/pages/Datasets/index.tsx +++ b/GUI/src/pages/Datasets/index.tsx @@ -22,8 +22,8 @@ const Datasets: FC = () => { const [searchTerm, setSearchTerm] = useState('all'); const { data: datasets, isLoading } = useQuery({ - queryKey: datasetQueryKeys.DATASET_OVERVIEW(pageIndex, sortOption), - queryFn: () => getDatasetsOverview(pageIndex, sortOption), + queryKey: datasetQueryKeys.DATASET_OVERVIEW(pageIndex, sortOption, searchTerm), + queryFn: () => getDatasetsOverview(pageIndex, sortOption, searchTerm), }); const pageCount = datasets?.[0]?.totalPages ?? 1; diff --git a/GUI/src/pages/TestModel/index.tsx b/GUI/src/pages/TestModel/index.tsx index 77fd2766..fb08882c 100644 --- a/GUI/src/pages/TestModel/index.tsx +++ b/GUI/src/pages/TestModel/index.tsx @@ -1,3 +1,4 @@ +/* eslint-disable @typescript-eslint/no-unused-expressions */ import { useMutation, useQuery } from '@tanstack/react-query'; import { Button, FormSelect, FormTextarea } from 'components'; import CircularSpinner from 'components/molecules/CircularSpinner/CircularSpinner'; @@ -91,6 +92,10 @@ const TestModel: FC = () => { const processedResults = classificationResult ? processClassificationResult(classificationResult) : []; + const selectOptions = modelVersions?.length > 0 + ? toLabelValueArray(modelVersions, 'id', 'version') ?? [] + : [{ label: t('testModels.noModels') ?? 'No models available', value: '', disabled: true }]; + return (
    @@ -108,15 +113,17 @@ const TestModel: FC = () => { { - handleChange('modelId', selection?.value as string); - setIsClassifyEnabled(false); + if (selection && !selection.disabled) { + handleChange('modelId', selection?.value as string); + setIsClassifyEnabled(false); + } }} value={testModel?.modelId === null ? t('testModels.errors.modelNotExist') : undefined} defaultValue={testModel?.modelId ?? undefined} /> -
    {modelLoadingStatus}
    @@ -131,6 +138,7 @@ const TestModel: FC = () => { maxLength={1000} onChange={(e) => handleChange('text', e.target.value)} showMaxLength={true} + disabled={!isClassifyEnabled} />
    diff --git a/GUI/src/pages/ViewDataset/index.tsx b/GUI/src/pages/ViewDataset/index.tsx index e8400132..131e9651 100644 --- a/GUI/src/pages/ViewDataset/index.tsx +++ b/GUI/src/pages/ViewDataset/index.tsx @@ -456,7 +456,7 @@ const ViewDataset = () => { {datasetIsLoading && } {!datasetIsLoading && ( []} pagination={pagination} rowSelection={rowSelection} diff --git a/GUI/src/services/datasets.ts b/GUI/src/services/datasets.ts index 1e4d668b..d2f519b9 100644 --- a/GUI/src/services/datasets.ts +++ b/GUI/src/services/datasets.ts @@ -4,7 +4,8 @@ import { DATASET_PAGE_SIZE, OVERVIEW_PAGE_SIZE } from 'utils/constants'; export async function getDatasetsOverview( pageNum: number, - sort: string + sort: string, + searchTerm: string = 'all' ) { const { data } = await apiDev.get(datasetsEndpoints.GET_OVERVIEW(), { params: { @@ -13,6 +14,7 @@ export async function getDatasetsOverview( sortBy: sort?.split(" ")?.[0], sortType: sort?.split(" ")?.[1], pageSize: OVERVIEW_PAGE_SIZE, + datasetName: searchTerm, }, }); return data?.response ?? []; diff --git a/GUI/src/utils/commonUtilts.ts b/GUI/src/utils/commonUtilts.ts index f720386f..b436c1b9 100644 --- a/GUI/src/utils/commonUtilts.ts +++ b/GUI/src/utils/commonUtilts.ts @@ -16,17 +16,20 @@ export const formattedArray = (data: string[]|undefined): FormattedOption[]|unde }; export const toLabelValueArray = ( - data: T[] | undefined, + data: T[] | undefined | null, valueField: keyof T, labelField: keyof T -): { label: string; value: string }[] | undefined => { - return data?.map((item) => ({ - label: String(item[labelField]), - value: String(item[valueField]), +): { label: string; value: string }[] => { + if (!Array.isArray(data)) { + console.warn('toLabelValueArray: Expected array, got', typeof data, data); + return []; + } + return data.map((item) => ({ + label: String(item[labelField] ?? ''), + value: String(item[valueField] ?? ''), })); }; - export const convertTimestampToDateTime = (timestamp: number) => { return moment.unix(timestamp).format('YYYY-MM-DD HH:mm:ss'); }; diff --git a/GUI/src/utils/queryKeys.ts b/GUI/src/utils/queryKeys.ts index 27affe50..b2a6feb6 100644 --- a/GUI/src/utils/queryKeys.ts +++ b/GUI/src/utils/queryKeys.ts @@ -29,13 +29,15 @@ export const datasetQueryKeys = { DATASET_OVERVIEW: function ( pageIndex?: number, generationStatus?: string, - sort?: string + sort?: string, + searchTerm?: string ) { return [ 'datasets/overview', pageIndex, generationStatus, sort, + searchTerm, ].filter((val) => val !== undefined); }, GET_META_DATA: function (datasetId?: number|string) { diff --git a/GUI/translations/en/common.json b/GUI/translations/en/common.json index c9c92a31..470beb1c 100644 --- a/GUI/translations/en/common.json +++ b/GUI/translations/en/common.json @@ -327,7 +327,7 @@ "title": "Data Generation Sessions", "inprogress": "Data Generation in-Progress", "fail": "Data Generation failed because {{class}} class found in the {{column}} column does not exist in hierarchy", - "noSessions": "No ongoing Data Generation sessions available" + "noSessions": "No ongoing data generation sessions available" }, "correctedTexts": { "title": "Corrected Texts", @@ -447,7 +447,8 @@ "baseModels": "Select Base Models", "deploymentPlatform": "Select Deployment Environment", "errors": { - "datasetVersionNotExist": "Dataset version does not exist" + "datasetVersionNotExist": "Dataset version does not exist", + "modelNameLength": "Model name must be less than 256 characters" } } }, @@ -455,13 +456,14 @@ "title": "Training Sessions", "inprogress": "Validation in-Progress", "fail": "Validation failed because {{class}} class found in the {{column}} column does not exist in hierarchy", - "noSessions": "No Active Training Sessions", + "noSessions": "No active training sessions", "noSessionsDesc": "There are currently no active training sessions. Once you start a training session, it will appear here. In the meantime, you can initiate a new training session to begin improving your models." }, "testModels": { "title": "Test Model", "selectionLabel": "Model", "placeholder": "Choose model", + "noModels": "No models available", "classifyTextLabel": "Enter Text", "classify": "Classify", "predictedHierarchy": "Predicted Class Hierarchy : ", diff --git a/GUI/translations/et/common.json b/GUI/translations/et/common.json index 924845fb..ee25fbb3 100644 --- a/GUI/translations/et/common.json +++ b/GUI/translations/et/common.json @@ -447,7 +447,10 @@ "datasetGroup": "Vali andmestiku grupp", "baseModels": "Vali baasmudelid", "deploymentPlatform": "Vali rakenduse platvorm", - "maturityLabel": "Vali valmiduse silt" + "maturityLabel": "Vali valmiduse silt", + "errors": { + "modelNameLength": "Mudeli nimi peab olema vΓ€hem kui 256 tΓ€hemΓ€rki" + } } }, "trainingSessions": { @@ -461,6 +464,7 @@ "title": "Testige mudelit", "selectionLabel": "Mudel", "placeholder": "Valige mudel", + "noModels": "Mudeleid pole saadaval", "classifyTextLabel": "Sisestage tekst", "classify": "Klassifitseeri", "predictedHierarchy": "Prognoositud klassihierarhia: ", diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index a1739783..4cf20111 100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -173,6 +173,7 @@ services: - ./grafana-configs/loki_logger.py:/app/src/training/loki_logger.py - ./constants.ini:/app/inference_scripts/constants.ini - cron_data:/app/data + - ./src/scripts:/app/src/scripts runtime: nvidia environment: - NVIDIA_VISIBLE_DEVICES=all @@ -383,29 +384,6 @@ services: timeout: 3s retries: 3 - dataset-gen-ollama: - image: synthesisai/dataset-generator-ollama:latest - container_name: dataset-gen-ollama - ports: - - "11434:11434" - environment: - - NVIDIA_VISIBLE_DEVICES=all - - OLLAMA_USE_GPU=1 - - OLLAMA_HOST=0.0.0.0 - volumes: - - dataset_gen_ollama_models:/root/.ollama - - ./DSL/DatasetGenerator/ollama-entrypoint.sh:/ollama-entrypoint.sh - entrypoint: ["bash", "/ollama-entrypoint.sh"] - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - networks: - - bykstack - #temporarary container to initialize S3 storage with necessary buckets and models init-storage: diff --git a/src/s3_dataset_processor/constants.py b/src/s3_dataset_processor/constants.py index cfa2b978..09062281 100644 --- a/src/s3_dataset_processor/constants.py +++ b/src/s3_dataset_processor/constants.py @@ -13,3 +13,4 @@ SYNCED_WITH_CKB = "Synced_with_CKB" SYNC_WITH_CKB_FAILED = "Sync_with_CKB_Failed" OUTPUT_DATA_DIR = "/app/output_datasets" +DATA_DIRECTORY = "/app/data" diff --git a/src/s3_dataset_processor/dataset_generation_callback_processor.py b/src/s3_dataset_processor/dataset_generation_callback_processor.py index f8624e4c..2c9441a4 100644 --- a/src/s3_dataset_processor/dataset_generation_callback_processor.py +++ b/src/s3_dataset_processor/dataset_generation_callback_processor.py @@ -12,6 +12,7 @@ import requests import traceback import os +import shutil import pandas as pd from constants import ( DATASET_UPDATE_URL, @@ -22,6 +23,7 @@ SYNC_WITH_CKB_FAILED, SCRIPT_DIR, PROGRESS_UPDATE_URL, + DATA_DIRECTORY, ) # --- Logging Setup --- @@ -189,6 +191,53 @@ def send_status_update(dataset_id: int, encoded_results: str) -> None: traceback.print_exc() +def cleanup_temporary_files() -> None: + """Clean up all temporary files and directories after successful S3 upload.""" + cleanup_summary = [] + + try: + # Clean up /app/data directory (downloaded and extracted source datasets) + data_dir = DATA_DIRECTORY + if os.path.exists(data_dir): + _cleanup_directory_contents(data_dir, cleanup_summary) + + # Clean up ENTIRE /app/output_datasets directory (all generated files) + output_dir = OUTPUT_DATA_DIR + if os.path.exists(output_dir): + _cleanup_directory_contents(output_dir, cleanup_summary) + + # Log cleanup results + _log_cleanup_results(cleanup_summary) + + except Exception as e: + logger.error(f"Error during cleanup: {e}") + + +def _cleanup_directory_contents(directory: str, cleanup_summary: list) -> None: + """Clean up all contents of a directory while preserving the directory itself.""" + for item in os.listdir(directory): + item_path = os.path.join(directory, item) + try: + if os.path.isfile(item_path): + os.remove(item_path) + cleanup_summary.append(f"Removed file: {item_path}") + elif os.path.isdir(item_path): + shutil.rmtree(item_path) + cleanup_summary.append(f"Removed directory: {item_path}") + except Exception as e: + logger.warning(f"Failed to remove {item_path}: {e}") + + +def _log_cleanup_results(cleanup_summary: list) -> None: + """Log the results of the cleanup operation.""" + if cleanup_summary: + logger.info("Cleanup completed successfully:") + for item in cleanup_summary: + logger.info(f" - {item}") + else: + logger.info("No temporary files found to clean up") + + def process_callback_background( file_path: str, encoded_results: str, session_id: int ) -> None: @@ -256,6 +305,10 @@ def process_callback_background( send_status_update(dataset_id, encoded_results) logger.info("Processing completed successfully") + + # Clean up temporary files before final notification + cleanup_temporary_files() + notify_progress_uploading_to_s3(session_id) except Exception as e: diff --git a/src/scripts/constants.py b/src/scripts/constants.py new file mode 100644 index 00000000..a9f31b1b --- /dev/null +++ b/src/scripts/constants.py @@ -0,0 +1,8 @@ +DATA_URL_INSERT_URL = "http://resql:8082/global-classifier/insert-agency-presigned-url" +DATA_URL_UPDATE_URL = "http://resql:8082/global-classifier/update-agency-presigned-url" +MINIO_ENDPOINT = "http://minio:9000" +MINIO_USER_ID = "minioadmin" +MINIO_USER_KEY = "minioadmin" +REGION_NAME = "us-east-1" +SIGNATURE_VERSION = "s3v4" +BUCKET_NAME = "ckb" diff --git a/src/scripts/generate_signed_urls.py b/src/scripts/generate_signed_urls.py new file mode 100644 index 00000000..a8878b79 --- /dev/null +++ b/src/scripts/generate_signed_urls.py @@ -0,0 +1,185 @@ +import boto3 +from botocore.client import Config +import sys +import json +import urllib.parse +import requests +from typing import List, Dict +from src.scripts.constants import ( + DATA_URL_INSERT_URL, + DATA_URL_UPDATE_URL, + MINIO_ENDPOINT, + MINIO_USER_ID, + MINIO_USER_KEY, + REGION_NAME, + SIGNATURE_VERSION, + BUCKET_NAME, +) + + +def upsert_agency_to_database(agency_id: str, agency_name: str, data_url: str) -> bool: + """ + Upsert agency data to mock_ckb table via Resql endpoint + Try INSERT first, if it fails with conflict, then UPDATE + """ + try: + agency_data_hash = f"{agency_name}_hash" + + payload = { + "agencyId": agency_id, + "agencyDataHash": agency_data_hash, + "dataUrl": data_url, + } + + # Try INSERT first + insert_url = DATA_URL_INSERT_URL + response = requests.post(insert_url, json=payload, timeout=30) + + if response.status_code == 200: + print(f"Successfully inserted new agency {agency_id} to database") + return True + elif response.status_code == 400 and "duplicate key" in response.text.lower(): + # If INSERT fails due to duplicate key, try UPDATE + print(f"Agency {agency_id} exists, updating...") + + update_url = DATA_URL_UPDATE_URL + update_response = requests.post(update_url, json=payload, timeout=30) + + if update_response.status_code == 200: + print(f"Successfully updated agency {agency_id} in database") + return True + else: + print( + f"Failed to update agency {agency_id}: HTTP {update_response.status_code}" + ) + print(f"Response: {update_response.text}") + return False + else: + print(f"Failed to insert agency {agency_id}: HTTP {response.status_code}") + print(f"Response: {response.text}") + return False + + except Exception as e: + print(f"Error upserting agency {agency_id} to database: {e}") + return False + + +def main(): + print("Python script started...") + + # Check if agencies data is provided as command line argument + if len(sys.argv) < 2: + print('Usage: python generate_signed_urls.py ""') + print( + "Expected format: URL-encoded JSON array with agencyId and agencyName fields" + ) + sys.exit(1) + + try: + # Decode the URL-encoded string first + encoded_agencies = sys.argv[1] + decoded_agencies_str = urllib.parse.unquote(encoded_agencies) + + # Parse JSON + agencies = json.loads(decoded_agencies_str) + print(f"Processing {len(agencies)} agencies") + + except json.JSONDecodeError as e: + print(f"Error: Failed to parse agencies JSON: {e}") + sys.exit(1) + except Exception as e: + print(f"Error during parsing: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + try: + s3_client = boto3.client( + "s3", + endpoint_url=MINIO_ENDPOINT, + aws_access_key_id=MINIO_USER_ID, + aws_secret_access_key=MINIO_USER_KEY, + config=Config(signature_version=SIGNATURE_VERSION), + region_name=REGION_NAME, + ) + except Exception as e: + print(f"Error creating S3 client: {e}") + sys.exit(1) + + # Build list of files to process from agencies + files_to_process: List[Dict[str, str]] = [] + for agency in agencies: + agency_name = agency.get("agencyName") + agency_id = agency.get("agencyId") + + if agency_name: + files_to_process.append( + { + "bucket": BUCKET_NAME, + "key": f"{agency_name}/{agency_name}.zip", + "agencyId": agency_id, + } + ) + else: + print(f"Warning: Agency missing agencyName: {agency}") + + if not files_to_process: + print("Error: No valid agencies found to process") + sys.exit(1) + + # Generate presigned URLs + presigned_urls: List[str] = [] + successful_agencies: List[Dict] = [] + + print("Generating presigned URLs...") + for file_info in files_to_process: + try: + url = s3_client.generate_presigned_url( + ClientMethod="get_object", + Params={"Bucket": file_info["bucket"], "Key": file_info["key"]}, + ExpiresIn=24 * 3600, # 24 hours in seconds + ) + presigned_urls.append(url) + successful_agencies.append( + { + "agency_id": file_info.get("agencyId"), + "agency_name": file_info["key"].split("/")[ + 0 + ], # Extract agency name from key + "data_url": url, + } + ) + except Exception as e: + print(f"Failed to generate URL for: {file_info['key']}") + print(f" Error: {str(e)}") + + print(f"Generated {len(presigned_urls)} URLs successfully") + + # Upsert agencies to database + if successful_agencies: + print("Storing agencies in database...") + db_success_count = 0 + + for agency_data in successful_agencies: + success = upsert_agency_to_database( + agency_data["agency_id"], + agency_data["agency_name"], + agency_data["data_url"], + ) + if success: + db_success_count += 1 + + print( + f"Successfully stored {db_success_count}/{len(successful_agencies)} agencies in database" + ) + + # Check if any URLs were generated + if not presigned_urls: + print("No URLs were generated successfully") + sys.exit(1) + + print("Presigned URL generation completed successfully") + + +if __name__ == "__main__": + main()