diff --git a/.env b/.env index 219fbae3..a30f1b57 100644 --- a/.env +++ b/.env @@ -2,6 +2,6 @@ AWS_ACCESS_KEY_ID=your_aws_access_key_id AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key BEDROCK_AWS_REGION=eu-west-1 AZURE_OPENAI_API_KEY=your_azure_openai_api_key -AZURE_OPENAI_ENDPOINT=your_azure_apenai_endpoint +AZURE_OPENAI_ENDPOINT=your_azure_openai_endpoint AZURE_OPENAI_DEPLOYMENT_NAME=gpt-4o PROVIDER_NAME=azure-openai \ No newline at end of file diff --git a/README.md b/README.md index 546497ac..21928d36 100644 --- a/README.md +++ b/README.md @@ -33,13 +33,38 @@ Windows: - Clone [TIM](https://github.com/buerokratt/TIM) - Navigate to TIM and build the image using the command `docker build -t tim .` - Clone [Authentication Layer](https://github.com/buerokratt/Authentication-layer) -- Go to public/env-config.js and update the RUUTER_API_URL to 'http://localhost:8086/classifier' +- Go to public/env-config.js and update the RUUTER_API_URL to 'http://localhost:8086/global-classifier' - Navigate to Authentication Layer, checkout to the `dev` branch and build the image using the command `docker build -f Dockerfile.dev -t authentication-layer .` - Clone [S3 Ferry](https://github.com/buerokratt/S3-Ferry) - Navigate to S3-Ferry and build the image using the command `docker build -t s3-ferry .` - Clone [Cron Manager](https://github.com/buerokratt/CronManager) - Navigate to Cron Manager `dev` branch and build the cron-manager-python image using the command `docker build -f Dockerfile.python -t cron-manager-python .` +- Clone [Dataset Generator](https://github.com/buerokratt/Dataset-Generator) +- Navigate to Dataset Generator `dev` branch and build the synthesisai/dataset-generator image using the command `docker compose build` +## Using LLMs for data generation + +Currently, three providers are available in the Global Classifier for dataset generation: +- Bedrock Anthropic (`bedrock-anthropic`) +- Azure OpenAI (`azure-openai`) +- Ollama (`ollama`) + +To select a provider, navigate to 
DSL\DatasetGenerator\config\config.yaml + +1. Change the provider name in the block below. The Dataset Generator will use the selected provider for generation. + ```yaml +provider: + name: "azure-openai" # THIS DETERMINES WHICH PROVIDER TO USE + timeout: 60 + max_retries: 3 + retry_delay: 5 +``` +2. Change `PROVIDER_NAME` in the `.env` file as well. + +## Data Migration + +To access the GUI, the data migration script must be executed first. It adds the initial configuration of the system. +Run the `migrate.sh` script; it creates the initial user with the test Smart ID `EE30303039914`, and the GUI can then be accessed by logging in with that Smart ID. ## Contributing diff --git a/migrate.sh b/migrate.sh index e6927aa3..95417f16 100755 --- a/migrate.sh +++ b/migrate.sh @@ -12,4 +12,4 @@ INI_FILE="constants.ini" DB_PASSWORD=$(get_ini_value "$INI_FILE" "DB_PASSWORD") -docker run --rm --network bykstack -v `pwd`/DSL/Liquibase/changelog:/liquibase/changelog -v `pwd`/DSL/Liquibase/master.yml:/liquibase/master.yml -v `pwd`/DSL/Liquibase/data:/liquibase/data liquibase/liquibase --defaultsFile=/liquibase/changelog/liquibase.properties --changelog-file=master.yml --url=jdbc:postgresql://users_db:5432/global-classifier?user=postgres --password=dbadmin update +docker run --rm --network bykstack -v `pwd`/DSL/Liquibase/changelog:/liquibase/changelog -v `pwd`/DSL/Liquibase/master.yml:/liquibase/master.yml -v `pwd`/DSL/Liquibase/data:/liquibase/data liquibase/liquibase:4.33 --defaultsFile=/liquibase/changelog/liquibase.properties --changelog-file=master.yml --url=jdbc:postgresql://users_db:5432/global-classifier?user=postgres --password=dbadmin update diff --git a/src/model-training/create_triton_configs.py b/src/model-training/create_triton_configs.py index 6e301f96..2c4168cc 100644 --- a/src/model-training/create_triton_configs.py +++ b/src/model-training/create_triton_configs.py @@ -179,22 +179,19 @@ def generate_preprocessing_config( name: "attention_mask" data_type: TYPE_INT64 
dims: [ -1 ] - }}, - """ + }}""" + if supports_token_type_ids: - config += """{{ + config += """, + { name: "token_type_ids" data_type: TYPE_INT64 dims: [ -1 ] - }}""" - - # Note: No training flag needed for current SNGP implementation + }""" config += f""" ] - - parameters [ {{ key: "model_name" @@ -235,17 +232,6 @@ def generate_text_classifier_config( ) -> str: """ Generate Triton text classifier config based on model type. - - Args: - model_name: Name of the text classifier model (e.g., "text_classifier") - model_type: Type of model ("distilbert", "bert", "xlm-roberta", "roberta") - num_labels: Number of output labels/classes - sequence_length: Maximum sequence length for the model - max_batch_size: Maximum batch size for inference - ood_method: OOD method ("sngp", "energy", "softmax", or None) - - Returns: - str: Complete Triton text classifier config as string """ # Define which models support token_type_ids @@ -268,16 +254,13 @@ def generate_text_classifier_config( dims: [ -1 ] }}""" - # Add token_type_ids input if supported if supports_token_type_ids: config += """, - {{ + { name: "token_type_ids" data_type: TYPE_INT64 dims: [ -1 ] - }}""" - - # Note: No training flag needed for current SNGP implementation + }""" config += f""" ] @@ -287,12 +270,9 @@ def generate_text_classifier_config( name: "logits" data_type: TYPE_FP32 dims: [ {num_labels} ] - }}""" - - config += """ + }} ] - dynamic_batching {{ max_queue_delay_microseconds: 100 }}""" diff --git a/src/model-training/model_trainer.py b/src/model-training/model_trainer.py index 2a590a82..af20a95d 100644 --- a/src/model-training/model_trainer.py +++ b/src/model-training/model_trainer.py @@ -439,6 +439,7 @@ def train(self): logger.info(f"BEST MODEL SELECTED: {best_variant['name']}") logger.info(f"BEST COMBINED SCORE: {best_result['combined_score']:.4f}") logger.info(f"BEST MODEL TYPE: {best_variant['type']}") + logger.info(f"BEST MODEL_BASE: {best_variant['base_model']}") # Save training summary 
training_summary = { diff --git a/src/model-training/trainingpipeline.py b/src/model-training/trainingpipeline.py index 527a5626..d3a909be 100644 --- a/src/model-training/trainingpipeline.py +++ b/src/model-training/trainingpipeline.py @@ -1146,13 +1146,13 @@ def train(self): # save labelmappings, ood config to config.json config = { "num_labels": len(label_encoder.classes_), - "model_name": self.full_name, + "model_name": MODEL_CONFIGS[self.model_name]["model_name"], "hidden_dim": model.hidden_dim, "dropout_rate": model.dropout_rate, "sequence_length": SEQUENCE_LENGTH, "ood_method": self.ood_method, "ood_config": self.ood_config, - "base_model_name": self.model_name, + "base_model_name": MODEL_CONFIGS[self.model_name]["tokenizer_name"], "base_model_type": model.base_model.__class__.__name__, "model_label2id": self.model_label2id, "model_id2label": self.model_id2label,