From 51eb091ca7c5ba73f7c4ecb5d986b11eb1044aba Mon Sep 17 00:00:00 2001 From: erangi-ar <111747955+erangi-ar@users.noreply.github.com> Date: Thu, 30 Oct 2025 13:35:18 +0530 Subject: [PATCH 1/6] Modify README with new Dataset Generator details Updated RUUTER_API_URL and added Dataset Generator instructions. --- README.md | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 546497ac..ac03915c 100644 --- a/README.md +++ b/README.md @@ -33,13 +33,31 @@ Windows: - Clone [TIM](https://github.com/buerokratt/TIM) - Navigate to TIM and build the image using the command `docker build -t tim .` - Clone [Authentication Layer](https://github.com/buerokratt/Authentication-layer) -- Go to public/env-config.js and update the RUUTER_API_URL to 'http://localhost:8086/classifier' +- Go to public/env-config.js and update the RUUTER_API_URL to 'http://localhost:8086/global-classifier' - Navigate to Authentication Layer, checkout to the `dev` branch and build the image using the command `docker build -f Dockerfile.dev -t authentication-layer .` - Clone [S3 Ferry](https://github.com/buerokratt/S3-Ferry) - Navigate to S3-Ferry and build the image using the command `docker build -t s3-ferry .` - Clone [Cron Manager](https://github.com/buerokratt/CronManager) - Navigate to Cron Manager `dev` branch and build the cron-manager-python image using the command `docker build -f Dockerfile.python -t cron-manager-python .` - +- Clone [Dataset Generator](https://github.com/buerokratt/Dataset-Generator) +- Navigate to Dataset Generator `dev` branch and build the synthesisai/dataset-generator image using the command `docker compose build` + +## Using LLMs for data generation + +Currently 3 providers available in Global classifier for dataset generation +- Bedrock Anthropic +- Azure Openai +- Ollama + +To select a provider, navigate to DSL\DatasetGenerator\config\config.yaml +Change the provider name in the below block. 
Dataset generator will use the selected provider for the generation. + ```yaml +provider: + name: "azure-openai" # THIS DETERMINES WHICH PROVIDER TO USE + timeout: 60 + max_retries: 3 + retry_delay: 5 +``` ## Contributing From 40685682267cef3b4a41ec0ce38ff8e83b73dbd2 Mon Sep 17 00:00:00 2001 From: erangi-ar <111747955+erangi-ar@users.noreply.github.com> Date: Thu, 30 Oct 2025 13:39:41 +0530 Subject: [PATCH 2/6] Clarify provider selection instructions in README Updated instructions for selecting a provider in the dataset generator configuration. --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ac03915c..cdb93db8 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,8 @@ Currently 3 providers available in Global classifier for dataset generation - Ollama To select a provider, navigate to DSL\DatasetGenerator\config\config.yaml -Change the provider name in the below block. Dataset generator will use the selected provider for the generation. + +1.Change the provider name in the below block. Dataset generator will use the selected provider for the generation. ```yaml provider: name: "azure-openai" # THIS DETERMINES WHICH PROVIDER TO USE @@ -58,7 +59,7 @@ provider: max_retries: 3 retry_delay: 5 ``` - +2.Change the `PROVIDER_NAME` in .env file as well ## Contributing From 5ae8ef335e7cf2681be26e308d5f347f41b8c5b3 Mon Sep 17 00:00:00 2001 From: erangi-ar <111747955+erangi-ar@users.noreply.github.com> Date: Thu, 30 Oct 2025 13:52:05 +0530 Subject: [PATCH 3/6] Add data migration instructions to README Added instructions for data migration and initial configurations. --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index cdb93db8..d28adde9 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,12 @@ provider: ``` 2.Change the `PROVIDER_NAME` in .env file as well +## Data Migration + +In order to access the GUI, data migration script should be executed. 
It will add the initial configurations of the system +run `migrate.sh` file and it will create the initial user with test Smart ID `EE30303039914` and the GUI can be accessed by logging in with the added Smart ID + + ## Contributing This section outlines the guidelines for contributing to the Global Classifier project. Please read through these before submitting any changes. From b6a860a32bd323050e3654f11f7a4508e63ec9d0 Mon Sep 17 00:00:00 2001 From: erangi-ar <111747955+erangi-ar@users.noreply.github.com> Date: Thu, 30 Oct 2025 14:59:48 +0530 Subject: [PATCH 4/6] Clarify dataset generation providers in README Added provider identifiers for clarity in the README. --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d28adde9..21928d36 100644 --- a/README.md +++ b/README.md @@ -45,9 +45,9 @@ Windows: ## Using LLMs for data generation Currently 3 providers available in Global classifier for dataset generation -- Bedrock Anthropic -- Azure Openai -- Ollama +- Bedrock Anthropic(bedrock-anthropic) +- Azure Openai(azure-openai) +- Ollama(ollama) To select a provider, navigate to DSL\DatasetGenerator\config\config.yaml From f7db94877f1a205c43305677813c00971fe12352 Mon Sep 17 00:00:00 2001 From: nuwangeek Date: Mon, 3 Nov 2025 18:04:07 +0530 Subject: [PATCH 5/6] fixed issue in trton configs and trainingpipeline --- .env | 2 +- migrate.sh | 2 +- src/model-training/create_triton_configs.py | 36 +++++---------------- src/model-training/model_trainer.py | 1 + src/model-training/trainingpipeline.py | 4 +-- 5 files changed, 13 insertions(+), 32 deletions(-) diff --git a/.env b/.env index 219fbae3..a30f1b57 100644 --- a/.env +++ b/.env @@ -2,6 +2,6 @@ AWS_ACCESS_KEY_ID=your_aws_access_key_id AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key BEDROCK_AWS_REGION=eu-west-1 AZURE_OPENAI_API_KEY=your_azure_openai_api_key -AZURE_OPENAI_ENDPOINT=your_azure_apenai_endpoint +AZURE_OPENAI_ENDPOINT=your_azure_openai_endpoint 
AZURE_OPENAI_DEPLOYMENT_NAME=gpt-4o PROVIDER_NAME=azure-openai \ No newline at end of file diff --git a/migrate.sh b/migrate.sh index e6927aa3..95417f16 100755 --- a/migrate.sh +++ b/migrate.sh @@ -12,4 +12,4 @@ INI_FILE="constants.ini" DB_PASSWORD=$(get_ini_value "$INI_FILE" "DB_PASSWORD") -docker run --rm --network bykstack -v `pwd`/DSL/Liquibase/changelog:/liquibase/changelog -v `pwd`/DSL/Liquibase/master.yml:/liquibase/master.yml -v `pwd`/DSL/Liquibase/data:/liquibase/data liquibase/liquibase --defaultsFile=/liquibase/changelog/liquibase.properties --changelog-file=master.yml --url=jdbc:postgresql://users_db:5432/global-classifier?user=postgres --password=dbadmin update +docker run --rm --network bykstack -v `pwd`/DSL/Liquibase/changelog:/liquibase/changelog -v `pwd`/DSL/Liquibase/master.yml:/liquibase/master.yml -v `pwd`/DSL/Liquibase/data:/liquibase/data liquibase/liquibase:4.33 --defaultsFile=/liquibase/changelog/liquibase.properties --changelog-file=master.yml --url=jdbc:postgresql://users_db:5432/global-classifier?user=postgres --password=dbadmin update diff --git a/src/model-training/create_triton_configs.py b/src/model-training/create_triton_configs.py index 6e301f96..cbdb61a4 100644 --- a/src/model-training/create_triton_configs.py +++ b/src/model-training/create_triton_configs.py @@ -179,22 +179,19 @@ def generate_preprocessing_config( name: "attention_mask" data_type: TYPE_INT64 dims: [ -1 ] - }}, - """ + }}""" + if supports_token_type_ids: - config += """{{ + config += """, + { name: "token_type_ids" data_type: TYPE_INT64 dims: [ -1 ] - }}""" - - # Note: No training flag needed for current SNGP implementation + }""" config += f""" ] - - parameters [ {{ key: "model_name" @@ -235,17 +232,6 @@ def generate_text_classifier_config( ) -> str: """ Generate Triton text classifier config based on model type. 
- - Args: - model_name: Name of the text classifier model (e.g., "text_classifier") - model_type: Type of model ("distilbert", "bert", "xlm-roberta", "roberta") - num_labels: Number of output labels/classes - sequence_length: Maximum sequence length for the model - max_batch_size: Maximum batch size for inference - ood_method: OOD method ("sngp", "energy", "softmax", or None) - - Returns: - str: Complete Triton text classifier config as string """ # Define which models support token_type_ids @@ -268,16 +254,13 @@ def generate_text_classifier_config( dims: [ -1 ] }}""" - # Add token_type_ids input if supported if supports_token_type_ids: config += """, - {{ + { name: "token_type_ids" data_type: TYPE_INT64 dims: [ -1 ] - }}""" - - # Note: No training flag needed for current SNGP implementation + }""" config += f""" ] @@ -287,12 +270,9 @@ def generate_text_classifier_config( name: "logits" data_type: TYPE_FP32 dims: [ {num_labels} ] - }}""" - - config += """ + }} ] - dynamic_batching {{ max_queue_delay_microseconds: 100 }}""" diff --git a/src/model-training/model_trainer.py b/src/model-training/model_trainer.py index 2a590a82..af20a95d 100644 --- a/src/model-training/model_trainer.py +++ b/src/model-training/model_trainer.py @@ -439,6 +439,7 @@ def train(self): logger.info(f"BEST MODEL SELECTED: {best_variant['name']}") logger.info(f"BEST COMBINED SCORE: {best_result['combined_score']:.4f}") logger.info(f"BEST MODEL TYPE: {best_variant['type']}") + logger.info(f"BEST MODEL_BASE: {best_variant['base_model']}") # Save training summary training_summary = { diff --git a/src/model-training/trainingpipeline.py b/src/model-training/trainingpipeline.py index 527a5626..d3a909be 100644 --- a/src/model-training/trainingpipeline.py +++ b/src/model-training/trainingpipeline.py @@ -1146,13 +1146,13 @@ def train(self): # save labelmappings, ood config to config.json config = { "num_labels": len(label_encoder.classes_), - "model_name": self.full_name, + "model_name": 
MODEL_CONFIGS[self.model_name]["model_name"],
         "hidden_dim": model.hidden_dim,
         "dropout_rate": model.dropout_rate,
         "sequence_length": SEQUENCE_LENGTH,
         "ood_method": self.ood_method,
         "ood_config": self.ood_config,
-        "base_model_name": self.model_name,
+        "base_model_name": MODEL_CONFIGS[self.model_name]["tokenizer_name"],
         "base_model_type": model.base_model.__class__.__name__,
         "model_label2id": self.model_label2id,
         "model_id2label": self.model_id2label,

From abbcc9feda2670e101ee428d8c6390d174068a35 Mon Sep 17 00:00:00 2001
From: nuwangeek
Date: Mon, 3 Nov 2025 18:09:37 +0530
Subject: [PATCH 6/6] fixed ruff formatting issue

---
 src/model-training/create_triton_configs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/model-training/create_triton_configs.py b/src/model-training/create_triton_configs.py
index cbdb61a4..2c4168cc 100644
--- a/src/model-training/create_triton_configs.py
+++ b/src/model-training/create_triton_configs.py
@@ -180,7 +180,7 @@ def generate_preprocessing_config(
     data_type: TYPE_INT64
     dims: [ -1 ]
     }}"""
-    
+
     if supports_token_type_ids:
         config += """,
     {