`.env` (2 changes: 1 addition & 1 deletion)
@@ -2,6 +2,6 @@ AWS_ACCESS_KEY_ID=your_aws_access_key_id
AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key
BEDROCK_AWS_REGION=eu-west-1
AZURE_OPENAI_API_KEY=your_azure_openai_api_key
AZURE_OPENAI_ENDPOINT=your_azure_apenai_endpoint
AZURE_OPENAI_ENDPOINT=your_azure_openai_endpoint
AZURE_OPENAI_DEPLOYMENT_NAME=gpt-4o
PROVIDER_NAME=azure-openai
`README.md` (27 changes: 26 additions & 1 deletion)
@@ -33,13 +33,38 @@ Windows:
- Clone [TIM](https://github.com/buerokratt/TIM)
- Navigate to TIM and build the image using the command `docker build -t tim .`
- Clone [Authentication Layer](https://github.com/buerokratt/Authentication-layer)
- Go to public/env-config.js and update the RUUTER_API_URL to 'http://localhost:8086/classifier'
- Go to public/env-config.js and update the RUUTER_API_URL to 'http://localhost:8086/global-classifier'
- Navigate to Authentication Layer, check out the `dev` branch, and build the image using the command `docker build -f Dockerfile.dev -t authentication-layer .`
- Clone [S3 Ferry](https://github.com/buerokratt/S3-Ferry)
- Navigate to S3-Ferry and build the image using the command `docker build -t s3-ferry .`
- Clone [Cron Manager](https://github.com/buerokratt/CronManager)
- Navigate to Cron Manager, check out the `dev` branch, and build the cron-manager-python image using the command `docker build -f Dockerfile.python -t cron-manager-python .`
- Clone [Dataset Generator](https://github.com/buerokratt/Dataset-Generator)
- Navigate to Dataset Generator, check out the `dev` branch, and build the synthesisai/dataset-generator image using the command `docker compose build` (a consolidated sketch of all the build steps follows this list)
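
A consolidated sketch of the build steps above, meant to be run from an empty workspace directory. The repository URLs and branch names are as listed; the ordering and directory layout are assumptions.

```bash
git clone https://github.com/buerokratt/TIM
docker build -t tim ./TIM

git clone -b dev https://github.com/buerokratt/Authentication-layer
# Point RUUTER_API_URL in Authentication-layer/public/env-config.js at
# 'http://localhost:8086/global-classifier' before building this image.
docker build -f Authentication-layer/Dockerfile.dev -t authentication-layer ./Authentication-layer

git clone https://github.com/buerokratt/S3-Ferry
docker build -t s3-ferry ./S3-Ferry

git clone -b dev https://github.com/buerokratt/CronManager
docker build -f CronManager/Dockerfile.python -t cron-manager-python ./CronManager

git clone -b dev https://github.com/buerokratt/Dataset-Generator
(cd Dataset-Generator && docker compose build)
```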

## Using LLMs for Data Generation

Three providers are currently available in the Global Classifier for dataset generation:
- Bedrock Anthropic (`bedrock-anthropic`)
- Azure OpenAI (`azure-openai`)
- Ollama (`ollama`)

To select a provider, navigate to `DSL/DatasetGenerator/config/config.yaml`.

1. Change the provider name in the block below; the dataset generator will use the selected provider for generation.
```yaml
provider:
  name: "azure-openai" # THIS DETERMINES WHICH PROVIDER TO USE
  timeout: 60
  max_retries: 3
  retry_delay: 5
```
2. Change `PROVIDER_NAME` in the `.env` file to match, as shown in the sketch below.
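
For illustration, a minimal sketch of pointing both settings at the Ollama provider from the shell, assuming `yq` (v4) and GNU `sed` are available; otherwise make the same two edits by hand. `"ollama"` is just an example value.

```bash
# Keep config.yaml and .env in agreement about the provider name.
yq -i '.provider.name = "ollama"' DSL/DatasetGenerator/config/config.yaml
sed -i 's/^PROVIDER_NAME=.*/PROVIDER_NAME=ollama/' .env
```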

## Data Migration

To access the GUI, the data migration script must be executed first, as it adds the system's initial configuration. Run `migrate.sh` to create the initial user with the test Smart ID `EE30303039914`; you can then log in to the GUI with that Smart ID.


## Contributing
`migrate.sh` (2 changes: 1 addition & 1 deletion)
@@ -12,4 +12,4 @@ INI_FILE="constants.ini"
DB_PASSWORD=$(get_ini_value "$INI_FILE" "DB_PASSWORD")


docker run --rm --network bykstack -v `pwd`/DSL/Liquibase/changelog:/liquibase/changelog -v `pwd`/DSL/Liquibase/master.yml:/liquibase/master.yml -v `pwd`/DSL/Liquibase/data:/liquibase/data liquibase/liquibase --defaultsFile=/liquibase/changelog/liquibase.properties --changelog-file=master.yml --url=jdbc:postgresql://users_db:5432/global-classifier?user=postgres --password=dbadmin update
docker run --rm --network bykstack -v `pwd`/DSL/Liquibase/changelog:/liquibase/changelog -v `pwd`/DSL/Liquibase/master.yml:/liquibase/master.yml -v `pwd`/DSL/Liquibase/data:/liquibase/data liquibase/liquibase:4.33 --defaultsFile=/liquibase/changelog/liquibase.properties --changelog-file=master.yml --url=jdbc:postgresql://users_db:5432/global-classifier?user=postgres --password=dbadmin update
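
The only change above is pinning the Liquibase image to `liquibase/liquibase:4.33`, presumably to shield the migration from breaking changes in the `latest` image. For readability, the same invocation line-wrapped, with `pwd` and the JDBC URL quoted so spaces and the `?` survive the shell:

```bash
docker run --rm --network bykstack \
  -v "$(pwd)"/DSL/Liquibase/changelog:/liquibase/changelog \
  -v "$(pwd)"/DSL/Liquibase/master.yml:/liquibase/master.yml \
  -v "$(pwd)"/DSL/Liquibase/data:/liquibase/data \
  liquibase/liquibase:4.33 \
  --defaultsFile=/liquibase/changelog/liquibase.properties \
  --changelog-file=master.yml \
  --url='jdbc:postgresql://users_db:5432/global-classifier?user=postgres' \
  --password=dbadmin \
  update
```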
`src/model-training/create_triton_configs.py` (36 changes: 8 additions & 28 deletions)
@@ -179,22 +179,19 @@ def generate_preprocessing_config(
name: "attention_mask"
data_type: TYPE_INT64
dims: [ -1 ]
}},
"""
}}"""

if supports_token_type_ids:
config += """{{
config += """,
{
name: "token_type_ids"
data_type: TYPE_INT64
dims: [ -1 ]
}}"""

# Note: No training flag needed for current SNGP implementation
}"""

config += f"""
]



parameters [
{{
key: "model_name"
@@ -235,17 +232,6 @@
) -> str:
"""
Generate Triton text classifier config based on model type.

Args:
model_name: Name of the text classifier model (e.g., "text_classifier")
model_type: Type of model ("distilbert", "bert", "xlm-roberta", "roberta")
num_labels: Number of output labels/classes
sequence_length: Maximum sequence length for the model
max_batch_size: Maximum batch size for inference
ood_method: OOD method ("sngp", "energy", "softmax", or None)

Returns:
str: Complete Triton text classifier config as string
"""

# Define which models support token_type_ids
@@ -268,16 +254,13 @@
dims: [ -1 ]
}}"""

# Add token_type_ids input if supported
if supports_token_type_ids:
config += """,
{{
{
name: "token_type_ids"
data_type: TYPE_INT64
dims: [ -1 ]
}}"""

# Note: No training flag needed for current SNGP implementation
}"""

config += f"""
]
@@ -287,12 +270,9 @@
name: "logits"
data_type: TYPE_FP32
dims: [ {num_labels} ]
}}"""

config += """
}}
]


dynamic_batching {{
max_queue_delay_microseconds: 100
}}"""
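
A unifying note on the brace edits in this file: Python collapses `{{` and `}}` to single braces only inside f-strings, so pbtxt fragments appended via plain strings must be written with single braces, while f-string fragments keep doubled ones. A minimal self-contained sketch (hypothetical fragment, not the generator's exact output):

```python
num_labels = 3  # hypothetical label count

# Inside an f-string, {{ and }} collapse to single braces and {num_labels}
# is interpolated:
fragment = f"""output [
  {{
    name: "logits"
    data_type: TYPE_FP32
    dims: [ {num_labels} ]
  }}
]"""

# In a plain string nothing collapses, so single braces must be written
# literally; doubled braces here would leak into the generated pbtxt:
fragment += """

dynamic_batching {
  max_queue_delay_microseconds: 100
}"""

print(fragment)  # valid pbtxt-style text, with dims: [ 3 ]
```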
`src/model-training/model_trainer.py` (1 change: 1 addition & 0 deletions)
@@ -439,6 +439,7 @@ def train(self):
logger.info(f"BEST MODEL SELECTED: {best_variant['name']}")
logger.info(f"BEST COMBINED SCORE: {best_result['combined_score']:.4f}")
logger.info(f"BEST MODEL TYPE: {best_variant['type']}")
logger.info(f"BEST MODEL_BASE: {best_variant['base_model']}")

# Save training summary
training_summary = {
`src/model-training/trainingpipeline.py` (4 changes: 2 additions & 2 deletions)
@@ -1146,13 +1146,13 @@ def train(self):
# save labelmappings, ood config to config.json
config = {
"num_labels": len(label_encoder.classes_),
"model_name": self.full_name,
"model_name": MODEL_CONFIGS[self.model_name]["model_name"],
"hidden_dim": model.hidden_dim,
"dropout_rate": model.dropout_rate,
"sequence_length": SEQUENCE_LENGTH,
"ood_method": self.ood_method,
"ood_config": self.ood_config,
"base_model_name": self.model_name,
"base_model_name": MODEL_CONFIGS[self.model_name]["tokenizer_name"],
"base_model_type": model.base_model.__class__.__name__,
"model_label2id": self.model_label2id,
"model_id2label": self.model_id2label,
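
After this change, `config.json` records the concrete base model and tokenizer names looked up in `MODEL_CONFIGS`, rather than the pipeline's own variant names. A sketch under an assumed `MODEL_CONFIGS` shape; the keys and values here are illustrative only, and the real dictionary lives elsewhere in the training code:

```python
# Assumed shape of MODEL_CONFIGS, with one example entry (values are guesses).
MODEL_CONFIGS = {
    "xlm-roberta": {
        "model_name": "xlm-roberta-base",      # concrete HF model id (assumed)
        "tokenizer_name": "xlm-roberta-base",  # concrete HF tokenizer id (assumed)
    },
}

model_name = "xlm-roberta"  # stand-in for self.model_name

config = {
    # previously self.full_name (the run/variant name)
    "model_name": MODEL_CONFIGS[model_name]["model_name"],
    # previously self.model_name (the short variant key)
    "base_model_name": MODEL_CONFIGS[model_name]["tokenizer_name"],
}
print(config)  # {'model_name': 'xlm-roberta-base', 'base_model_name': 'xlm-roberta-base'}
```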