`.env` (2 changes: 1 addition & 1 deletion)
@@ -2,6 +2,6 @@ AWS_ACCESS_KEY_ID=your_aws_access_key_id
AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key
BEDROCK_AWS_REGION=eu-west-1
AZURE_OPENAI_API_KEY=your_azure_openai_api_key
AZURE_OPENAI_ENDPOINT=your_azure_apenai_endpoint
AZURE_OPENAI_ENDPOINT=your_azure_openai_endpoint
AZURE_OPENAI_DEPLOYMENT_NAME=gpt-4o
PROVIDER_NAME=azure-openai
`README.md` (27 changes: 26 additions & 1 deletion)
@@ -33,13 +33,38 @@ Windows:
- Clone [TIM](https://github.com/buerokratt/TIM)
- Navigate to TIM and build the image using the command `docker build -t tim .`
- Clone [Authentication Layer](https://github.com/buerokratt/Authentication-layer)
- Go to public/env-config.js and update the RUUTER_API_URL to 'http://localhost:8086/classifier'
- Go to public/env-config.js and update the RUUTER_API_URL to 'http://localhost:8086/global-classifier'
- Navigate to Authentication Layer, check out the `dev` branch, and build the image using the command `docker build -f Dockerfile.dev -t authentication-layer .`
- Clone [S3 Ferry](https://github.com/buerokratt/S3-Ferry)
- Navigate to S3-Ferry and build the image using the command `docker build -t s3-ferry .`
- Clone [Cron Manager](https://github.com/buerokratt/CronManager)
- Navigate to Cron Manager, check out the `dev` branch, and build the cron-manager-python image using the command `docker build -f Dockerfile.python -t cron-manager-python .`
- Clone [Dataset Generator](https://github.com/buerokratt/Dataset-Generator)
- Navigate to Dataset Generator, check out the `dev` branch, and build the synthesisai/dataset-generator image using the command `docker compose build` (a consolidated sketch of all the build steps follows this list)
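
A consolidated sketch of the build steps above, meant to be run from an empty workspace directory. The repository URLs and branch names are as listed; the ordering and directory layout are assumptions.

```bash
git clone https://github.com/buerokratt/TIM
docker build -t tim ./TIM

git clone -b dev https://github.com/buerokratt/Authentication-layer
# Point RUUTER_API_URL in Authentication-layer/public/env-config.js at
# 'http://localhost:8086/global-classifier' before building this image.
docker build -f Authentication-layer/Dockerfile.dev -t authentication-layer ./Authentication-layer

git clone https://github.com/buerokratt/S3-Ferry
docker build -t s3-ferry ./S3-Ferry

git clone -b dev https://github.com/buerokratt/CronManager
docker build -f CronManager/Dockerfile.python -t cron-manager-python ./CronManager

git clone -b dev https://github.com/buerokratt/Dataset-Generator
(cd Dataset-Generator && docker compose build)
```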

## Using LLMs for Data Generation

Three providers are currently available in the Global Classifier for dataset generation:
- Bedrock Anthropic (`bedrock-anthropic`)
- Azure OpenAI (`azure-openai`)
- Ollama (`ollama`)

To select a provider, navigate to `DSL/DatasetGenerator/config/config.yaml`.

1. Change the provider name in the block below; the dataset generator will use the selected provider for generation.
```yaml
provider:
  name: "azure-openai" # THIS DETERMINES WHICH PROVIDER TO USE
  timeout: 60
  max_retries: 3
  retry_delay: 5
```
2. Change `PROVIDER_NAME` in the `.env` file to match, as shown in the sketch below.
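
For illustration, a minimal sketch of pointing both settings at the Ollama provider from the shell, assuming `yq` (v4) and GNU `sed` are available; otherwise make the same two edits by hand. `"ollama"` is just an example value.

```bash
# Keep config.yaml and .env in agreement about the provider name.
yq -i '.provider.name = "ollama"' DSL/DatasetGenerator/config/config.yaml
sed -i 's/^PROVIDER_NAME=.*/PROVIDER_NAME=ollama/' .env
```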

## Data Migration

To access the GUI, the data migration script must be executed first, as it adds the system's initial configuration. Run `migrate.sh` to create the initial user with the test Smart ID `EE30303039914`; you can then log in to the GUI with that Smart ID.


## Contributing
`migrate.sh` (2 changes: 1 addition & 1 deletion)
@@ -12,4 +12,4 @@ INI_FILE="constants.ini"
DB_PASSWORD=$(get_ini_value "$INI_FILE" "DB_PASSWORD")


docker run --rm --network bykstack -v `pwd`/DSL/Liquibase/changelog:/liquibase/changelog -v `pwd`/DSL/Liquibase/master.yml:/liquibase/master.yml -v `pwd`/DSL/Liquibase/data:/liquibase/data liquibase/liquibase --defaultsFile=/liquibase/changelog/liquibase.properties --changelog-file=master.yml --url=jdbc:postgresql://users_db:5432/global-classifier?user=postgres --password=dbadmin update
docker run --rm --network bykstack -v `pwd`/DSL/Liquibase/changelog:/liquibase/changelog -v `pwd`/DSL/Liquibase/master.yml:/liquibase/master.yml -v `pwd`/DSL/Liquibase/data:/liquibase/data liquibase/liquibase:4.33 --defaultsFile=/liquibase/changelog/liquibase.properties --changelog-file=master.yml --url=jdbc:postgresql://users_db:5432/global-classifier?user=postgres --password=dbadmin update
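
The only change above is pinning the Liquibase image to `liquibase/liquibase:4.33`, presumably to shield the migration from breaking changes in the `latest` image. For readability, the same invocation line-wrapped, with `pwd` and the JDBC URL quoted so spaces and the `?` survive the shell:

```bash
docker run --rm --network bykstack \
  -v "$(pwd)"/DSL/Liquibase/changelog:/liquibase/changelog \
  -v "$(pwd)"/DSL/Liquibase/master.yml:/liquibase/master.yml \
  -v "$(pwd)"/DSL/Liquibase/data:/liquibase/data \
  liquibase/liquibase:4.33 \
  --defaultsFile=/liquibase/changelog/liquibase.properties \
  --changelog-file=master.yml \
  --url='jdbc:postgresql://users_db:5432/global-classifier?user=postgres' \
  --password=dbadmin \
  update
```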
`src/model-training/create_triton_configs.py` (36 changes: 8 additions & 28 deletions)
@@ -179,22 +179,19 @@ def generate_preprocessing_config(
name: "attention_mask"
data_type: TYPE_INT64
dims: [ -1 ]
}},
"""
}}"""

if supports_token_type_ids:
config += """{{
config += """,
{
name: "token_type_ids"
data_type: TYPE_INT64
dims: [ -1 ]
}}"""

# Note: No training flag needed for current SNGP implementation
}"""

config += f"""
]



parameters [
{{
key: "model_name"
@@ -235,17 +232,6 @@
) -> str:
"""
Generate Triton text classifier config based on model type.

Args:
model_name: Name of the text classifier model (e.g., "text_classifier")
model_type: Type of model ("distilbert", "bert", "xlm-roberta", "roberta")
num_labels: Number of output labels/classes
sequence_length: Maximum sequence length for the model
max_batch_size: Maximum batch size for inference
ood_method: OOD method ("sngp", "energy", "softmax", or None)

Returns:
str: Complete Triton text classifier config as string
"""

# Define which models support token_type_ids
@@ -268,16 +254,13 @@
dims: [ -1 ]
}}"""

# Add token_type_ids input if supported
if supports_token_type_ids:
config += """,
{{
{
name: "token_type_ids"
data_type: TYPE_INT64
dims: [ -1 ]
}}"""

# Note: No training flag needed for current SNGP implementation
}"""

config += f"""
]
@@ -287,12 +270,9 @@
name: "logits"
data_type: TYPE_FP32
dims: [ {num_labels} ]
}}"""

config += """
}}
]


dynamic_batching {{
max_queue_delay_microseconds: 100
}}"""
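
A unifying note on the brace edits in this file: Python collapses `{{` and `}}` to single braces only inside f-strings, so pbtxt fragments appended via plain strings must be written with single braces, while f-string fragments keep doubled ones. A minimal self-contained sketch (hypothetical fragment, not the generator's exact output):

```python
num_labels = 3  # hypothetical label count

# Inside an f-string, {{ and }} collapse to single braces and {num_labels}
# is interpolated:
fragment = f"""output [
  {{
    name: "logits"
    data_type: TYPE_FP32
    dims: [ {num_labels} ]
  }}
]"""

# In a plain string nothing collapses, so single braces must be written
# literally; doubled braces here would leak into the generated pbtxt:
fragment += """

dynamic_batching {
  max_queue_delay_microseconds: 100
}"""

print(fragment)  # valid pbtxt-style text, with dims: [ 3 ]
```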
`src/model-training/model_trainer.py` (1 change: 1 addition & 0 deletions)
@@ -439,6 +439,7 @@ def train(self):
logger.info(f"BEST MODEL SELECTED: {best_variant['name']}")
logger.info(f"BEST COMBINED SCORE: {best_result['combined_score']:.4f}")
logger.info(f"BEST MODEL TYPE: {best_variant['type']}")
logger.info(f"BEST MODEL_BASE: {best_variant['base_model']}")

# Save training summary
training_summary = {
`src/model-training/trainingpipeline.py` (4 changes: 2 additions & 2 deletions)
@@ -1146,13 +1146,13 @@ def train(self):
# save labelmappings, ood config to config.json
config = {
"num_labels": len(label_encoder.classes_),
"model_name": self.full_name,
"model_name": MODEL_CONFIGS[self.model_name]["model_name"],
"hidden_dim": model.hidden_dim,
"dropout_rate": model.dropout_rate,
"sequence_length": SEQUENCE_LENGTH,
"ood_method": self.ood_method,
"ood_config": self.ood_config,
"base_model_name": self.model_name,
"base_model_name": MODEL_CONFIGS[self.model_name]["tokenizer_name"],
"base_model_type": model.base_model.__class__.__name__,
"model_label2id": self.model_label2id,
"model_id2label": self.model_id2label,
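
After this change, `config.json` records the concrete base model and tokenizer names looked up in `MODEL_CONFIGS`, rather than the pipeline's own variant names. A sketch under an assumed `MODEL_CONFIGS` shape; the keys and values here are illustrative only, and the real dictionary lives elsewhere in the training code:

```python
# Assumed shape of MODEL_CONFIGS, with one example entry (values are guesses).
MODEL_CONFIGS = {
    "xlm-roberta": {
        "model_name": "xlm-roberta-base",      # concrete HF model id (assumed)
        "tokenizer_name": "xlm-roberta-base",  # concrete HF tokenizer id (assumed)
    },
}

model_name = "xlm-roberta"  # stand-in for self.model_name

config = {
    # previously self.full_name (the run/variant name)
    "model_name": MODEL_CONFIGS[model_name]["model_name"],
    # previously self.model_name (the short variant key)
    "base_model_name": MODEL_CONFIGS[model_name]["tokenizer_name"],
}
print(config)  # {'model_name': 'xlm-roberta-base', 'base_model_name': 'xlm-roberta-base'}
```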