7 changes: 7 additions & 0 deletions .env
@@ -0,0 +1,7 @@
AWS_ACCESS_KEY_ID=your_aws_access_key_id
AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key
BEDROCK_AWS_REGION=eu-west-1
AZURE_OPENAI_API_KEY=your_azure_openai_api_key
AZURE_OPENAI_ENDPOINT=your_azure_openai_endpoint
AZURE_OPENAI_DEPLOYMENT_NAME=gpt-4o
PROVIDER_NAME=azure-openai
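The new `.env` file carries the credentials for whichever provider `PROVIDER_NAME` selects. A minimal sketch of how the generator might resolve that selection at startup, assuming the `python-dotenv` package; the `load_provider_settings` function and the returned dictionary keys are illustrative, not the project's actual API:

```python
# Hypothetical sketch: resolving the active provider from the .env values.
# Assumes python-dotenv; names here are illustrative, not the project's API.
import os
from dotenv import load_dotenv

def load_provider_settings() -> dict:
    """Read PROVIDER_NAME and return the credentials that provider needs."""
    load_dotenv()  # pull the key/value pairs from .env into the environment
    provider = os.environ.get("PROVIDER_NAME", "ollama")

    if provider == "azure-openai":
        return {
            "provider": provider,
            "api_key": os.environ["AZURE_OPENAI_API_KEY"],
            "endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
            "deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME", "gpt-4o"),
        }
    if provider == "bedrock-anthropic":
        return {
            "provider": provider,
            "aws_access_key_id": os.environ["AWS_ACCESS_KEY_ID"],
            "aws_secret_access_key": os.environ["AWS_SECRET_ACCESS_KEY"],
            "region": os.environ.get("BEDROCK_AWS_REGION", "eu-west-1"),
        }
    return {"provider": provider}  # e.g. ollama needs no cloud credentials

if __name__ == "__main__":
    print(load_provider_settings()["provider"])
```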
199 changes: 153 additions & 46 deletions DSL/DatasetGenerator/config/config.yaml
@@ -1,11 +1,71 @@
# LLM Provider configuration
provider:
name: "ollama"
# Bedrock Anthropic provider configuration (when provider.name = "bedrock-anthropic")
bedrock_anthropic:
model_name: "eu.anthropic.claude-3-7-sonnet-20250219-v1:0"
aws_region: "eu-west-1"
temperature: 0.7
max_tokens: 4096
top_p: 1.0
tpm_limit: 200000
stop_sequences: []
batch_generation:
enabled: true
max_batch_size: 10
max_tokens_per_batch: 20000

# Azure OpenAI provider configuration (when provider.name = "azure-openai")
azure_openai:
# Required: Set via environment variables AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, AZURE_OPENAI_DEPLOYMENT_NAME
api_version: "2024-12-01-preview"
model_name: "gpt-4o"
temperature: 0.7
max_tokens: 4096
top_p: 1.0
tpm_limit: 200000
rpm_limit: 6000
batch_generation:
enabled: true
max_batch_size: 10
max_tokens_per_batch: 20000

# Ollama provider configuration (when provider.name = "ollama")
ollama:
model_name: "gemma3:1b-it-qat"
api_url: "http://ollama:11434"
host: "http://ollama:11434"
timeout: 60
max_retries: 3
retry_delay: 5
batch_generation:
enabled: false

# MAIN PROVIDER SELECTION - THIS IS THE KEY SETTING
provider:
name: "azure-openai" # THIS DETERMINES WHICH PROVIDER TO USE
timeout: 60
max_retries: 3
retry_delay: 5

# Processing settings
processing:
wait_between_requests: 1
timeout_seconds: 1300 # Timeout for dataset processing (seconds)
max_consecutive_failures: 3
success_rate_threshold: 0.8 # 80% success rate required
retry_on_failure: true

# Callback configuration for external project notifications
callback:
url: "http://ruuter-public:8086/global-classifier/data/callback"
timeout: 60.0 # Callback timeout in seconds
retries: 3 # Number of retry attempts
retry_backoff: true # Use exponential backoff
include_error_details: true
include_summary: true

# Batch generation settings (global)
batch_generation:
enabled: true
max_batch_size: 10
fallback_on_failure: true

# API connection settings
api:
@@ -18,8 +78,8 @@ directories:
output: "output_datasets"
templates: "templates"
user_configs: "user_configs"
# Default generation settings (can be overridden per generation request)

# Default generation settings
generation:
default_num_examples: 10
default_language: "et"
@@ -28,22 +88,42 @@ generation:
temperature: 0.7
max_tokens: 4096

# Language and Prompt Settings
language_settings:
default_system_prompt: "You are a helpful assistant providing accurate information based on topic content."
default_language: "et"
supported_languages:
en: "English"
et: "Estonian"
fi: "Finnish"

# Storage configuration
storage:
datasets_dir: "datasets"
templates_dir: "templates"
user_configs_dir: "user_configs"

# Default Output Settings
output_defaults:
save_format: "json"
supported_formats:
- "json"
- "text"

# Dataset generation configuration
dataset_generation:
structure_name: "single_question"
prompt_template_name: "institute_topic_question"
traversal_strategy: "pattern" # Options: "flat", "recursive", "institutional", "pattern"
traversal_strategy: "pattern"
output_format: "json"
num_samples: 10
post_processing: "aggregation" # Options: "zip", "aggregation"
# Aggregation-specific configuration (only used when post_processing = "aggregation")
post_processing: "aggregation"
aggregation:
output_filename: "12"
output_format: "csv"
output_format: "csv"
merge_strategy: "combine_arrays"
include_metadata: true
enable_shuffling: false

field_mapping:
enabled: true
payload_to_output:
@@ -54,15 +134,12 @@ dataset_generation:
dataset_version_id: version_id
content_fields:
question: data_item


csv_field_order:
- item_id
- agency_name
- agency_id
- data_item
- dataset_version_id

- item_id
- agency_name
- agency_id
- data_item
- dataset_version_id
parameters:
language: "et"
temperature: 0.7
@@ -72,10 +149,6 @@ dataset_generation:
style: "clear and concise"
system_prompt: "You are a helpful assistant for generating synthetic questions for given contexts."
filter: {}

# Processing settings
processing:
wait_between_requests: 1

# MLflow tracking
mlflow:
@@ -89,31 +162,65 @@ data_sources:
patterns: ["**/cleaned.txt"]
recursive: true

callback:
url: "http://ruuter-public:8086/global-classifier/data/callback"
max_retries: 3
timeout: 30

# Relevance Score Analysis
relevance_score:
enabled: true
embedding_model: "paraphrase-multilingual-mpnet-base-v2"
segment_weight: 0.6
query_weight: 0.3
term_weight: 0.1
threshold_good: 0.7
threshold_acceptable: 0.5
min_df: 1
max_df: 0.9
ngram_range: (1, 2)

# Information Coverage Analysis
information_coverage:
enabled: true
similarity_threshold: 0.5

# Model settings
models:
embedding_model: "paraphrase-multilingual-mpnet-base-v2"
qualitative_model: "google/gemma-2-2b-it"
use_4bit_quantization: true


# Redis configuration
redis:
# Redis connection URL - modify according to your setup
# url: "redis://localhost:6379" # For local Redis
url: "redis://redis:6379" # For Docker Compose setup
# url: "redis://:password@redis-host:6379" # With password

# Redis database number to use (0-15 typically available)
db: 0

# Connection pool settings
max_connections: 10
retry_on_timeout: true
socket_timeout: 5
socket_connect_timeout: 5

# Embedding-specific settings
embedding:
# Path to topic documents
topic_documents_path: "/app/data"

# TTL settings for different embedding types
ttl:
# Persistent embeddings (topic documents) - no expiration by default
persistent: null # null means no expiration
# Temporary embeddings (questions) - expire after successful evaluation
temporary: 3600 # 1 hour in seconds
# Failed evaluation sessions - keep longer for potential retry
failed_session: 7200 # 2 hours in seconds

# Evaluation configuration
evaluation:
# Enable automatic embedding cleanup after successful evaluation
auto_cleanup_on_success: true
# Keep embeddings for failed evaluations (for potential retry)
keep_failed_embeddings: true
# Batch size for embedding operations
embedding_batch_size: 32
max_regeneration_attempts: 3
# Evaluation thresholds
thresholds:
topic_coverage_min: 0.7
information_coverage_min: 0.6
similarity_coverage_min: 0.6
overall_min: 0.65
context_coverage_min: 0.5

# Monitoring and logging
monitoring:
# Log cache hit/miss statistics
log_cache_stats: true
# Log embedding operations
log_embedding_operations: true
# Cache statistics reporting interval (seconds)
stats_interval: 300 # 5 minutes
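The `callback` block in this config describes retrying notifications with exponential backoff. A rough sketch of how such a callback might be sent, assuming the `requests` library; the payload shape and the `send_callback` name are illustrative, not taken from the project:

```python
# Illustrative sketch of the callback behaviour described in config.yaml
# (url, timeout, retries, retry_backoff); not the project's actual client code.
import time
import requests

def send_callback(payload: dict,
                  url: str = "http://ruuter-public:8086/global-classifier/data/callback",
                  timeout: float = 60.0,
                  retries: int = 3,
                  retry_backoff: bool = True) -> bool:
    """POST the payload, retrying with exponential backoff on failure."""
    for attempt in range(retries + 1):
        try:
            response = requests.post(url, json=payload, timeout=timeout)
            response.raise_for_status()
            return True
        except requests.RequestException as exc:
            if attempt == retries:
                print(f"callback failed after {attempt + 1} attempts: {exc}")
                return False
            delay = 2 ** attempt if retry_backoff else 1  # exponential backoff
            time.sleep(delay)
    return False
```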
39 changes: 0 additions & 39 deletions DSL/DatasetGenerator/config/model_config.yaml

This file was deleted.

72 changes: 72 additions & 0 deletions DSL/DatasetGenerator/config/redis.conf
@@ -0,0 +1,72 @@

# Network
bind 0.0.0.0
port 6379
timeout 0
tcp-keepalive 300

# General
daemonize no
supervised no
pidfile /var/run/redis_6379.pid
loglevel notice
logfile ""
databases 16

# Snapshotting
save 900 1
save 300 10
save 60 10000
stop-writes-on-bgsave-error yes
rdbcompression yes
rdbchecksum yes
dbfilename dump.rdb
dir /data

# Replication
replica-serve-stale-data yes
replica-read-only yes

# Security
# requirepass yourpassword # Uncomment and set password if needed

# Memory Management
maxmemory 1gb
maxmemory-policy allkeys-lru

# Append Only File
appendonly yes
appendfilename "appendonly.aof"
appendfsync everysec
no-appendfsync-on-rewrite no
auto-aof-rewrite-percentage 100
auto-aof-rewrite-min-size 64mb

# Lua scripting
lua-time-limit 5000

# Slow log
slowlog-log-slower-than 10000
slowlog-max-len 128

# Client output buffer limits
client-output-buffer-limit normal 0 0 0
client-output-buffer-limit replica 256mb 64mb 60
client-output-buffer-limit pubsub 32mb 8mb 60

# Advanced config
hash-max-ziplist-entries 512
hash-max-ziplist-value 64
list-max-ziplist-size -2
list-compress-depth 0
set-max-intset-entries 512
zset-max-ziplist-entries 128
zset-max-ziplist-value 64
hll-sparse-max-bytes 3000
stream-node-max-bytes 4096
stream-node-max-entries 100
activerehashing yes
hz 10
dynamic-hz yes
aof-rewrite-incremental-fsync yes
rdb-save-incremental-fsync yes
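The redis.conf above covers the server side; on the client side, the `redis` and `embedding` sections of config.yaml (connection URL, pool limits, per-type TTLs) could be applied roughly as in this sketch, assuming the redis-py package. The key naming scheme is an assumption, not taken from the project:

```python
# Sketch of a client honouring the redis/embedding settings in config.yaml
# (connection URL, db, socket timeouts, per-type TTLs). Key names are illustrative.
import json
import redis

client = redis.Redis.from_url(
    "redis://redis:6379",
    db=0,
    socket_timeout=5,
    socket_connect_timeout=5,
    max_connections=10,
)

def cache_embedding(key: str, vector: list[float], kind: str = "temporary") -> None:
    """Store an embedding; temporary/failed entries expire, persistent ones do not."""
    ttl = {"persistent": None, "temporary": 3600, "failed_session": 7200}[kind]
    value = json.dumps(vector)
    if ttl is None:
        client.set(key, value)         # topic-document embeddings never expire
    else:
        client.setex(key, ttl, value)  # question embeddings expire after evaluation

cache_embedding("embedding:question:42", [0.12, -0.08, 0.33])
```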
@@ -4,4 +4,5 @@ SELECT
major,
minor
FROM public.data_models
WHERE training_status = 'trained'
ORDER BY model_id;
@@ -8,7 +8,7 @@ SET
ELSE
connected_models
END,
updated_at = CURRENT_TIMESTAMP
last_trained = CURRENT_TIMESTAMP
WHERE
id = :datasetId
RETURNING