71 changes: 71 additions & 0 deletions benchmarking/README.md
@@ -0,0 +1,71 @@
# Benchmarking Helm Chart

This Helm chart deploys the `inference-perf` benchmarking tool. This guide walks you through deploying a basic benchmarking job; by default, the configuration uses the `shareGPT` dataset.

## Prerequisites

Before you begin, ensure you have the following:

* **Helm 3+**: [Installation Guide](https://helm.sh/docs/intro/install/)
* **Kubernetes Cluster**: Access to a Kubernetes cluster
* **Gateway Deployed**: Your inference server/gateway must be deployed and accessible within the cluster.


**Hugging Face Token Secret**

The benchmark requires a Hugging Face token to pull tokenizers. Create a Kubernetes Secret named `hf-token` (or a custom name you provide) in your target namespace, containing your Hugging Face token.

To create this secret:
```bash
export _HF_TOKEN='<YOUR_HF_TOKEN>'
kubectl create secret generic hf-token --from-literal=token=$_HF_TOKEN
```
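
Before deploying, you can confirm that the secret exists and exposes the expected key (this quick check assumes the default namespace and the `hf-token`/`token` names used above):

```bash
# Shows the secret's keys without printing the token value
kubectl describe secret hf-token
```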

## Deployment

To deploy the benchmarking chart:

```bash
export IP='<YOUR_IP>'
export PORT='<YOUR_PORT>'
helm install benchmark . -f benchmark-values.yaml \
  --set hfTokenSecret.name=hf-token \
  --set hfTokenSecret.key=token \
  --set "config.server.base_url=http://${IP}:${PORT}"
```
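
After installing, a quick status check confirms the benchmarking job was created (exact resource names are templated by the chart, so adjust as needed):

```bash
# Confirm the release deployed successfully
helm status benchmark

# Watch the benchmarking job and its pod
kubectl get jobs,pods
```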

**Parameters to customize** (an example install using custom values follows the list):

* `benchmark`: The Helm release name; choose a unique name for each deployment.
* `hfTokenSecret.name`: The name of the Kubernetes Secret containing your Hugging Face token (default: `hf-token`).
* `hfTokenSecret.key`: The key within that Secret that holds the Hugging Face token (default: `token`).
* `config.server.base_url`: The base URL (IP and port) of your inference server or gateway.
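
For example, an install that overrides all of these might look like the following (the release name, secret name, and address are illustrative placeholders):

```bash
helm install llama-benchmark . -f benchmark-values.yaml \
  --set hfTokenSecret.name=my-hf-secret \
  --set hfTokenSecret.key=token \
  --set "config.server.base_url=http://10.0.0.5:8000"
```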

### Storage Parameters

The following shows how to configure report storage.
By default, reports are saved to local storage inside the pod; because the pod is deleted once the inference-perf job completes, configure Google Cloud Storage or S3 if you need the reports to persist.

```yaml
storage:
  local_storage:
    path: "reports-{timestamp}"       # Local directory path
    report_file_prefix: null          # Optional filename prefix
  google_cloud_storage:               # Optional GCS configuration
    bucket_name: "your-bucket-name"   # Required GCS bucket
    path: "reports-{timestamp}"       # Optional path prefix
    report_file_prefix: null          # Optional filename prefix
  simple_storage_service:
    bucket_name: "your-bucket-name"   # Required S3 bucket
    path: "reports-{timestamp}"       # Optional path prefix
    report_file_prefix: null          # Optional filename prefix
```
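
These storage settings can also be supplied at install time. As a sketch, assuming the chart forwards `config.*` values directly into the inference-perf configuration (as `benchmark-values.yaml` suggests), writing reports to a GCS bucket could be enabled like this, with `your-bucket-name` as a placeholder:

```bash
helm install benchmark . -f benchmark-values.yaml \
  --set hfTokenSecret.name=hf-token \
  --set hfTokenSecret.key=token \
  --set "config.server.base_url=http://${IP}:${PORT}" \
  --set config.storage.google_cloud_storage.bucket_name=your-bucket-name
```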

## Uninstalling the Chart

To uninstall the deployed chart:

```bash
helm uninstall benchmark
```

50 changes: 50 additions & 0 deletions benchmarking/benchmark-values.yaml
@@ -0,0 +1,50 @@
# High-Cache Configuration
job:
  image: "quay.io/inference-perf/inference-perf:latest"
  memory: "8G"

logLevel: INFO

hfTokenSecret:
  name: hf-token
  key: token

config:
  load:
    type: constant
    interval: 15
    stages:
      - rate: 10
        duration: 20
      - rate: 20
        duration: 20
      - rate: 30
        duration: 20
  api:
    type: completion
    streaming: true
  server:
    type: vllm
    model_name: meta-llama/Llama-3.1-8B-Instruct
    base_url: http://0.0.0.0:8000
    ignore_eos: true
  tokenizer:
    pretrained_model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
  data:
    type: shareGPT
  storage:
    google_cloud_storage:
      bucket_name: "inference-perf-results"
      report_file_prefix: benchmark
  metrics:
    type: prometheus
    prometheus:
      google_managed: true
  report:
    request_lifecycle:
      summary: true
      per_stage: true
      per_request: true
    prometheus:
      summary: true
      per_stage: true