Commit c76989e

Added helm chart for observability

1 parent 563edce commit c76989e

14 files changed: +338 −199 lines changed

README.md

Lines changed: 2 additions & 2 deletions
@@ -113,11 +113,11 @@ Hyperflow provides two key Helm charts:
 To run a sample workflow on a clean Kubernetes cluster, you should do the following:
 - Install the `hyperflow-ops` chart
 ```
-helm upgrade --dependency-update -i hf-ops hyperflow-ops
+helm upgrade --dependency-update -i hf-ops charts/hyperflow-ops
 ```
 - Install the `hyperflow-run` chart (preferably in a separate namespace)
 ```
-helm upgrade --dependency-update -i hf-run-montage hyperflow-run
+helm upgrade --dependency-update -i hf-run-montage charts/hyperflow-run
 ```
 - Once all pods are up and running or completed, you can manually run the workflow as follows:
 ```
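
Since the README suggests a separate namespace for the run chart, here is a minimal sketch of the namespace-separated install; the `-n`/`--create-namespace` flags are standard Helm options, and the namespace names are illustrative:

```
helm upgrade --dependency-update -i hf-ops charts/hyperflow-ops -n hf-ops --create-namespace
helm upgrade --dependency-update -i hf-run-montage charts/hyperflow-run -n hf-run --create-namespace
```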

charts/hyperflow-engine/templates/deployment.yml

Lines changed: 0 additions & 7 deletions
@@ -60,13 +60,6 @@ spec:
       env:
         - name: REDIS_URL
           value: redis://redis.{{ .Release.Namespace }}:6379
-        # URL for OpenTelemetry collector which will collect signals from engine
-        - name: HF_VAR_ENABLE_TRACING
-          value: "false"
-        - name: HF_VAR_ENABLE_OTEL
-          value: "false"
-        - name: HF_VAR_OPT_URL
-          value: nil
         - name: HF_VAR_function
           # The source of this function can be found here
           # https://github.com/hyperflow-wms/hyperflow/blob/master/functions/kubernetes/k8sCommand.js

charts/hyperflow-observability/Chart.yaml

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+apiVersion: v2
+name: hyperflow-observability
+description: Helm chart to deploy observability stack
+type: application
+version: 0.1.0
+appVersion: "1.0"
+
+dependencies:
+  - name: opensearch
+    version: "2.34.0"
+    repository: https://opensearch-project.github.io/helm-charts/
+  - name: opensearch-dashboards
+    version: "2.30.0"
+    repository: https://opensearch-project.github.io/helm-charts/
+  - name: data-prepper
+    version: "0.3.1"
+    repository: https://opensearch-project.github.io/helm-charts/
+  - name: opentelemetry-collector
+    version: "0.126.0"
+    repository: https://open-telemetry.github.io/opentelemetry-helm-charts
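
With four chart dependencies declared, one way to check that the repositories resolve before installing is an explicit dependency update plus lint; the `--dependency-update` flag used in the README performs the same fetch implicitly:

```
# Fetch the declared dependencies into charts/hyperflow-observability/charts/
helm dependency update charts/hyperflow-observability
# Basic structural sanity check of the assembled chart
helm lint charts/hyperflow-observability
```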

charts/hyperflow-observability/README.md

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+# HyperFlow K8S monitoring
+
+```
+helm upgrade --dependency-update -i hf-obs charts/hyperflow-observability
+```
+
+## Open OpenSearch Dashboards
+
+```
+kubectl port-forward svc/hf-obs-opensearch-dashboards 5601:5601
+```
+
+Navigate to
+http://localhost:5601/
+
+Go to Dashboards Management -> Index Patterns
+
+Create the following index patterns:
+- hyperflow_traces
+- hyperflow_metrics
+- hyperflow_logs
+
+Go to Discover and choose one of the new index patterns as the source
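
The same index patterns can also be created without the UI through the Dashboards saved-objects API; a sketch assuming the port-forward above is active (the `osd-xsrf` header is mandatory for OpenSearch Dashboards API calls):

```
# Create the three index patterns used by the data-prepper sinks
for p in hyperflow_traces hyperflow_metrics hyperflow_logs; do
  curl -X POST "http://localhost:5601/api/saved_objects/index-pattern/$p" \
    -H 'osd-xsrf: true' -H 'Content-Type: application/json' \
    -d "{\"attributes\": {\"title\": \"$p\"}}"
done
```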

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: hyperflow-rules
+  labels:
+    app: kube-prometheus-stack
+spec:
+  groups:
+    ## Tracks the number of available replicas for deployments with label origin=hyperflow.
+    - name: hyperflow-deployment-metrics
+      interval: 1s
+      rules:
+        - record: hyperflow_deployment_status_replicas_available
+          expr: |
+            kube_deployment_status_replicas_available
+            * on(namespace, deployment) group_left(label_origin)
+            kube_deployment_labels{label_origin="hyperflow"}
+    ## Calculates the percentage of CPU usage per node.
+    - name: node_cpu_usage
+      interval: 5s
+      rules:
+        - record: node_cpu_usage_percent
+          expr: |
+            100 * (
+              sum by (node) (
+                rate(container_cpu_usage_seconds_total{container!=""}[1m])
+              )
+              /
+              sum by (node) (
+                kube_node_status_allocatable{resource="cpu", unit="core"}
+              )
+            )
+    ## Calculates the percentage of memory usage per node.
+    - name: node_memory_usage
+      interval: 5s
+      rules:
+        - record: node_memory_usage_percent
+          expr: |
+            (
+              sum(container_memory_working_set_bytes) by (node)
+              /
+              sum(kube_node_status_allocatable{resource="memory"}) by (node)
+            ) * 100
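
The recorded series can be spot-checked through the standard Prometheus HTTP API; a sketch assuming the `monitoring-prometheus:9090` service name that the collector's federate jobs scrape:

```
kubectl port-forward svc/monitoring-prometheus 9090:9090 &
# Each query returns the latest sample of a recording rule defined above
curl -s 'http://localhost:9090/api/v1/query?query=node_cpu_usage_percent'
curl -s 'http://localhost:9090/api/v1/query?query=hyperflow_deployment_status_replicas_available'
```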

charts/hyperflow-observability/values.yaml

Lines changed: 220 additions & 0 deletions
@@ -0,0 +1,220 @@
+opensearch:
+  replicas: 1
+
+  config:
+    opensearch.yml: |
+      cluster.name: opensearch-cluster
+      network.host: 0.0.0.0
+      plugins:
+        security:
+          disabled: true
+  extraEnvs:
+    - name: OPENSEARCH_JAVA_OPTS
+      value: "-Xms512m -Xmx512m"
+    - name: OPENSEARCH_INITIAL_ADMIN_PASSWORD
+      value: "Hyperflow1!"
+
+opensearch-dashboards:
+  opensearchHosts: "http://opensearch-cluster-master:9200"
+
+  extraEnvs:
+    - name: DISABLE_SECURITY_DASHBOARDS_PLUGIN
+      value: "true"
+
+  resources:
+    requests:
+      cpu: "200m"
+      memory: 0.5Gi
+    limits:
+      cpu: "1"
+      memory: 3Gi
+
+data-prepper:
+  pipelineConfig:
+    enabled: true
+    config:
+      entry-pipeline:
+        delay: "100"
+        source:
+          otel_trace_source:
+            ssl: false
+        sink:
+          - pipeline:
+              name: "raw-pipeline"
+          - pipeline:
+              name: "service-map-pipeline"
+      raw-pipeline:
+        source:
+          pipeline:
+            name: "entry-pipeline"
+        processor:
+          - otel_trace_raw:
+        sink:
+          - opensearch:
+              hosts: [ "http://opensearch-cluster-master:9200" ]
+              insecure: true
+              username: admin
+              password: "Hyperflow1!"
+              index_type: custom
+              index: hyperflow_traces
+      service-map-pipeline:
+        delay: "100"
+        source:
+          pipeline:
+            name: "entry-pipeline"
+        processor:
+          - service_map_stateful:
+        sink:
+          - opensearch:
+              hosts: [ "http://opensearch-cluster-master:9200" ]
+              insecure: true
+              username: admin
+              password: "Hyperflow1!"
+              index_type: trace-analytics-service-map
+
+      metrics-pipeline:
+        source:
+          otel_metrics_source:
+            ssl: false
+        sink:
+          - opensearch:
+              hosts: [ "http://opensearch-cluster-master:9200" ]
+              insecure: true
+              username: admin
+              password: "Hyperflow1!"
+              index_type: custom
+              index: hyperflow_metrics
+
+      logs-pipeline:
+        source:
+          otel_logs_source:
+            ssl: false
+        sink:
+          - opensearch:
+              hosts: [ "http://opensearch-cluster-master:9200" ]
+              insecure: true
+              username: admin
+              password: "Hyperflow1!"
+              index: hyperflow_logs
+
+opentelemetry-collector:
+  mode: "statefulset"
+
+  image:
+    repository: "otel/opentelemetry-collector"
+    tag: "0.123.0"
+
+  command:
+    name: "otelcol"
+
+  resources:
+    requests:
+      cpu: 1
+      memory: 5Gi
+    limits:
+      cpu: 2
+      memory: 5Gi
+
+  config:
+    extensions:
+      health_check:
+        endpoint: 0.0.0.0:13133
+      pprof:
+        endpoint: 0.0.0.0:1777
+      zpages:
+        endpoint: 0.0.0.0:55679
+
+    receivers:
+      otlp:
+        protocols:
+          http:
+            endpoint: 0.0.0.0:4318
+      prometheus:
+        config:
+          scrape_configs:
+            - job_name: "kube-state-metrics"
+              scrape_interval: 1s
+              metrics_path: /federate
+              honor_labels: true
+              params:
+                match[]:
+                  - '{label_origin="hyperflow"}'
+              static_configs:
+                - targets: [ "monitoring-prometheus:9090" ]
+              metric_relabel_configs:
+                - source_labels: [ __name__ ]
+                  regex: "kube_deployment_labels"
+                  action: drop
+            - job_name: "cpu-by-node"
+              scrape_interval: 5s
+              metrics_path: /federate
+              honor_labels: true
+              params:
+                match[]:
+                  - 'node_cpu_usage_percent'
+              static_configs:
+                - targets: [ "monitoring-prometheus:9090" ]
+            - job_name: "memory-by-node"
+              scrape_interval: 5s
+              metrics_path: /federate
+              honor_labels: true
+              params:
+                match[]:
+                  - 'node_memory_usage_percent'
+              static_configs:
+                - targets: [ "monitoring-prometheus:9090" ]
+            - job_name: "rabbitmq-exporter"
+              scrape_interval: 1s
+              static_configs:
+                - targets: [ "hf-ops-prometheus-rabbitmq-exporter:9419" ]
+              metric_relabel_configs:
+                - source_labels: [ __name__ ]
+                  regex: "rabbitmq_queue_messages_ready"
+                  action: keep
+
+    processors:
+      batch: { }
+      filter:
+        metrics:
+          exclude:
+            match_type: regexp
+            metric_names:
+              - "up"
+              - "scrape_.*"
+
+
+    exporters:
+      otlp/traces:
+        endpoint: hf-obs-data-prepper:21890
+        tls:
+          insecure: true
+          insecure_skip_verify: true
+      otlp/metrics:
+        endpoint: hf-obs-data-prepper:21891
+        tls:
+          insecure: true
+          insecure_skip_verify: true
+      otlp/logs:
+        endpoint: hf-obs-data-prepper:21892
+        tls:
+          insecure: true
+          insecure_skip_verify: true
+      debug:
+        verbosity: detailed
+
+    service:
+      pipelines:
+        traces:
+          receivers: [ otlp ]
+          processors: [ batch ]
+          exporters: [ debug, otlp/traces ]
+        metrics:
+          receivers: [ otlp, prometheus ]
+          processors: [ batch, filter ]
+          exporters: [ debug, otlp/metrics ]
+        logs:
+          receivers: [ otlp ]
+          processors: [ batch ]
+          exporters: [ debug, otlp/logs ]
+
+      extensions: [ health_check, pprof, zpages ]
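
A quick way to smoke-test the whole path is to post a hand-written OTLP/HTTP record at the collector's 4318 endpoint and then look for it under the `hyperflow_logs` index pattern in Discover; a sketch assuming the `hf-obs` release name, with a minimal, made-up log record:

```
kubectl port-forward svc/hf-obs-opentelemetry-collector 4318:4318 &
# timeUnixNano of "0" is accepted; substitute $(date +%s%N) for a real timestamp
curl -X POST http://localhost:4318/v1/logs -H 'Content-Type: application/json' -d '{
  "resourceLogs": [{
    "resource": { "attributes": [{ "key": "service.name", "value": { "stringValue": "smoke-test" } }] },
    "scopeLogs": [{ "logRecords": [{
      "timeUnixNano": "0",
      "severityText": "INFO",
      "body": { "stringValue": "hello from curl" }
    }] }]
  }]
}'
```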

charts/hyperflow-ops/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ worker-pools:
 enable-rabbitmq: &enable-rabbit-mq true
 enable-kube-prometheus-stack: &enable-kube-prometheus-stack true
 enable-alert-manager: &enable-alert-manager false
-enable-grafana: &enable-grafana true
+enable-grafana: &enable-grafana false
 enable-prometheus-operator: &enable-prometheus-operator true
 enable-prometheus: &enable-prometheus true
 

charts/hyperflow-run/values.yaml

Lines changed: 22 additions & 0 deletions
@@ -173,6 +173,8 @@ hyperflow-engine:
           value: "${enableTracing}"
         - name: HF_VAR_ENABLE_OTEL
           value: "${enableOtel}"
+        - name: HF_VAR_OPT_URL
+          value: "http://hf-obs-opentelemetry-collector"
         - name: HF_VAR_OT_PARENT_ID
           value: "${optParentId}"
         - name: HF_VAR_OT_TRACE_ID
@@ -197,6 +199,26 @@ hyperflow-engine:
           valueFrom:
             fieldRef:
               fieldPath: spec.serviceAccountName
+        - name: HF_LOG_CPU_REQUEST
+          valueFrom:
+            resourceFieldRef:
+              containerName: test
+              resource: requests.cpu
+        - name: HF_LOG_CPU_LIMIT
+          valueFrom:
+            resourceFieldRef:
+              containerName: test
+              resource: limits.cpu
+        - name: HF_LOG_MEM_REQUEST
+          valueFrom:
+            resourceFieldRef:
+              containerName: test
+              resource: requests.memory
+        - name: HF_LOG_MEM_LIMIT
+          valueFrom:
+            resourceFieldRef:
+              containerName: test
+              resource: limits.memory
         - name: HF_VAR_FS_MONIT_ENABLED
           value: "0"
         - name: HF_VAR_FS_MONIT_COMMAND
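
The four `HF_LOG_*` variables use the Downward API's `resourceFieldRef`, which resolves the requests/limits of the container named in `containerName` ("test" here), so a container by that name must exist in the pod for the spec to validate. A quick check of the injected values, assuming the engine Deployment is reachable as `deploy/hyperflow-engine` (the name is an assumption, adjust to the actual release):

```
# Print the Downward API-injected resource values inside the running pod
kubectl exec deploy/hyperflow-engine -- env | grep -E '^HF_LOG_(CPU|MEM)'
```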
