Skip to content

Commit 05f01bf

Browse files
committed
Update DRA testing to stable API version and prepare it to test more types of drivers
1 parent 344698d commit 05f01bf

File tree

11 files changed

+155
-51
lines changed

11 files changed

+155
-51
lines changed

clusterloader2/pkg/dependency/dra/dra.go

Lines changed: 111 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@ import (
2020
"context"
2121
"embed"
2222
"fmt"
23+
"strings"
2324
"time"
2425

26+
corev1 "k8s.io/api/core/v1"
2527
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2628
"k8s.io/apimachinery/pkg/util/wait"
2729
"k8s.io/klog/v2"
@@ -31,16 +33,16 @@ import (
3133
)
3234

3335
const (
34-
draDependencyName = "DRATestDriver"
35-
// TODO: this needs to be converted into a parameter. We will not need this until the partitionable devices test.
36+
draDependencyName = "DRATestDriver"
3637
draNamespace = "dra-example-driver"
38+
draManifests = "dra-example-driver"
3739
defaultWorkerNodeCount = "100"
3840
draDaemonsetName = "dra-example-driver-kubeletplugin"
3941
checkDRAReadyInterval = 30 * time.Second
4042
defaultDRATimeout = 10 * time.Minute
4143
)
4244

43-
//go:embed manifests/*.yaml
45+
//go:embed manifests/**/*.yaml
4446
var manifestsFS embed.FS
4547

4648
func init() {
@@ -57,13 +59,24 @@ type draDependency struct{}
5759

5860
func (d *draDependency) Setup(config *dependency.Config) error {
5961
klog.V(2).Infof("%s: Installing DRA example driver", d)
60-
if err := client.CreateNamespace(config.ClusterFramework.GetClientSets().GetClient(), draNamespace); err != nil {
61-
return fmt.Errorf("namespace %s creation error: %v", draNamespace, err)
62+
63+
namespace, err := getNamespace(config)
64+
if err != nil {
65+
return err
6266
}
6367

64-
namespace, ok := config.Params["Namespace"]
65-
if !ok {
66-
namespace = draNamespace
68+
if err := client.CreateNamespace(config.ClusterFramework.GetClientSets().GetClient(), namespace); err != nil {
69+
return fmt.Errorf("namespace %s creation error: %v", namespace, err)
70+
}
71+
72+
manifests, err := getManifests(config)
73+
if err != nil {
74+
return err
75+
}
76+
77+
daemonsetName, err := getDaemonset(config)
78+
if err != nil {
79+
return err
6780
}
6881

6982
mapping := map[string]interface{}{
@@ -76,7 +89,7 @@ func (d *draDependency) Setup(config *dependency.Config) error {
7689
}
7790
if err := config.ClusterFramework.ApplyTemplatedManifests(
7891
manifestsFS,
79-
"manifests/*.yaml",
92+
manifests,
8093
mapping,
8194
client.Retry(client.IsRetryableAPIError),
8295
); err != nil {
@@ -86,8 +99,8 @@ func (d *draDependency) Setup(config *dependency.Config) error {
8699
if err != nil {
87100
return err
88101
}
89-
klog.V(2).Infof("%s: checking if DRA driver %s is healthy", d, draDaemonsetName)
90-
if err := d.waitForDRADriverToBeHealthy(config, timeout); err != nil {
102+
klog.V(2).Infof("%s: checking if DRA driver %s is healthy", d, daemonsetName)
103+
if err := d.waitForDRADriverToBeHealthy(config, timeout, daemonsetName, namespace); err != nil {
91104
return err
92105
}
93106

@@ -98,60 +111,76 @@ func (d *draDependency) Setup(config *dependency.Config) error {
98111
func (d *draDependency) Teardown(config *dependency.Config) error {
99112
klog.V(2).Infof("%s: Tearing down DRA example driver", d)
100113

114+
namespace, err := getNamespace(config)
115+
if err != nil {
116+
return err
117+
}
118+
101119
// Delete namespace (this will delete all resources in it)
102-
if err := client.DeleteNamespace(config.ClusterFramework.GetClientSets().GetClient(), draNamespace); err != nil {
103-
return fmt.Errorf("deleting %s namespace error: %v", draNamespace, err)
120+
if err := client.DeleteNamespace(config.ClusterFramework.GetClientSets().GetClient(), namespace); err != nil {
121+
return fmt.Errorf("deleting %s namespace error: %v", namespace, err)
104122
}
105123

106-
if err := client.WaitForDeleteNamespace(config.ClusterFramework.GetClientSets().GetClient(), draNamespace, client.DefaultNamespaceDeletionTimeout); err != nil {
124+
if err := client.WaitForDeleteNamespace(config.ClusterFramework.GetClientSets().GetClient(), namespace, client.DefaultNamespaceDeletionTimeout); err != nil {
107125
return err
108126
}
109127

110128
klog.V(2).Infof("%s: DRA example driver uninstalled successfully", d)
111129
return nil
112130
}
113131

114-
func (d *draDependency) waitForDRADriverToBeHealthy(config *dependency.Config, timeout time.Duration) error {
132+
func (d *draDependency) waitForDRADriverToBeHealthy(config *dependency.Config, timeout time.Duration, daemonsetName string, namespace string) error {
115133
if err := wait.PollImmediate(
116134
checkDRAReadyInterval,
117135
timeout,
118136
func() (done bool, err error) {
119-
return d.isDRADriverReady(config)
137+
return d.isDRADriverReady(config, daemonsetName, namespace)
120138
}); err != nil {
121139
return err
122140
}
123141
if err := wait.PollImmediate(
124142
checkDRAReadyInterval,
125143
timeout,
126144
func() (done bool, err error) {
127-
return isResourceSlicesPublished(config)
145+
return isResourceSlicesPublished(config, namespace)
128146
}); err != nil {
129147
return err
130148
}
131149
return nil
132150
}
133151

134-
func (d *draDependency) isDRADriverReady(config *dependency.Config) (done bool, err error) {
152+
func (d *draDependency) isDRADriverReady(config *dependency.Config, daemonsetName string, namespace string) (done bool, err error) {
135153
ds, err := config.ClusterFramework.GetClientSets().
136154
GetClient().
137155
AppsV1().
138-
DaemonSets(draNamespace).
139-
Get(context.Background(), draDaemonsetName, metav1.GetOptions{})
156+
DaemonSets(namespace).
157+
Get(context.Background(), daemonsetName, metav1.GetOptions{})
140158
if err != nil {
141-
return false, fmt.Errorf("failed to get %s: %v", draDaemonsetName, err)
159+
return false, fmt.Errorf("failed to get %s: %v", daemonsetName, err)
142160
}
143161
ready := ds.Status.NumberReady == ds.Status.DesiredNumberScheduled
144162
if !ready {
145163
klog.V(2).Infof("%s is not ready, "+
146-
"DesiredNumberScheduled: %d, NumberReady: %d", draDaemonsetName, ds.Status.DesiredNumberScheduled, ds.Status.NumberReady)
164+
"DesiredNumberScheduled: %d, NumberReady: %d", daemonsetName, ds.Status.DesiredNumberScheduled, ds.Status.NumberReady)
147165
}
148166
return ready, nil
149167
}
150168

151-
func isResourceSlicesPublished(config *dependency.Config) (bool, error) {
152-
workerCount := int(getWorkerCount(config).(float64))
169+
func isResourceSlicesPublished(config *dependency.Config, namespace string) (bool, error) {
170+
// Get a list of all nodes
171+
// nodes, err := getReadyNodesCount(config)
172+
// if err != nil {
173+
// return false, fmt.Errorf("failed to list nodes: %v", err)
174+
// }
153175

154-
resourceSlices, err := config.ClusterFramework.GetClientSets().GetClient().ResourceV1beta1().ResourceSlices().List(context.Background(), metav1.ListOptions{})
176+
driverPluginPods, err := getDriverPluginPods(config, namespace, draDaemonsetName)
177+
if err != nil {
178+
return false, fmt.Errorf("failed to list driverPluginPods: %v", err)
179+
}
180+
181+
workerCount := driverPluginPods
182+
183+
resourceSlices, err := config.ClusterFramework.GetClientSets().GetClient().ResourceV1().ResourceSlices().List(context.Background(), metav1.ListOptions{})
155184
if err != nil {
156185
return false, fmt.Errorf("failed to list resourceslices: %v", err)
157186
}
@@ -163,6 +192,26 @@ func isResourceSlicesPublished(config *dependency.Config) (bool, error) {
163192
return true, nil
164193
}
165194

195+
func getDriverPluginPods(config *dependency.Config, namespace string, namePrefix string) (int, error) {
196+
pods, err := config.ClusterFramework.GetClientSets().GetClient().CoreV1().Pods(namespace).List(context.Background(), metav1.ListOptions{})
197+
if err != nil {
198+
return 0, fmt.Errorf("failed to list pods in namespace %s: %w", namespace, err)
199+
}
200+
201+
runningPods := 0
202+
for _, pod := range pods.Items {
203+
if !strings.HasPrefix(pod.Name, namePrefix) {
204+
continue
205+
}
206+
207+
if pod.Status.Phase == corev1.PodRunning {
208+
runningPods++
209+
}
210+
}
211+
212+
return runningPods, nil
213+
}
214+
166215
func getWorkerCount(config *dependency.Config) interface{} {
167216
workerCount, ok := config.Params["WorkerNodeCount"]
168217
if !ok {
@@ -171,6 +220,43 @@ func getWorkerCount(config *dependency.Config) interface{} {
171220
return workerCount
172221
}
173222

223+
func getNamespace(config *dependency.Config) (string, error) {
224+
namespace, ok := config.Params["Namespace"]
225+
if !ok {
226+
namespace = draNamespace
227+
}
228+
namespaceString, ok := namespace.(string)
229+
230+
if !ok {
231+
return "", fmt.Errorf("namespace parameter is not a string: %v", namespace)
232+
}
233+
return namespaceString, nil
234+
}
235+
236+
func getManifests(config *dependency.Config) (string, error) {
237+
manifests, ok := config.Params["Manifests"]
238+
if !ok {
239+
manifests = draManifests
240+
}
241+
manifestsString, ok := manifests.(string)
242+
if !ok {
243+
return "", fmt.Errorf("manifests parameter is not a string: %v", manifests)
244+
}
245+
return "manifests/" + manifestsString + "/*.yaml", nil
246+
}
247+
248+
func getDaemonset(config *dependency.Config) (string, error) {
249+
daemonsetName, ok := config.Params["DaemonsetName"]
250+
if !ok {
251+
daemonsetName = draDaemonsetName
252+
}
253+
daemonsetNameString, ok := daemonsetName.(string)
254+
if !ok {
255+
return "", fmt.Errorf("DaemonsetName parameter is not a string: %v", daemonsetName)
256+
}
257+
return daemonsetNameString, nil
258+
}
259+
174260
// String returns string representation of this dependency.
175261
func (d *draDependency) String() string {
176262
return draDependencyName

clusterloader2/pkg/dependency/dra/manifests/clusterrole.yaml renamed to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/clusterrole.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,4 @@ rules:
1313
verbs: ["get"]
1414
- apiGroups: ["resource.k8s.io"]
1515
resources: ["resourceslices"]
16-
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
16+
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]

clusterloader2/pkg/dependency/dra/manifests/clusterrolebinding.yaml renamed to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/clusterrolebinding.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ subjects:
1111
roleRef:
1212
kind: ClusterRole
1313
name: dra-example-driver-role
14-
apiGroup: rbac.authorization.k8s.io
14+
apiGroup: rbac.authorization.k8s.io

clusterloader2/pkg/dependency/dra/manifests/kubeletplugin.yaml renamed to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/kubeletplugin.yaml

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@ metadata:
66
name: dra-example-driver-kubeletplugin
77
namespace: {{.Namespace}}
88
labels:
9-
helm.sh/chart: dra-example-driver-0.1.3
9+
helm.sh/chart: dra-example-driver-0.0.0-dev
1010
app.kubernetes.io/name: dra-example-driver
1111
app.kubernetes.io/instance: dra-example-driver
12-
app.kubernetes.io/version: "v0.1.0"
12+
app.kubernetes.io/version: "v0.2.0"
13+
app.kubernetes.io/managed-by: Helm
1314
app.kubernetes.io/component: kubeletplugin
1415
spec:
1516
selector:
@@ -26,22 +27,33 @@ spec:
2627
app.kubernetes.io/instance: dra-example-driver
2728
app.kubernetes.io/component: kubeletplugin
2829
spec:
29-
priorityClassName: system-node-critical
3030
serviceAccountName: dra-example-driver-service-account
3131
securityContext:
3232
{}
3333
containers:
3434
- name: plugin
3535
securityContext:
3636
privileged: true
37-
image: registry.k8s.io/dra-example-driver/dra-example-driver:v0.1.0
38-
imagePullPolicy: IfNotPresent
37+
# image: /:v0.2.0
38+
image: registry.k8s.io/dra-example-driver/dra-example-driver:v0.2.0
39+
imagePullPolicy: Always
3940
command: ["dra-example-kubeletplugin"]
4041
resources:
4142
{}
43+
44+
livenessProbe:
45+
grpc:
46+
port: 51515
47+
service: liveness
48+
failureThreshold: 3
49+
periodSeconds: 10
4250
env:
4351
- name: CDI_ROOT
4452
value: /var/run/cdi
53+
- name: KUBELET_REGISTRAR_DIRECTORY_PATH
54+
value: "/var/lib/kubelet/plugins_registry"
55+
- name: KUBELET_PLUGINS_DIRECTORY_PATH
56+
value: "/var/lib/kubelet/plugins"
4557
- name: NODE_NAME
4658
valueFrom:
4759
fieldRef:
@@ -53,20 +65,26 @@ spec:
5365
# Simulated number of devices the example driver will pretend to have.
5466
- name: NUM_DEVICES
5567
value: "8"
68+
- name: HEALTHCHECK_PORT
69+
value: "51515"
5670
volumeMounts:
5771
- name: plugins-registry
58-
mountPath: /var/lib/kubelet/plugins_registry
72+
mountPath: "/var/lib/kubelet/plugins_registry"
5973
- name: plugins
60-
mountPath: /var/lib/kubelet/plugins
74+
mountPath: "/var/lib/kubelet/plugins"
6175
- name: cdi
6276
mountPath: /var/run/cdi
6377
volumes:
6478
- name: plugins-registry
6579
hostPath:
66-
path: /var/lib/kubelet/plugins_registry
80+
path: "/var/lib/kubelet/plugins_registry"
6781
- name: plugins
6882
hostPath:
69-
path: /var/lib/kubelet/plugins
83+
path: "/var/lib/kubelet/plugins"
7084
- name: cdi
7185
hostPath:
7286
path: /var/run/cdi
87+
tolerations:
88+
- effect: NoSchedule
89+
key: google.com/tpu
90+
operator: Exists

clusterloader2/pkg/dependency/dra/manifests/serviceaccount.yaml renamed to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/serviceaccount.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ metadata:
66
name: dra-example-driver-service-account
77
namespace: {{.Namespace}}
88
labels:
9-
helm.sh/chart: dra-example-driver-0.1.3
9+
helm.sh/chart: dra-example-driver-0.0.0-dev
1010
app.kubernetes.io/name: dra-example-driver
1111
app.kubernetes.io/instance: dra-example-driver
12-
app.kubernetes.io/version: "v0.1.0"
12+
app.kubernetes.io/version: "v0.2.0"
13+
app.kubernetes.io/managed-by: Helm

clusterloader2/pkg/dependency/dra/manifests/validatingadmissionpolicy.yaml renamed to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/validatingadmissionpolicy.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ spec:
99
matchConstraints:
1010
resourceRules:
1111
- apiGroups: ["resource.k8s.io"]
12-
apiVersions: ["v1beta1"]
12+
apiVersions: ["v1"]
1313
operations: ["CREATE", "UPDATE", "DELETE"]
1414
resources: ["resourceslices"]
1515
matchConditions:
@@ -30,4 +30,4 @@ spec:
3030
- expression: variables.userNodeName == variables.objectNodeName
3131
messageExpression: >-
3232
"this user running on node '"+variables.userNodeName+"' may not modify " +
33-
(variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'")
33+
(variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'")

0 commit comments

Comments
 (0)