diff --git a/clusterloader2/pkg/dependency/dra/dra.go b/clusterloader2/pkg/dependency/dra/dra.go index 710a35a6b6..cd82aa67fb 100644 --- a/clusterloader2/pkg/dependency/dra/dra.go +++ b/clusterloader2/pkg/dependency/dra/dra.go @@ -20,8 +20,10 @@ import ( "context" "embed" "fmt" + "strings" "time" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" @@ -31,16 +33,16 @@ import ( ) const ( - draDependencyName = "DRATestDriver" - //TODO: this needs to be converted into a parameter. Will will not need this until parititionable devices test + draDependencyName = "DRATestDriver" draNamespace = "dra-example-driver" + draManifests = "dra-example-driver" defaultWorkerNodeCount = "100" draDaemonsetName = "dra-example-driver-kubeletplugin" checkDRAReadyInterval = 30 * time.Second defaultDRATimeout = 10 * time.Minute ) -//go:embed manifests/*.yaml +//go:embed manifests/**/*.yaml var manifestsFS embed.FS func init() { @@ -57,13 +59,24 @@ type draDependency struct{} func (d *draDependency) Setup(config *dependency.Config) error { klog.V(2).Infof("%s: Installing DRA example driver", d) - if err := client.CreateNamespace(config.ClusterFramework.GetClientSets().GetClient(), draNamespace); err != nil { - return fmt.Errorf("namespace %s creation error: %v", draNamespace, err) + + namespace, err := getNamespace(config) + if err != nil { + return err } - namespace, ok := config.Params["Namespace"] - if !ok { - namespace = draNamespace + if err := client.CreateNamespace(config.ClusterFramework.GetClientSets().GetClient(), namespace); err != nil { + return fmt.Errorf("namespace %s creation error: %v", namespace, err) + } + + manifests, err := getManifests(config) + if err != nil { + return err + } + + daemonsetName, err := getDaemonset(config) + if err != nil { + return err } mapping := map[string]interface{}{ @@ -76,7 +89,7 @@ func (d *draDependency) Setup(config *dependency.Config) error { } if err := config.ClusterFramework.ApplyTemplatedManifests( manifestsFS, - "manifests/*.yaml", + manifests, mapping, client.Retry(client.IsRetryableAPIError), ); err != nil { @@ -86,8 +99,8 @@ func (d *draDependency) Setup(config *dependency.Config) error { if err != nil { return err } - klog.V(2).Infof("%s: checking if DRA driver %s is healthy", d, draDaemonsetName) - if err := d.waitForDRADriverToBeHealthy(config, timeout); err != nil { + klog.V(2).Infof("%s: checking if DRA driver %s is healthy", d, daemonsetName) + if err := d.waitForDRADriverToBeHealthy(config, timeout, daemonsetName, namespace); err != nil { return err } @@ -98,12 +111,17 @@ func (d *draDependency) Setup(config *dependency.Config) error { func (d *draDependency) Teardown(config *dependency.Config) error { klog.V(2).Infof("%s: Tearing down DRA example driver", d) + namespace, err := getNamespace(config) + if err != nil { + return err + } + // Delete namespace (this will delete all resources in it) - if err := client.DeleteNamespace(config.ClusterFramework.GetClientSets().GetClient(), draNamespace); err != nil { - return fmt.Errorf("deleting %s namespace error: %v", draNamespace, err) + if err := client.DeleteNamespace(config.ClusterFramework.GetClientSets().GetClient(), namespace); err != nil { + return fmt.Errorf("deleting %s namespace error: %v", namespace, err) } - if err := client.WaitForDeleteNamespace(config.ClusterFramework.GetClientSets().GetClient(), draNamespace, client.DefaultNamespaceDeletionTimeout); err != nil { + if err := client.WaitForDeleteNamespace(config.ClusterFramework.GetClientSets().GetClient(), namespace, client.DefaultNamespaceDeletionTimeout); err != nil { return err } @@ -111,12 +129,12 @@ func (d *draDependency) Teardown(config *dependency.Config) error { return nil } -func (d *draDependency) waitForDRADriverToBeHealthy(config *dependency.Config, timeout time.Duration) error { +func (d *draDependency) waitForDRADriverToBeHealthy(config *dependency.Config, timeout time.Duration, daemonsetName string, namespace string) error { if err := wait.PollImmediate( checkDRAReadyInterval, timeout, func() (done bool, err error) { - return d.isDRADriverReady(config) + return d.isDRADriverReady(config, daemonsetName, namespace) }); err != nil { return err } @@ -124,34 +142,45 @@ func (d *draDependency) waitForDRADriverToBeHealthy(config *dependency.Config, t checkDRAReadyInterval, timeout, func() (done bool, err error) { - return isResourceSlicesPublished(config) + return isResourceSlicesPublished(config, namespace) }); err != nil { return err } return nil } -func (d *draDependency) isDRADriverReady(config *dependency.Config) (done bool, err error) { +func (d *draDependency) isDRADriverReady(config *dependency.Config, daemonsetName string, namespace string) (done bool, err error) { ds, err := config.ClusterFramework.GetClientSets(). GetClient(). AppsV1(). - DaemonSets(draNamespace). - Get(context.Background(), draDaemonsetName, metav1.GetOptions{}) + DaemonSets(namespace). + Get(context.Background(), daemonsetName, metav1.GetOptions{}) if err != nil { - return false, fmt.Errorf("failed to get %s: %v", draDaemonsetName, err) + return false, fmt.Errorf("failed to get %s: %v", daemonsetName, err) } ready := ds.Status.NumberReady == ds.Status.DesiredNumberScheduled if !ready { klog.V(2).Infof("%s is not ready, "+ - "DesiredNumberScheduled: %d, NumberReady: %d", draDaemonsetName, ds.Status.DesiredNumberScheduled, ds.Status.NumberReady) + "DesiredNumberScheduled: %d, NumberReady: %d", daemonsetName, ds.Status.DesiredNumberScheduled, ds.Status.NumberReady) } return ready, nil } -func isResourceSlicesPublished(config *dependency.Config) (bool, error) { - workerCount := int(getWorkerCount(config).(float64)) +func isResourceSlicesPublished(config *dependency.Config, namespace string) (bool, error) { + // Get a list of all nodes + // nodes, err := getReadyNodesCount(config) + // if err != nil { + // return false, fmt.Errorf("failed to list nodes: %v", err) + // } - resourceSlices, err := config.ClusterFramework.GetClientSets().GetClient().ResourceV1beta1().ResourceSlices().List(context.Background(), metav1.ListOptions{}) + driverPluginPods, err := getDriverPluginPods(config, namespace, draDaemonsetName) + if err != nil { + return false, fmt.Errorf("failed to list driverPluginPods: %v", err) + } + + workerCount := driverPluginPods + + resourceSlices, err := config.ClusterFramework.GetClientSets().GetClient().ResourceV1().ResourceSlices().List(context.Background(), metav1.ListOptions{}) if err != nil { return false, fmt.Errorf("failed to list resourceslices: %v", err) } @@ -163,6 +192,26 @@ func isResourceSlicesPublished(config *dependency.Config) (bool, error) { return true, nil } +func getDriverPluginPods(config *dependency.Config, namespace string, namePrefix string) (int, error) { + pods, err := config.ClusterFramework.GetClientSets().GetClient().CoreV1().Pods(namespace).List(context.Background(), metav1.ListOptions{}) + if err != nil { + return 0, fmt.Errorf("failed to list pods in namespace %s: %w", namespace, err) + } + + runningPods := 0 + for _, pod := range pods.Items { + if !strings.HasPrefix(pod.Name, namePrefix) { + continue + } + + if pod.Status.Phase == corev1.PodRunning { + runningPods++ + } + } + + return runningPods, nil +} + func getWorkerCount(config *dependency.Config) interface{} { workerCount, ok := config.Params["WorkerNodeCount"] if !ok { @@ -171,6 +220,43 @@ func getWorkerCount(config *dependency.Config) interface{} { return workerCount } +func getNamespace(config *dependency.Config) (string, error) { + namespace, ok := config.Params["Namespace"] + if !ok { + namespace = draNamespace + } + namespaceString, ok := namespace.(string) + + if !ok { + return "", fmt.Errorf("namespace parameter is not a string: %v", namespace) + } + return namespaceString, nil +} + +func getManifests(config *dependency.Config) (string, error) { + manifests, ok := config.Params["Manifests"] + if !ok { + manifests = draManifests + } + manifestsString, ok := manifests.(string) + if !ok { + return "", fmt.Errorf("manifests parameter is not a string: %v", manifests) + } + return "manifests/" + manifestsString + "/*.yaml", nil +} + +func getDaemonset(config *dependency.Config) (string, error) { + daemonsetName, ok := config.Params["DaemonsetName"] + if !ok { + daemonsetName = draDaemonsetName + } + daemonsetNameString, ok := daemonsetName.(string) + if !ok { + return "", fmt.Errorf("DaemonsetName parameter is not a string: %v", daemonsetName) + } + return daemonsetNameString, nil +} + // String returns string representation of this dependency. func (d *draDependency) String() string { return draDependencyName diff --git a/clusterloader2/pkg/dependency/dra/manifests/clusterrole.yaml b/clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/clusterrole.yaml similarity index 97% rename from clusterloader2/pkg/dependency/dra/manifests/clusterrole.yaml rename to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/clusterrole.yaml index d36887e110..5fc6f7c81b 100644 --- a/clusterloader2/pkg/dependency/dra/manifests/clusterrole.yaml +++ b/clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/clusterrole.yaml @@ -13,4 +13,4 @@ rules: verbs: ["get"] - apiGroups: ["resource.k8s.io"] resources: ["resourceslices"] - verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] \ No newline at end of file diff --git a/clusterloader2/pkg/dependency/dra/manifests/clusterrolebinding.yaml b/clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/clusterrolebinding.yaml similarity index 90% rename from clusterloader2/pkg/dependency/dra/manifests/clusterrolebinding.yaml rename to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/clusterrolebinding.yaml index 084d455fed..2bcbd4545a 100644 --- a/clusterloader2/pkg/dependency/dra/manifests/clusterrolebinding.yaml +++ b/clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/clusterrolebinding.yaml @@ -11,4 +11,4 @@ subjects: roleRef: kind: ClusterRole name: dra-example-driver-role - apiGroup: rbac.authorization.k8s.io + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/clusterloader2/pkg/dependency/dra/manifests/deviceclass.yaml b/clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/deviceclass.yaml similarity index 100% rename from clusterloader2/pkg/dependency/dra/manifests/deviceclass.yaml rename to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/deviceclass.yaml diff --git a/clusterloader2/pkg/dependency/dra/manifests/kubeletplugin.yaml b/clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/kubeletplugin.yaml similarity index 66% rename from clusterloader2/pkg/dependency/dra/manifests/kubeletplugin.yaml rename to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/kubeletplugin.yaml index f1a268e64e..e1affdcf72 100644 --- a/clusterloader2/pkg/dependency/dra/manifests/kubeletplugin.yaml +++ b/clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/kubeletplugin.yaml @@ -6,10 +6,11 @@ metadata: name: dra-example-driver-kubeletplugin namespace: {{.Namespace}} labels: - helm.sh/chart: dra-example-driver-0.1.3 + helm.sh/chart: dra-example-driver-0.0.0-dev app.kubernetes.io/name: dra-example-driver app.kubernetes.io/instance: dra-example-driver - app.kubernetes.io/version: "v0.1.0" + app.kubernetes.io/version: "v0.2.0" + app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: kubeletplugin spec: selector: @@ -26,7 +27,6 @@ spec: app.kubernetes.io/instance: dra-example-driver app.kubernetes.io/component: kubeletplugin spec: - priorityClassName: system-node-critical serviceAccountName: dra-example-driver-service-account securityContext: {} @@ -34,14 +34,26 @@ spec: - name: plugin securityContext: privileged: true - image: registry.k8s.io/dra-example-driver/dra-example-driver:v0.1.0 - imagePullPolicy: IfNotPresent + # image: /:v0.2.0 + image: registry.k8s.io/dra-example-driver/dra-example-driver:v0.2.0 + imagePullPolicy: Always command: ["dra-example-kubeletplugin"] resources: {} + + livenessProbe: + grpc: + port: 51515 + service: liveness + failureThreshold: 3 + periodSeconds: 10 env: - name: CDI_ROOT value: /var/run/cdi + - name: KUBELET_REGISTRAR_DIRECTORY_PATH + value: "/var/lib/kubelet/plugins_registry" + - name: KUBELET_PLUGINS_DIRECTORY_PATH + value: "/var/lib/kubelet/plugins" - name: NODE_NAME valueFrom: fieldRef: @@ -53,20 +65,26 @@ spec: # Simulated number of devices the example driver will pretend to have. - name: NUM_DEVICES value: "8" + - name: HEALTHCHECK_PORT + value: "51515" volumeMounts: - name: plugins-registry - mountPath: /var/lib/kubelet/plugins_registry + mountPath: "/var/lib/kubelet/plugins_registry" - name: plugins - mountPath: /var/lib/kubelet/plugins + mountPath: "/var/lib/kubelet/plugins" - name: cdi mountPath: /var/run/cdi volumes: - name: plugins-registry hostPath: - path: /var/lib/kubelet/plugins_registry + path: "/var/lib/kubelet/plugins_registry" - name: plugins hostPath: - path: /var/lib/kubelet/plugins + path: "/var/lib/kubelet/plugins" - name: cdi hostPath: path: /var/run/cdi + tolerations: + - effect: NoSchedule + key: google.com/tpu + operator: Exists \ No newline at end of file diff --git a/clusterloader2/pkg/dependency/dra/manifests/resourceQuota.yaml b/clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/resourceQuota.yaml similarity index 100% rename from clusterloader2/pkg/dependency/dra/manifests/resourceQuota.yaml rename to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/resourceQuota.yaml diff --git a/clusterloader2/pkg/dependency/dra/manifests/serviceaccount.yaml b/clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/serviceaccount.yaml similarity index 69% rename from clusterloader2/pkg/dependency/dra/manifests/serviceaccount.yaml rename to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/serviceaccount.yaml index 0d680d96eb..5d81813eca 100644 --- a/clusterloader2/pkg/dependency/dra/manifests/serviceaccount.yaml +++ b/clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/serviceaccount.yaml @@ -6,7 +6,8 @@ metadata: name: dra-example-driver-service-account namespace: {{.Namespace}} labels: - helm.sh/chart: dra-example-driver-0.1.3 + helm.sh/chart: dra-example-driver-0.0.0-dev app.kubernetes.io/name: dra-example-driver app.kubernetes.io/instance: dra-example-driver - app.kubernetes.io/version: "v0.1.0" + app.kubernetes.io/version: "v0.2.0" + app.kubernetes.io/managed-by: Helm \ No newline at end of file diff --git a/clusterloader2/pkg/dependency/dra/manifests/validatingadmissionpolicy.yaml b/clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/validatingadmissionpolicy.yaml similarity index 93% rename from clusterloader2/pkg/dependency/dra/manifests/validatingadmissionpolicy.yaml rename to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/validatingadmissionpolicy.yaml index 6e0d0d2a91..dee2569b4d 100644 --- a/clusterloader2/pkg/dependency/dra/manifests/validatingadmissionpolicy.yaml +++ b/clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/validatingadmissionpolicy.yaml @@ -9,7 +9,7 @@ spec: matchConstraints: resourceRules: - apiGroups: ["resource.k8s.io"] - apiVersions: ["v1beta1"] + apiVersions: ["v1"] operations: ["CREATE", "UPDATE", "DELETE"] resources: ["resourceslices"] matchConditions: @@ -30,4 +30,4 @@ spec: - expression: variables.userNodeName == variables.objectNodeName messageExpression: >- "this user running on node '"+variables.userNodeName+"' may not modify " + - (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'") + (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'") \ No newline at end of file diff --git a/clusterloader2/pkg/dependency/dra/manifests/validatingadmissionpolicybinding.yaml b/clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/validatingadmissionpolicybinding.yaml similarity index 100% rename from clusterloader2/pkg/dependency/dra/manifests/validatingadmissionpolicybinding.yaml rename to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/validatingadmissionpolicybinding.yaml diff --git a/clusterloader2/pkg/measurement/common/slos/resourceclaim_allocation_latency.go b/clusterloader2/pkg/measurement/common/slos/resourceclaim_allocation_latency.go index 7633e29cdb..07d483bd41 100644 --- a/clusterloader2/pkg/measurement/common/slos/resourceclaim_allocation_latency.go +++ b/clusterloader2/pkg/measurement/common/slos/resourceclaim_allocation_latency.go @@ -24,7 +24,7 @@ import ( "time" corev1 "k8s.io/api/core/v1" - resourcev1beta2 "k8s.io/api/resource/v1beta2" + resourcev1 "k8s.io/api/resource/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/watch" @@ -145,18 +145,16 @@ func (m *resourceClaimAllocationLatencyMeasurement) start(c clientset.Interface) m.isRunning = true m.stopCh = make(chan struct{}) m.client = c - lw := &cache.ListWatch{ ListFunc: func(options metav1.ListOptions) (runtime.Object, error) { m.selector.ApplySelectors(&options) - return c.ResourceV1beta2().ResourceClaims(m.selector.Namespace).List(context.TODO(), options) + return c.ResourceV1().ResourceClaims(m.selector.Namespace).List(context.TODO(), options) }, WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) { m.selector.ApplySelectors(&options) - return c.ResourceV1beta2().ResourceClaims(m.selector.Namespace).Watch(context.TODO(), options) + return c.ResourceV1().ResourceClaims(m.selector.Namespace).Watch(context.TODO(), options) }, } - claimInf := informer.NewInformer(lw, m.addEvent) podLW := &cache.ListWatch{ @@ -219,7 +217,7 @@ func (m *resourceClaimAllocationLatencyMeasurement) processEvent(ev *claimEventD return } - claim, ok := ev.obj.(*resourcev1beta2.ResourceClaim) + claim, ok := ev.obj.(*resourcev1.ResourceClaim) if !ok { return } @@ -303,7 +301,7 @@ func (m *resourceClaimAllocationLatencyMeasurement) gather(_ clientset.Interface return []measurement.Summary{measurement.CreateSummary(summaryName, "json", content)}, err } -func isAllocated(claim *resourcev1beta2.ResourceClaim) bool { +func isAllocated(claim *resourcev1.ResourceClaim) bool { return claim.Status.Allocation != nil || len(claim.Status.ReservedFor) > 0 || len(claim.Status.Devices) > 0 } @@ -316,7 +314,7 @@ func usesResourceClaimTemplate(p *corev1.Pod) bool { return false } -func (m *resourceClaimAllocationLatencyMeasurement) getCachedPodCreateTime(cl *resourcev1beta2.ResourceClaim) (time.Time, bool) { +func (m *resourceClaimAllocationLatencyMeasurement) getCachedPodCreateTime(cl *resourcev1.ResourceClaim) (time.Time, bool) { for _, o := range cl.OwnerReferences { if o.Kind == "Pod" && o.Name != "" { key := fmt.Sprintf("%s/%s", cl.Namespace, o.Name) @@ -329,7 +327,7 @@ func (m *resourceClaimAllocationLatencyMeasurement) getCachedPodCreateTime(cl *r return time.Time{}, false } -func (m *resourceClaimAllocationLatencyMeasurement) fetchPodCreateTime(cl *resourcev1beta2.ResourceClaim) (time.Time, bool) { +func (m *resourceClaimAllocationLatencyMeasurement) fetchPodCreateTime(cl *resourcev1.ResourceClaim) (time.Time, bool) { for _, o := range cl.OwnerReferences { if o.Kind == "Pod" && o.Name != "" { atomic.AddInt64(&m.podGetCalls, 1) diff --git a/clusterloader2/testing/dra/resourceclaimtemplate.yaml b/clusterloader2/testing/dra/resourceclaimtemplate.yaml index 053590bb29..2c64f505a4 100644 --- a/clusterloader2/testing/dra/resourceclaimtemplate.yaml +++ b/clusterloader2/testing/dra/resourceclaimtemplate.yaml @@ -1,4 +1,4 @@ -apiVersion: resource.k8s.io/v1beta1 +apiVersion: resource.k8s.io/v1 kind: ResourceClaimTemplate metadata: name: {{.Name}} @@ -7,4 +7,5 @@ spec: devices: requests: - name: gpu - deviceClassName: gpu.example.com \ No newline at end of file + exactly: + deviceClassName: gpu.example.com \ No newline at end of file