From 24c2cba232e9177af872f7da87f6e73045912346 Mon Sep 17 00:00:00 2001 From: Yu Li Date: Tue, 15 Jul 2025 23:12:32 +0000 Subject: [PATCH 1/2] Add tpu resource flavor to kueue --- src/xpk/core/kueue.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index a6c6872b4..8da881c08 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -506,6 +506,10 @@ def get_kueue_covered_resources_config( resources: - name: "{resource_type}" nominalQuota: {total_chips} + - name: "cpu" + nominalQuota: 99999999999 + - name: "memory" + nominalQuota: 9999999Ti """ config_string = config_format.format( cluster_hardware_name=cluster_hardware_name, From 832f82ef11a7e232d3b747a3c197b6e316c62aec Mon Sep 17 00:00:00 2001 From: Yu Li Date: Wed, 16 Jul 2025 22:48:30 +0000 Subject: [PATCH 2/2] Add cpu and memory to covered resources --- src/xpk/core/kueue.py | 25 +++++++++++++++++++++---- src/xpk/core/pathways.py | 15 --------------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 8da881c08..6b83b568a 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -28,7 +28,7 @@ run_command_with_updates, run_command_with_updates_retry, ) -from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue +from .pathways import add_pw_resource_flavors from .resources import AutoprovisioningConfig from .scheduling import ( create_accelerator_label, @@ -104,7 +104,6 @@ namespaceSelector: {{}} # match all. 
resourceGroups: {covered_resources_config} - {pw_resources_kueue} {admission_checks} --- apiVersion: kueue.x-k8s.io/v1beta1 @@ -439,6 +438,7 @@ def install_kueue_crs( cluster_hardware_name=cluster_hardware_name, resource_type=resource_type, total_chips=total_chips, + args=args, ) topology_label = '' if system.device_type in [ @@ -463,7 +463,6 @@ def install_kueue_crs( covered_resources_config=covered_resources_config, resource_type=res_type, pw_resource_flavors=add_pw_resource_flavors(args), - pw_resources_kueue=add_pw_resources_to_kueue(args), admission_checks=admission_checks, managed_resource=res_type, cluster_queue_name=CLUSTER_QUEUE_NAME, @@ -487,7 +486,7 @@ def install_kueue_crs( def get_kueue_covered_resources_config( - cluster_hardware_name, resource_type, total_chips + cluster_hardware_name, resource_type, total_chips, args ) -> str: """Gets Kueue covered resources configuration. @@ -501,6 +500,16 @@ def get_kueue_covered_resources_config( """ config_format = """ - coveredResources: ["{resource_type}"] + flavors: + - name: {cluster_hardware_name} + resources: + - name: "{resource_type}" + nominalQuota: {total_chips} + """ + + if args.enable_pathways: + config_format = """ + - coveredResources: ["{resource_type}", "cpu", "memory"] flavors: - name: {cluster_hardware_name} resources: @@ -510,6 +519,14 @@ def get_kueue_covered_resources_config( nominalQuota: 99999999999 - name: "memory" nominalQuota: 9999999Ti + - name: cpu-user + resources: + - name: "{resource_type}" + nominalQuota: 0 + - name: "cpu" + nominalQuota: 480 + - name: "memory" + nominalQuota: 2000G """ config_string = config_format.format( cluster_hardware_name=cluster_hardware_name, diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index 81770eb04..245f4e88b 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -37,21 +37,6 @@ def add_pw_resource_flavors(args): return '' -def add_pw_resources_to_kueue(args): - """Add resource flavors required for Pathways, to the 
cluster queue.""" - resources_yaml = """- coveredResources: ["cpu", "memory"] - flavors: - - name: cpu-user - resources: - - name: "cpu" - nominalQuota: 480 - - name: "memory" - nominalQuota: 2000G""" - if args.enable_pathways: - return resources_yaml - return '' - - def ensure_pathways_workload_prerequisites(args, system) -> bool: """Check all Pathways workload prerequisites and set necessary args.