36 changes: 36 additions & 0 deletions .cursor/rules/requirement.mdc
@@ -0,0 +1,36 @@
---
alwaysApply: true
---

# Project Goals
TensorFusion builds large-scale heterogeneous GPU pooling and scheduling AI infrastructure on cloud-native ecosystem projects and libraries, helping enterprises cut GPU costs, simplify O&M, improve observability, and boost elasticity.

Underlying tech in this repo: Kubebuilder, the scheduler, and CDI. Not in this repo: user-space time-division-sharing fractional GPU, and API-forwarding-based GPU-over-IP.

Critical Modules:
- pod mutating webhook to augment user pods, adding the needed inputs and outputs
- advanced scheduler with allocator, GPU-resource vertical scaler, bin-packing, rebalancer, and quotas
- custom resource operator: GPU cluster -> pool -> gpunode -> gpu, and gpunodeclaim -> node -> gpunode; maintains resources and TensorFusion component status, evaluates alerts, etc.
- hypervisor: works like kubelet; reconciles TensorFusion workers on each GPU node, discovers and binds devices, runs multi-process priority and autoFreeze handlers, produces metrics, etc.
- server: offers the API for assigning remote vGPU workers and exposes system debug endpoints
- cloud provider integration (direct integration or via Karpenter).
- indexallocator: a special module that works around the CDI device plugin Allocate interface not receiving Pod info; without CDI container -> Pod matching, advanced allocation info is unreachable (a hack until Kubernetes DRA is deployed). It composes a special index from a dummy resource name and count and passes it to the hypervisor. This is not the general device plugin pattern; keep this context in mind only when changing device allocation and device-plugin-related functions (a sketch follows this list).
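A minimal sketch of the index-encoding trick described in the indexallocator item, assuming a dummy device ID format of `tf-index-<n>`; the real module's naming and plumbing may differ. The point is that `Allocate` only receives device IDs, never the Pod, so the ID itself must carry the index the hypervisor later uses to recover the Pod's allocation details:

```go
// Hypothetical sketch, not the module's actual code: the dummy device ID
// itself carries the allocation index, because the device plugin Allocate
// call never sees the Pod.
package indexallocator

import (
	"context"
	"fmt"
	"strconv"
	"strings"

	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

type indexAllocator struct{}

// Allocate decodes the index from an assumed ID format "tf-index-<n>" and
// hands it to the hypervisor through an env var on the container.
func (a *indexAllocator) Allocate(ctx context.Context, req *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
	resp := &pluginapi.AllocateResponse{}
	for _, creq := range req.ContainerRequests {
		if len(creq.DevicesIDs) == 0 {
			return nil, fmt.Errorf("no dummy device requested")
		}
		idx, err := strconv.Atoi(strings.TrimPrefix(creq.DevicesIDs[0], "tf-index-"))
		if err != nil {
			return nil, fmt.Errorf("malformed dummy device ID %q: %w", creq.DevicesIDs[0], err)
		}
		resp.ContainerResponses = append(resp.ContainerResponses, &pluginapi.ContainerAllocateResponse{
			// The hypervisor matches this index back to the Pod's allocation.
			Envs: map[string]string{"TF_ALLOCATION_INDEX": strconv.Itoa(idx)},
		})
	}
	return resp, nil
}
```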

# Requirements

You are a professional cloud-native and AI infra engineer. Write high-quality, robust code following Golang and Kubernetes best practices.
Confirm the plan, then write code.
Always be user-centric: for every task, think through the whole user workflow and scenario, and how an AI inference/training app runs on this system; no hidden logic; concise, strongly typed definitions.
Field definitions live in the @api/v1 package; always think through the best data structure when CRD changes are needed.
Don't over-abstract or under-abstract: extract interfaces based on business understanding, and don't extract an interface when it isn't needed.
Extract a function when it grows beyond 50-80 lines; otherwise prefer a simple single function per responsibility.
Use modern Golang features, e.g. any rather than interface{}, and generics where needed.
Never reinvent the wheel: consider how the Kubernetes source and Kubernetes SIGs do things, and leverage the utils and constants packages and already-introduced dependencies.
Always prioritize security, scalability, and maintainability.
Think in terms of the reconcile loop, memory consistency patterns, and the Kubebuilder framework.
Watch for tricky Kubernetes issues: resource conflicts, finalizers, DeepCopy rather than field-by-field assignment, and equality.Semantic.DeepEqual rather than hand-rolled comparison (a sketch follows these requirements).
Never tackle a large task at once; break it into smaller ones.
Only write necessary comments, e.g. for complex algorithms and background info; never write pointless comments.
Always add events via the Kubernetes event recorder, plus logs, on KEY code paths; they matter for user observability and troubleshooting, but don't emit too many events.
Always be test-driven: write Ginkgo-based test cases, keep running go/ginkgo test commands, and review and refactor until the tests pass; if a test fails or underperforms, keep iterating.
When a task introduces new in-memory state, consider exposing it via the server module for troubleshooting.
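A minimal sketch of the DeepCopy plus equality.Semantic.DeepEqual pattern these rules call for, assuming a GPUPool CRD in api/v1; the import path and helper names are illustrative, not this repo's actual code:

```go
// A minimal sketch, assuming a GPUPool CRD in api/v1 (import path and helper
// names are illustrative).
package controller

import (
	"context"

	"k8s.io/apimachinery/pkg/api/equality"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" // assumed module path
)

// reconcileStatus mutates a DeepCopy and compares with
// equality.Semantic.DeepEqual, so no-op Updates (a common source of
// resource conflicts) are skipped entirely.
func reconcileStatus(ctx context.Context, c client.Client, pool *tfv1.GPUPool) (ctrl.Result, error) {
	desired := pool.DeepCopy() // never mutate the informer cache object field by field
	computeDesiredStatus(&desired.Status)

	if equality.Semantic.DeepEqual(pool.Status, desired.Status) {
		return ctrl.Result{}, nil // nothing changed: skip the API call
	}
	if err := c.Status().Update(ctx, desired); err != nil {
		// Conflicts are expected under concurrent reconciles; return the
		// error so controller-runtime requeues and retries.
		return ctrl.Result{}, err
	}
	return ctrl.Result{}, nil
}

func computeDesiredStatus(s *tfv1.GPUPoolStatus) { /* elided: fill in pool status */ }
```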
4 changes: 2 additions & 2 deletions .vscode/launch.json
@@ -61,7 +61,7 @@
"KUBECONFIG": "~/.kube/config-local-studio",
"ENABLE_WEBHOOKS": "false",
"ENABLE_SCHEDULER": "true",
"ENABLE_CR_CONTROLLER": "true",
"ENABLE_CR_CONTROLLER": "false",
"NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION": "true"
},
"args": [
@@ -70,7 +70,7 @@
"--dynamic-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
"--scheduler-config", "${workspaceFolder}/config/samples/scheduler-config.yaml",
// "--enable-alert",
// "--enable-auto-scale",
"--enable-auto-scale",
"--enable-auto-expander",
"-v", "4"
],
155 changes: 82 additions & 73 deletions api/v1/schedulingconfigtemplate_types.go
@@ -17,6 +17,7 @@ limitations under the License.
package v1

import (
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
)
@@ -29,10 +30,12 @@ type SchedulingConfigTemplateSpec struct {

// scale the workload based on the usage and traffic
// +optional
AutoScaling *AutoScalingConfig `json:"autoScaling,omitempty"`
VerticalScalingRules []VerticalScalingRule `json:"verticalScalingRules,omitempty"`

// avoid hot GPU devices and continuously balance the workload
// implemented by trigger a simulation scheduling and advise better GPU nodes for scheduler
// implemented by mark GPU as hot and trigger evict for re-scheduling
// The hot GPUs will get lower priority for scheduling
// TODO: not implemented yet
// +optional
ReBalancer *ReBalancerConfig `json:"reBalancer,omitempty"`

@@ -41,6 +44,14 @@
Hypervisor *HypervisorScheduling `json:"hypervisor,omitempty"`
}

type VerticalScalingRule struct {
Name string `json:"name,omitempty"`

// Rule auto applied in webhook, when pod matches the selector,
// the rule will be added into workload profile's autoScalingConfig and annotation
Selector metav1.LabelSelector `json:"selector,omitempty"`
Rule *AutoScalingConfig `json:"autoScaling,omitempty"`
}
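A hedged sketch of how the webhook might evaluate these selectors; only the `VerticalScalingRule` type comes from this diff, while the matching helper and import path are assumptions:

```go
// Assumed webhook-side matching logic for VerticalScalingRule selectors.
package webhook

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"

	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" // assumed import path
)

// matchRule returns the first rule whose selector matches the Pod's labels;
// the webhook would then merge rule.Rule into the workload profile's
// autoScalingConfig and record the rule name in an annotation.
func matchRule(pod *corev1.Pod, rules []tfv1.VerticalScalingRule) *tfv1.VerticalScalingRule {
	for i := range rules {
		sel, err := metav1.LabelSelectorAsSelector(&rules[i].Selector)
		if err != nil {
			continue // malformed selector: skip rather than block admission
		}
		if sel.Matches(labels.Set(pod.Labels)) {
			return &rules[i]
		}
	}
	return nil
}
```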
type PlacementConfig struct {
// +kubebuilder:default=NodeCompactGPULowLoad
Mode PlacementMode `json:"mode"`
@@ -89,16 +100,13 @@ type GPUFilter struct {
}

type AutoScalingConfig struct {
// layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode
// Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks
AutoSetResources AutoSetResources `json:"autoSetResources,omitempty"`

// layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit
// HPA-like, aggregate metrics data 1m-1h (when tf-worker scaled-up, should also trigger client pod's owner[Deployment etc.]'s replica increasing, check if KNative works)
AutoSetReplicas AutoSetReplicas `json:"autoSetReplicas,omitempty"`
// Adjust baseline requests and limits to match the actual usage using recent metrics
AutoSetResources *AutoSetResources `json:"autoSetResources,omitempty"`

// CronScalingRules defines a list of CronScaling rules used to schedule scaling actions based on cron expressions.
CronScalingRules []CronScalingRule `json:"cronScalingRules,omitempty"`

ExternalScaler *ExternalScalerConfig `json:"externalScaler,omitempty"`
}

// CronScalingRule defines the rule for scaling resources based on a cron schedule.
@@ -115,102 +123,103 @@ type CronScalingRule struct {
End string `json:"end,omitempty"`
// DesiredResources specifies the target resources to scale to during the schedule.
DesiredResources Resources `json:"desiredResources,omitempty"`
// DesiredReplicas is the target number of replicas during the schedule.
DesiredReplicas *int32 `json:"desiredReplicas,omitempty"`
}

type AutoSetResources struct {
Enable bool `json:"enable,omitempty"`

// Target resource to scale, such as "tflops", "vram", or "all" by default
TargetResource string `json:"targetResource,omitempty"`
// Target resource to scale, such as "compute", "vram", or "all" by default
TargetResource ScalingTargetResource `json:"targetResource,omitempty"`

// Tflops usage percentile that will be used as a base for tflops target recommendation. Default: 0.9
TargetTflopsPercentile string `json:"targettflopspercentile,omitempty"`
// Tflops usage percentile that will be used as a base for tflops target recommendation. Default: 0.95
TargetComputePercentile string `json:"targetComputePercentile,omitempty"`

// Tflops usage percentile that will be used for the lower bound on tflops recommendation. Default: 0.5
LowerBoundTflopsPercentile string `json:"lowerboundtflopspercentile,omitempty"`
// When QoS is low or medium, the request is set to the lower bound
LowerBoundComputePercentile string `json:"lowerBoundComputePercentile,omitempty"`

// Tflops usage percentile that will be used for the upper bound on tflops recommendation. Default: 0.95
UpperBoundTflopsPercentile string `json:"upperboundtflopspercentile,omitempty"`
// Tflops usage percentile that will be used for the upper bound on tflops recommendation. Default: 0.99
// Limit will be set to upper bound, when QoS is critical, also set limit request to upper bound
UpperBoundComputePercentile string `json:"upperBoundComputePercentile,omitempty"`

// Vram usage percentile that will be used as a base for vram target recommendation. Default: 0.9
TargetVramPercentile string `json:"targetvrampercentile,omitempty"`
// Vram usage percentile that will be used as a base for vram target recommendation. Default: 0.95
// The request will be set to match this percentile of the actual usage, but won't change while the current request is within the lower and upper bounds
// When QoS is high, set request to target
TargetVRAMPercentile string `json:"targetVRAMPercentile,omitempty"`

// Vram usage percentile that will be used for the lower bound on vram recommendation. Default: 0.5
LowerBoundVramPercentile string `json:"lowerboundvrampercentile,omitempty"`
LowerBoundVRAMPercentile string `json:"lowerBoundVRAMPercentile,omitempty"`

// Vram usage percentile that will be used for the upper bound on vram recommendation. Default: 0.95
UpperBoundVramPercentile string `json:"upperboundvrampercentile,omitempty"`
// Vram usage percentile that will be used for the upper bound on vram recommendation. Default: 0.99
UpperBoundVRAMPercentile string `json:"upperBoundVRAMPercentile,omitempty"`

// Fraction of usage added as the safety margin to the recommended request. Default: 0.15
RequestMarginFraction string `json:"requestMarginFraction,omitempty"`
MarginFraction string `json:"marginFraction,omitempty"`

// The time interval used for computing the confidence multiplier for the lower and upper bound. Default: 24h
ConfidenceInterval string `json:"confidenceInterval,omitempty"`
// Only when the difference between the recommended request and the current request is greater than this threshold, the request will be updated. Default: 0.1
// This value can't be greater than MarginFraction, otherwise no update will ever be made, since the delta always stays inside the threshold after MarginFraction is applied.
UpdateThreshold string `json:"updateThreshold,omitempty"`

// How much time back TSDB have to be queried to get historical metrics. Default: 1d
HistoryLength string `json:"historyLength,omitempty"`
// How far back the TSDB has to be queried to get historical metrics. Default: 2h
HistoryDataPeriod string `json:"historyDataPeriod,omitempty"`

// Resolution at which TSDB is queried for historical metrics. Default: 1m
HistoryResolution string `json:"historyResolution,omitempty"`
}
// Min scaling ratio to original resources, e.g. request 10Gi, ratio 0.5, scale down limit to 5Gi, default: 0.2
MinVRAMResourcesRatio string `json:"minVRAMResourcesRatio,omitempty"`

// A typical autoLimits algorithm could be checking every 5m, look back 1 day data,
// select 99% of actual usage as preferredLimits,
// calculate finalPreferredLimits, which is preferredLimits*(1+extraBufferRatio)
// if they are equal with each other within a range (eg. 5%), do nothing
// if finalPreferredLimits is less than current limits and exceeded error range,
// set current limits to finalPreferredLimits
// if finalPreferredLimits > current limits and exceeded error range,
// set current limits to max(finalPreferredLimits, current limits * scaleUpStep)
// if AI prediction enabled, it helps to detect history pattern, and set more reasonable, explainable limit value
// the final set limits should be max(finalPreferredLimits, last(predict_value * (1 + extraTFlopsBufferRatio)))
type AutoSetLimits struct {
Enable bool `json:"enable,omitempty"`
// Max scaling ratio to original resources, e.g. request 10Gi, ratio 2.0, scale up limit to 20Gi, default: 5.0
MaxVRAMResourcesRatio string `json:"maxVRAMResourcesRatio,omitempty"`

// target resource to scale limits, such as "tflops", "vram", or "all" by default
TargetResource string `json:"targetResource,omitempty"`
// Min scaling ratio to original resources, e.g. request 10Gi, ratio 0.5, scale down limit to 5Gi, default: 0.1
// This ratio only applies to the tflops/compute request rather than the limit, to avoid performance degradation when the GPU is unused for a long time
MinComputeResourcesRatio string `json:"minComputeResourcesRatio,omitempty"`

EvaluationPeriod string `json:"evaluationPeriod,omitempty"`
// Max scaling ratio to original resources, e.g. request 10Gi, ratio 2.0, scale up limit to 20Gi, default: 10.0
MaxComputeResourcesRatio string `json:"maxComputeResourcesRatio,omitempty"`

ExtraTFlopsBufferRatio string `json:"extraTFlopsBufferRatio,omitempty"`
// When workload is created, wait for this period to collect enough metrics before scaling, default: 30m
InitialDelayPeriod string `json:"initialDelayPeriod,omitempty"`

IgnoredDeltaRange string `json:"ignoredDeltaRange,omitempty"`
// How often to evaluate the scaling operation, default: same as global config's auto scaling interval
Interval string `json:"interval,omitempty"`
}
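A simplified sketch of the percentile-plus-margin rule the comments on AutoSetResources describe, reduced to one resource dimension; the exact estimator in the real scaler may differ:

```go
// Assumed semantics: recommended = percentile(usage) * (1 + marginFraction),
// and the request is only rewritten when it drifts past updateThreshold.
package scaler

import (
	"math"
	"sort"
)

// percentile returns the p-quantile (0 < p <= 1) of the samples.
func percentile(samples []float64, p float64) float64 {
	if len(samples) == 0 {
		return 0
	}
	sorted := append([]float64(nil), samples...)
	sort.Float64s(sorted)
	idx := int(math.Ceil(p*float64(len(sorted)))) - 1
	if idx < 0 {
		idx = 0
	}
	return sorted[idx]
}

// recommend returns the new request: a usage percentile plus the safety
// margin, but unchanged while the relative delta stays inside the update
// threshold (hence the warning above that UpdateThreshold must not exceed
// MarginFraction, or no update would ever fire).
func recommend(usage []float64, current, targetPercentile, marginFraction, updateThreshold float64) float64 {
	desired := percentile(usage, targetPercentile) * (1 + marginFraction)
	if current > 0 && math.Abs(desired-current)/current <= updateThreshold {
		return current
	}
	return desired
}
```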

ScaleUpStep string `json:"scaleUpStep,omitempty"`
type ScalingTargetResource string

// the multiplier of requests, to avoid limit set too high, like 5.0
MaxRatioToRequests string `json:"maxRatioToRequests,omitempty"`
const (
ScalingTargetResourceCompute ScalingTargetResource = "compute"
ScalingTargetResourceVRAM ScalingTargetResource = "vram"
ScalingTargetResourceAll ScalingTargetResource = "all"
)

Prediction *SmartSchedulerModelInput `json:"prediction,omitempty"`
}
type ExternalScalerConfig struct {
Enable bool `json:"enable,omitempty"`

// To handle burst traffic, scale up in short time (this feature requires GPU context migration & replication, not available yet)
type AutoSetReplicas struct {
Enable bool `json:"enable,omitempty"`
TargetTFlopsOfLimits string `json:"targetTFlopsOfLimits,omitempty"`
EvaluationPeriod string `json:"evaluationPeriod,omitempty"`
ScaleUpStep string `json:"scaleUpStep,omitempty"`
ScaleDownStep string `json:"scaleDownStep,omitempty"`
ScaleUpCoolDownTime string `json:"scaleUpCoolDownTime,omitempty"`
ScaleDownCoolDownTime string `json:"scaleDownCoolDownTime,omitempty"`
}
URL string `json:"url,omitempty"`

type AutoSetRequests struct {
Enable bool `json:"enable,omitempty"`
// API key will be set into the request header as "Authorization: Bearer <api key>"
APIKeySecretRef *v1.SecretReference `json:"apiKeySecretRef,omitempty"`

// target resource to scale requests, such as "tflops", "vram", or "all" by default
TargetResource string `json:"targetResource,omitempty"`
InitialDelayPeriod string `json:"initialDelayPeriod,omitempty"`

// How often to evaluate the scaling operation, default: same as global config's auto scaling interval
Interval string `json:"interval,omitempty"`
}

type ExternalScalerRequest struct {
WorkloadName string `json:"workloadName,omitempty"`
Namespace string `json:"namespace,omitempty"`
CurrentResources Resources `json:"currentResources,omitempty"`
}

PercentileForAutoRequests string `json:"percentileForAutoRequests,omitempty"`
type ExternalScalerResponse struct {
NeedScaleUp bool `json:"needScaleUp,omitempty"`
NeedScaleDown bool `json:"needScaleDown,omitempty"`

// the request buffer ratio, for example actual usage is 1.0, 10% buffer will be 1.1 as final preferred requests
ExtraBufferRatio string `json:"extraBufferRatio,omitempty"`
// Explains why the scaling operation is or is not needed; recorded to events and the workload status
Reason string `json:"reason,omitempty"`

EvaluationPeriod string `json:"evaluationPeriod,omitempty"`
AggregationPeriod string `json:"aggregationPeriod,omitempty"`
Prediction SmartSchedulerModelInput `json:"prediction,omitempty"`
// If no scaling operation needed, this could be zero value
RecommendedResources Resources `json:"recommendedResources,omitempty"`
}
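A hedged sketch of a client for this external scaler contract; the request/response shapes and the Bearer header follow the types above, while the function name, error handling, and import path are illustrative:

```go
// Assumed client for the ExternalScalerConfig contract defined above.
package scaler

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"net/http"

	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" // assumed import path
)

// callExternalScaler POSTs the workload's current resources to cfg.URL with
// the Bearer token resolved from cfg.APIKeySecretRef, and decodes the advice.
func callExternalScaler(ctx context.Context, cfg *tfv1.ExternalScalerConfig, apiKey string, req *tfv1.ExternalScalerRequest) (*tfv1.ExternalScalerResponse, error) {
	body, err := json.Marshal(req)
	if err != nil {
		return nil, err
	}
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, cfg.URL, bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	httpReq.Header.Set("Content-Type", "application/json")
	// Matches the comment on APIKeySecretRef: "Authorization: Bearer <api key>".
	httpReq.Header.Set("Authorization", "Bearer "+apiKey)

	httpResp, err := http.DefaultClient.Do(httpReq)
	if err != nil {
		return nil, err
	}
	defer httpResp.Body.Close()
	if httpResp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("external scaler returned %s", httpResp.Status)
	}

	resp := &tfv1.ExternalScalerResponse{}
	if err := json.NewDecoder(httpResp.Body).Decode(resp); err != nil {
		return nil, err
	}
	return resp, nil
}
```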

type AutoFreezeAndResume struct {
2 changes: 1 addition & 1 deletion api/v1/workloadprofile_types.go
@@ -79,7 +79,7 @@ type WorkloadProfileSpec struct {
// +optional
// AutoScalingConfig configured here will override Pool's schedulingConfig
// This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation,
// user can set tensor-fusion.ai/auto-resources|replicas: 'true'
// user can set tensor-fusion.ai/autoscale: 'true'
AutoScalingConfig AutoScalingConfig `json:"autoScalingConfig,omitempty"`

// +optional