diff --git a/api/v1alpha1/modelservice_types.go b/api/v1alpha1/modelservice_types.go index 3cd7c73..c164f0d 100644 --- a/api/v1alpha1/modelservice_types.go +++ b/api/v1alpha1/modelservice_types.go @@ -330,6 +330,16 @@ type Parallelism struct { // +kubebuilder:validation:Minimum=0 // +kubebuilder:default=1 Tensor *int32 `json:"tensor,omitempty"` + // +optional + // +nullable + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:default=1 + Data *int32 `json:"data,omitempty"` + // +optional + // +nullable + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:default=1 + DataLocal *int32 `json:"dataLocal,omitempty"` } // AcceleratorTypes specifies set of accelerators for scheduling. diff --git a/go.mod b/go.mod index beab53d..9cdc5a6 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ toolchain go1.24.2 godebug default=go1.23 require ( - github.com/onsi/ginkgo/v2 v2.23.3 + github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 k8s.io/api v0.33.0 k8s.io/apimachinery v0.33.0 @@ -20,6 +20,7 @@ require ( github.com/Masterminds/sprig/v3 v3.3.0 github.com/stretchr/testify v1.10.0 sigs.k8s.io/gateway-api v1.3.0 + sigs.k8s.io/lws v0.6.1 sigs.k8s.io/yaml v1.4.0 ) @@ -32,6 +33,7 @@ require ( github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/shopspring/decimal v1.4.0 // indirect github.com/spf13/cast v1.7.0 // indirect + go.uber.org/automaxprocs v1.6.0 // indirect golang.org/x/crypto v0.38.0 // indirect sigs.k8s.io/randfill v1.0.0 // indirect ) @@ -61,7 +63,7 @@ require ( github.com/google/cel-go v0.23.2 // indirect github.com/google/gnostic-models v0.6.9 // indirect github.com/google/go-cmp v0.7.0 // indirect - github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect + github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect github.com/google/uuid v1.6.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect diff --git a/go.sum b/go.sum index 2f7779e..9ca00af 100644 --- a/go.sum +++ b/go.sum @@ -25,8 +25,8 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emicklei/go-restful/v3 v3.12.0 h1:y2DdzBAURM29NFF94q6RaY4vjIH1rtwDapwQtU84iWk= github.com/emicklei/go-restful/v3 v3.12.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= -github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= +github.com/evanphx/json-patch v4.12.0+incompatible h1:4onqiflcdA9EOZ4RxV643DvftH5pOlLGNtQ5lPWQu84= +github.com/evanphx/json-patch v4.12.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= @@ -68,8 +68,8 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad 
h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg= -github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= @@ -105,8 +105,8 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.23.3 h1:edHxnszytJ4lD9D5Jjc4tiDkPBZ3siDeJJkUZJJVkp0= -github.com/onsi/ginkgo/v2 v2.23.3/go.mod h1:zXTP6xIp3U8aVuXN8ENK9IXRaTjFnpVB9mGmaSRvxnM= +github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= +github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -114,6 +114,8 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= +github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= @@ -170,6 +172,8 @@ go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE= go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -269,6 +273,8 @@ sigs.k8s.io/gateway-api-inference-extension v0.3.0 h1:jLFNxWfG8GeosTa4KWOMr4eTIL sigs.k8s.io/gateway-api-inference-extension v0.3.0/go.mod h1:x6g5FKSs4MsivsIAZJigVEjrvDAtgxNNynoWyid4v28= sigs.k8s.io/json 
v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/lws v0.6.1 h1:cWiRmMSflo8hQPBrmIIZtoaX3XuVkmAgFKkmjxlPULI= +sigs.k8s.io/lws v0.6.1/go.mod h1:aoT5ROMriBtN/H8JH0POBF6e2uyFCOxKGKtXSA3DVV8= sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= diff --git a/internal/controller/child_resources.go b/internal/controller/child_resources.go index 17a7c7f..abd7292 100644 --- a/internal/controller/child_resources.go +++ b/internal/controller/child_resources.go @@ -19,6 +19,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" giev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" + lwsv1 "sigs.k8s.io/lws/api/leaderworkerset/v1" "sigs.k8s.io/yaml" ) @@ -28,19 +29,21 @@ import ( // BaseConfig holds information read from the base configmap type BaseConfig struct { - ConfigMaps []corev1.ConfigMap `json:"configMaps,omitempty"` - PrefillDeployment *appsv1.Deployment `json:"prefillDeployment,omitempty"` - DecodeDeployment *appsv1.Deployment `json:"decodeDeployment,omitempty"` - PrefillService *corev1.Service `json:"prefillService,omitempty"` - DecodeService *corev1.Service `json:"decodeService,omitempty"` - HTTPRoute *gatewayv1.HTTPRoute `json:"httpRoute,omitempty"` - InferencePool *giev1alpha2.InferencePool `json:"inferencePool,omitempty"` - InferenceModel *giev1alpha2.InferenceModel `json:"inferenceModel,omitempty"` - EPPDeployment *appsv1.Deployment `json:"eppDeployment,omitempty"` - EPPService *corev1.Service `json:"eppService,omitempty"` - EPPServiceAccount *corev1.ServiceAccount `json:"eppServiceAccount,omitempty"` - PDServiceAccount *corev1.ServiceAccount `json:"pdServiceAccount,omitempty"` - EPPRoleBinding *rbacv1.RoleBinding `json:"eppRoleBinding,omitempty"` + ConfigMaps []corev1.ConfigMap `json:"configMaps,omitempty"` + PrefillDeployment *appsv1.Deployment `json:"prefillDeployment,omitempty"` + DecodeDeployment *appsv1.Deployment `json:"decodeDeployment,omitempty"` + PrefillLeaderWorkerSet *lwsv1.LeaderWorkerSet `json:"prefillLeaderWorkerSet,omitempty"` + DecodeLeaderWorkerSet *lwsv1.LeaderWorkerSet `json:"decodeLeaderWorkerSet,omitempty"` + PrefillService *corev1.Service `json:"prefillService,omitempty"` + DecodeService *corev1.Service `json:"decodeService,omitempty"` + HTTPRoute *gatewayv1.HTTPRoute `json:"httpRoute,omitempty"` + InferencePool *giev1alpha2.InferencePool `json:"inferencePool,omitempty"` + InferenceModel *giev1alpha2.InferenceModel `json:"inferenceModel,omitempty"` + EPPDeployment *appsv1.Deployment `json:"eppDeployment,omitempty"` + EPPService *corev1.Service `json:"eppService,omitempty"` + EPPServiceAccount *corev1.ServiceAccount `json:"eppServiceAccount,omitempty"` + PDServiceAccount *corev1.ServiceAccount `json:"pdServiceAccount,omitempty"` + EPPRoleBinding *rbacv1.RoleBinding `json:"eppRoleBinding,omitempty"` } // shouldCreateConfigMaps returns True if there is at least one ConfigMap to be created @@ -53,6 +56,10 @@ func (childResource *BaseConfig) shouldCreatePrefillDeployment() bool { return childResource.PrefillDeployment != nil } +func (childResource *BaseConfig) shouldCreatePrefillLeaderWorkerSet() bool { + return 
childResource.PrefillLeaderWorkerSet != nil +} + + // shouldCreatePrefillService returns True if the prefill deployment needs to be created func (childResource *BaseConfig) shouldCreatePrefillService() bool { return childResource.shouldCreatePrefillDeployment() && childResource.PrefillService != nil @@ -63,6 +70,10 @@ func (childResource *BaseConfig) shouldCreateDecodeDeployment() bool { return childResource.DecodeDeployment != nil } +func (childResource *BaseConfig) shouldCreateDecodeLeaderWorkerSet() bool { + return childResource.DecodeLeaderWorkerSet != nil +} + // shouldCreateDecodeService returns True if the decode deployment needs to be created func (childResource *BaseConfig) shouldCreateDecodeService() bool { return childResource.shouldCreateDecodeDeployment() && childResource.DecodeService != nil @@ -278,6 +289,12 @@ func BaseConfigFromCM(cm *corev1.ConfigMap) (*BaseConfig, error) { if err := deserialize("decodeDeployment", &bc.DecodeDeployment); err != nil { return nil, fmt.Errorf("failed to decode decodeDeployment: %w", err) } + if err := deserialize("decodeLeaderWorkerSet", &bc.DecodeLeaderWorkerSet); err != nil { + return nil, fmt.Errorf("failed to decode decodeLeaderWorkerSet: %w", err) + } + if err := deserialize("prefillLeaderWorkerSet", &bc.PrefillLeaderWorkerSet); err != nil { + return nil, fmt.Errorf("failed to decode prefillLeaderWorkerSet: %w", err) + } if err := deserialize("prefillService", &bc.PrefillService); err != nil { return nil, fmt.Errorf("failed to decode prefillService: %w", err) } @@ -316,20 +333,25 @@ func (interpolatedBaseConfig *BaseConfig) MergeChildResources(ctx context.Contex // Step 3: update the child resources // Idea: updates do the mergo merge if modelService.Spec.Prefill != nil || interpolatedBaseConfig.PrefillDeployment != nil { - interpolatedBaseConfig.mergePDDeployment(ctx, modelService, PREFILL_ROLE, scheme) + // interpolatedBaseConfig.mergePDDeployment(ctx, modelService, PREFILL_ROLE, scheme) + interpolatedBaseConfig.mergePDLeaderWorkerSet(ctx, modelService, PREFILL_ROLE, scheme) if interpolatedBaseConfig.PrefillService != nil { interpolatedBaseConfig.mergePDService(ctx, modelService, PREFILL_ROLE, scheme) } } log.FromContext(ctx).V(1).Info("attempting to update decode deployment") if modelService.Spec.Decode != nil || interpolatedBaseConfig.DecodeDeployment != nil { - interpolatedBaseConfig.mergePDDeployment(ctx, modelService, DECODE_ROLE, scheme) + // interpolatedBaseConfig.mergePDDeployment(ctx, modelService, DECODE_ROLE, scheme) + interpolatedBaseConfig.mergePDLeaderWorkerSet(ctx, modelService, DECODE_ROLE, scheme) if interpolatedBaseConfig.DecodeService != nil { interpolatedBaseConfig.mergePDService(ctx, modelService, DECODE_ROLE, scheme) } } - if interpolatedBaseConfig.PrefillDeployment != nil || interpolatedBaseConfig.DecodeDeployment != nil { + if interpolatedBaseConfig.PrefillDeployment != nil || + interpolatedBaseConfig.DecodeDeployment != nil || + interpolatedBaseConfig.PrefillLeaderWorkerSet != nil || + interpolatedBaseConfig.DecodeLeaderWorkerSet != nil { // some pd pods are getting created; set SA and RB here interpolatedBaseConfig.setPDServiceAccount(ctx, modelService, scheme, rbacOptions) } @@ -372,12 +394,12 @@ func (childResource *BaseConfig) mergeConfigMaps(ctx context.Context, msvc *msv1 if strings.TrimSpace(childResource.ConfigMaps[i].Namespace) == "" { childResource.ConfigMaps[i].Namespace = msvc.Namespace } - // Note: there seems to be a controllerutil bug here ...
- // Setting owner ref before setting namespace seems problematic - err := controllerutil.SetOwnerReference(msvc, &childResource.ConfigMaps[i], scheme) - if err != nil { - log.FromContext(ctx).V(1).Error(err, "unable to set owner reference") - } + // // Note: there seems to be a controllerutil bug here ... + // // Setting owner ref before setting namespace seems problematic + // err := controllerutil.SetOwnerReference(msvc, &childResource.ConfigMaps[i], scheme) + // if err != nil { + // log.FromContext(ctx).V(1).Error(err, "unable to set owner reference") + // } } return childResource } @@ -422,11 +444,11 @@ func (childResources *BaseConfig) mergeInferenceModel(ctx context.Context, msvc im.Spec.ModelName = msvc.Spec.Routing.ModelName im.Spec.PoolRef.Name = giev1alpha2.ObjectName(infPoolName(msvc)) - // Set owner reference for the merged service - if err := controllerutil.SetOwnerReference(msvc, im, scheme); err != nil { - log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for inferencepool") - return childResources - } + // // Set owner reference for the merged service + // if err := controllerutil.SetOwnerReference(msvc, im, scheme); err != nil { + // log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for inferencepool") + // return childResources + // } return childResources } @@ -500,11 +522,11 @@ func (childResource *BaseConfig) mergePDService(ctx context.Context, msvc *msv1a return childResource } - // Set owner reference for the merged service - if err := controllerutil.SetOwnerReference(msvc, &destService, scheme); err != nil { - log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for service "+role) - return childResource - } + // // Set owner reference for the merged service + // if err := controllerutil.SetOwnerReference(msvc, &destService, scheme); err != nil { + // log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for service "+role) + // return childResource + // } // Set the merged service for child resource if role == PREFILL_ROLE { @@ -601,11 +623,11 @@ func (childResource *BaseConfig) mergePDDeployment(ctx context.Context, msvc *ms }, } - // Finally, set owner references - err = controllerutil.SetOwnerReference(msvc, desiredDeployment, scheme) - if err != nil { - log.FromContext(ctx).V(1).Error(err, "unable to set owner reference") - } + // // Finally, set owner references + // err = controllerutil.SetOwnerReference(msvc, desiredDeployment, scheme) + // if err != nil { + // log.FromContext(ctx).V(1).Error(err, "unable to set owner reference") + // } // Finally, in Mergo merge... 
// We create a destination deployment object from baseconfig @@ -649,6 +671,156 @@ func (childResource *BaseConfig) mergePDDeployment(ctx context.Context, msvc *ms return childResource } +// mergePDLeaderWorkerSet uses msvc fields to update the childResource prefill/decode LeaderWorkerSet +func (childResource *BaseConfig) mergePDLeaderWorkerSet(ctx context.Context, msvc *msv1alpha1.ModelService, role string, scheme *runtime.Scheme) *BaseConfig { + pdSpec := &msv1alpha1.PDSpec{} + if role == PREFILL_ROLE { + if msvc.Spec.Prefill != nil { + pdSpec = msvc.Spec.Prefill + } + if childResource.PrefillLeaderWorkerSet == nil { + childResource.PrefillLeaderWorkerSet = &lwsv1.LeaderWorkerSet{} + } + } + if role == DECODE_ROLE { + if msvc.Spec.Decode != nil { + pdSpec = msvc.Spec.Decode + } + if childResource.DecodeLeaderWorkerSet == nil { + childResource.DecodeLeaderWorkerSet = &lwsv1.LeaderWorkerSet{} + } + } + + var err error + + // Compute fields needed + podLabels := getPodLabels(ctx, msvc, role) + var nodeAffinity *corev1.Affinity + + // AcceleratorTypes may be nil... TODO: check + na, err := AcceleratorTypesToNodeAffinity(pdSpec.AcceleratorTypes) + if err == nil { + nodeAffinity = &corev1.Affinity{ + NodeAffinity: na, + } + } else { + log.FromContext(ctx).V(1).Error(err, "unable to get node affinity") + } + + // Step 1: Create an empty LeaderWorkerSet + desiredLeaderWorkerSet := &lwsv1.LeaderWorkerSet{ + TypeMeta: metav1.TypeMeta{ + Kind: "LeaderWorkerSet", + APIVersion: "leaderworkerset.x-k8s.io/v1", + }, + + ObjectMeta: metav1.ObjectMeta{ + Name: deploymentName(msvc, role), + Namespace: msvc.Namespace, + + // Define the labels for this PD LeaderWorkerSet + // Same as pod labels + Labels: podLabels, + }, + Spec: lwsv1.LeaderWorkerSetSpec{ + // Define replicas + // Decouple scaling will be handled in the merge + Replicas: pdSpec.Replicas, + + LeaderWorkerTemplate: lwsv1.LeaderWorkerTemplate{ + + LeaderTemplate: &corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + // Define pod labels, must match selector labels + Labels: podLabels, + }, + Spec: corev1.PodSpec{ + // populate containers + InitContainers: convertToContainerSliceWithURIInfo(ctx, pdSpec.InitContainers, msvc), + Containers: convertToContainerSliceWithURIInfo(ctx, pdSpec.Containers, msvc), + + // populate node affinity + Affinity: nodeAffinity, + + // populate service account for PD pods + ServiceAccountName: pdServiceAccountName(msvc), + + // // populate volumes based on URI + // Volumes: getVolumeForPDDeployment(ctx, msvc), + }, + }, + + WorkerTemplate: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + // Define pod labels, must match selector labels + Labels: podLabels, + }, + Spec: corev1.PodSpec{ + // populate containers + Containers: convertToContainerSliceWithURIInfo(ctx, pdSpec.Containers, msvc), + + // populate node affinity + Affinity: nodeAffinity, + + // populate service account for PD pods + ServiceAccountName: pdServiceAccountName(msvc), + + // // populate volumes based on URI + // Volumes: getVolumeForPDDeployment(ctx, msvc), + }, + }, + }, + }, + } + + // // Finally, set owner references + // err = controllerutil.SetOwnerReference(msvc, desiredLeaderWorkerSet, scheme) + // if err != nil { + // log.FromContext(ctx).V(1).Error(err, "unable to set owner reference") + // } + + // Finally, in Mergo merge...
+ // We create a destination LeaderWorkerSet object from baseconfig + // We create a source LeaderWorkerSet object from model service + // We merge source into destination + // We apply the merged destination + + var originalLeaderWorkerSet *lwsv1.LeaderWorkerSet + + log.FromContext(ctx).V(1).Info("merging PD LeaderWorkerSet", "role", role, "desiredLeaderWorkerSet (src)", desiredLeaderWorkerSet) + if role == PREFILL_ROLE { + originalLeaderWorkerSet = childResource.PrefillLeaderWorkerSet + } + if role == DECODE_ROLE { + originalLeaderWorkerSet = childResource.DecodeLeaderWorkerSet + } + + // Mergo merge + log.FromContext(ctx).V(1).Info("merging PD LeaderWorkerSet", "desiredLeaderWorkerSet (dst)", originalLeaderWorkerSet) + if err = mergo.Merge( + originalLeaderWorkerSet, + desiredLeaderWorkerSet, + mergo.WithOverride, + mergo.WithAppendSlice, + mergo.WithTransformers(containerSliceTransformer{})); err != nil { + log.FromContext(ctx).V(1).Error(err, "mergo error") + } else { + + // Log the merged result + // technically we can log using originalLeaderWorkerSet here, but be safe + // and log what's directly stored in the childResource LeaderWorkerSet fields + var mergedLeaderWorkerSet *lwsv1.LeaderWorkerSet + if role == DECODE_ROLE { + mergedLeaderWorkerSet = childResource.DecodeLeaderWorkerSet + } else { + mergedLeaderWorkerSet = childResource.PrefillLeaderWorkerSet + } + log.FromContext(ctx).V(1).Info("merging was successful", "merged leaderWorkerSet", mergedLeaderWorkerSet) + } + + return childResource +} + // setPDServiceAccount defines a servicd account for the P and D deployments func (childResource *BaseConfig) setPDServiceAccount(ctx context.Context, msvc *msv1alpha1.ModelService, scheme *runtime.Scheme, rbacOptions *RBACOptions) *BaseConfig { sa := &corev1.ServiceAccount{ @@ -666,12 +838,12 @@ func (childResource *BaseConfig) setPDServiceAccount(ctx context.Context, msvc * sa.ImagePullSecrets = append(sa.ImagePullSecrets, corev1.LocalObjectReference{Name: name}) } - // Set owner reference for service account - // TODO: should childresource be returned when owner ref is not set? - if err := controllerutil.SetOwnerReference(msvc, sa, scheme); err != nil { - log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for service account") - return childResource - } + // // Set owner reference for service account + // // TODO: should childresource be returned when owner ref is not set? + // if err := controllerutil.SetOwnerReference(msvc, sa, scheme); err != nil { + // log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for service account") + // return childResource + // } childResource.PDServiceAccount = sa @@ -696,10 +868,10 @@ func (childResource *BaseConfig) setEPPServiceAccount(ctx context.Context, msvc childResource.EPPServiceAccount = eppServiceAccount - // TODO: should childresource be returned when owner ref is not set? - if err := controllerutil.SetOwnerReference(msvc, eppServiceAccount, scheme); err != nil { - log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for service account") - } + // // TODO: should childresource be returned when owner ref is not set?
+ // if err := controllerutil.SetOwnerReference(msvc, eppServiceAccount, scheme); err != nil { + // log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for service account") + // } } func (childResource *BaseConfig) setEPPRoleBinding(ctx context.Context, msvc *msv1alpha1.ModelService, rbacOptions *RBACOptions, scheme *runtime.Scheme) { @@ -728,10 +900,10 @@ func (childResource *BaseConfig) setEPPRoleBinding(ctx context.Context, msvc *ms }, } - // Set owner reference for EPPRoleBinding - if err := controllerutil.SetOwnerReference(msvc, childResource.EPPRoleBinding, scheme); err != nil { - log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for epp rolebinding") - } + // // Set owner reference for EPPRoleBinding + // if err := controllerutil.SetOwnerReference(msvc, childResource.EPPRoleBinding, scheme); err != nil { + // log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for epp rolebinding") + // } } @@ -793,11 +965,11 @@ func (childResources *BaseConfig) mergeEppDeployment(ctx context.Context, msvc * return childResources } - // Set owner reference for the merged service - if err := controllerutil.SetOwnerReference(msvc, &dest, scheme); err != nil { - log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for inferencepool") - return childResources - } + // // Set owner reference for the merged service + // if err := controllerutil.SetOwnerReference(msvc, &dest, scheme); err != nil { + // log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for inferencepool") + // return childResources + // } log.FromContext(ctx).V(1).Info("deployment", "post-merge-label", dest.Labels, "post-merge-spec", dest.Spec) // Set the merged epp deployment in the child resource childResources.EPPDeployment = &dest @@ -830,10 +1002,10 @@ func (childResources *BaseConfig) mergeEppService(ctx context.Context, msvc *msv log.FromContext(ctx).V(1).Error(err, "problem with epp service merge") return childResources } - if err := controllerutil.SetOwnerReference(msvc, &dest, scheme); err != nil { - log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for inferencepool") - return childResources - } + // if err := controllerutil.SetOwnerReference(msvc, &dest, scheme); err != nil { + // log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for inferencepool") + // return childResources + // } // Set the merged epp service in the child resource childResources.EPPService = &dest @@ -891,11 +1063,11 @@ func (childResources *BaseConfig) mergeHTTPRoute(ctx context.Context, msvc *msv1 return childResources } - // Set owner reference for the merged service - if err := controllerutil.SetOwnerReference(msvc, &dest, scheme); err != nil { - log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for httproute") - return childResources - } + // // Set owner reference for the merged service + // if err := controllerutil.SetOwnerReference(msvc, &dest, scheme); err != nil { + // log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for httproute") + // return childResources + // } // Set the merged inferncepool in the child resource childResources.HTTPRoute = &dest @@ -943,11 +1115,11 @@ func (childResources *BaseConfig) mergeInferencePool(ctx context.Context, msvc * return childResources } - // Set owner reference for the merged service - if err := controllerutil.SetOwnerReference(msvc, &dest, scheme); err != nil { - log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for inferencepool") - return childResources - } + // // Set owner reference for the 
merged service + // if err := controllerutil.SetOwnerReference(msvc, &dest, scheme); err != nil { + // log.FromContext(ctx).V(1).Error(err, "unable to set owner ref for inferencepool") + // return childResources + // } // Set the merged inferncepool in the child resource childResources.InferencePool = &dest @@ -970,6 +1142,10 @@ func (childResource *BaseConfig) invokeCreateOrUpdate(ctx context.Context, r *Mo results = append(results, createOrUpdatePDDeployment(ctx, r, childResource.PrefillDeployment, msvc.Spec.DecoupleScaling)) } + if childResource.shouldCreatePrefillLeaderWorkerSet() { + results = append(results, createOrUpdatePDLeaderWorkerSet(ctx, r, childResource.PrefillLeaderWorkerSet, msvc.Spec.DecoupleScaling)) + } + if childResource.shouldCreatePrefillService() { results = append(results, createOrUpdateService(ctx, r, childResource.PrefillService)) } @@ -978,6 +1154,10 @@ results = append(results, createOrUpdatePDDeployment(ctx, r, childResource.DecodeDeployment, msvc.Spec.DecoupleScaling)) } + if childResource.shouldCreateDecodeLeaderWorkerSet() { + results = append(results, createOrUpdatePDLeaderWorkerSet(ctx, r, childResource.DecodeLeaderWorkerSet, msvc.Spec.DecoupleScaling)) + } + if childResource.shouldCreateDecodeService() { results = append(results, createOrUpdateService(ctx, r, childResource.DecodeService)) } @@ -1115,6 +1295,50 @@ func createOrUpdatePDDeploymentInCluster(ctx context.Context, r *ModelServiceRec return err } +func createOrUpdatePDLeaderWorkerSetInCluster(ctx context.Context, r *ModelServiceReconciler, desiredObjectState lwsv1.LeaderWorkerSet, emptyObject *lwsv1.LeaderWorkerSet, decoupleScaling bool) error { + var err error + + desiredObjName := desiredObjectState.GetName() + desiredObjNamespace := desiredObjectState.GetNamespace() + + // emptyObject is the object to look for in the cluster + log.FromContext(ctx).V(1).Info("looking to createOrUpdate object in cluster: ", "obj name", desiredObjName, "obj namespace", desiredObjNamespace, "obj kind", desiredObjectState.GetObjectKind()) + + // Set the empty object with the name and namespace so we can look it up in the cluster + // and populate emptyObject with the current state of the object + emptyObject.SetName(desiredObjName) + emptyObject.SetNamespace(desiredObjNamespace) + + op, err := controllerutil.CreateOrUpdate(ctx, r.Client, emptyObject, func() error { + log.FromContext(ctx).V(1).Info("initial replica count", "replica", emptyObject.Spec.Replicas) + + // We should only update replica if decoupleScaling is False + if !emptyObject.GetCreationTimestamp().Time.IsZero() && decoupleScaling { + log.FromContext(ctx).V(1).Info("scaling is decoupled, setting LeaderWorkerSet replica value to in-cluster replica count") + desiredObjectState.Spec.Replicas = nil + } + + // Mergo merge with override means labels, annotations (maps) key-value pairs are preserved + // while other fields are overridden if not nil in desiredObjectState + mergeErr := mergo.Merge(emptyObject, desiredObjectState, mergo.WithOverride) + if mergeErr != nil { + log.FromContext(ctx).V(1).Error(mergeErr, "attempting to merge inside createOrUpdate, but failed for object "+emptyObject.GetName()) + } else { + log.FromContext(ctx).V(1).Info("successfully merged PD LeaderWorkerSet in cluster " + emptyObject.GetName()) + log.FromContext(ctx).V(1).Info("successfully merged PD LeaderWorkerSet in cluster", "replica", emptyObject.Spec.Replicas) + } + + return mergeErr + }) + +
log.FromContext(ctx).V(1).Info("performed createOrUpdate", "obj name", emptyObject.GetName(), "operation", op) + if err != nil { + log.FromContext(ctx).V(1).Error(err, "createOrUpdate failed", "obj name", emptyObject.GetName()) + } + + return err +} + // createOrUpdateConfigMaps creates or updates multiple of ConfigMaps in the cluster func createOrUpdateConfigMaps(ctx context.Context, r *ModelServiceReconciler, desiredConfigMaps []corev1.ConfigMap) []error { var errors []error @@ -1138,6 +1362,11 @@ func createOrUpdatePDDeployment(ctx context.Context, r *ModelServiceReconciler, return createOrUpdatePDDeploymentInCluster(ctx, r, *desiredDeployment, &emptyDeployment, decoupleScaling) } +func createOrUpdatePDLeaderWorkerSet(ctx context.Context, r *ModelServiceReconciler, desiredLeaderWorkerSet *lwsv1.LeaderWorkerSet, decoupleScaling bool) error { + emptyLeaderWorkerSet := lwsv1.LeaderWorkerSet{} + return createOrUpdatePDLeaderWorkerSetInCluster(ctx, r, *desiredLeaderWorkerSet, &emptyLeaderWorkerSet, decoupleScaling) +} + // createOrUpdateService creates or updates a service object in the cluster func createOrUpdateService(ctx context.Context, r *ModelServiceReconciler, desiredService *corev1.Service) error { emptyService := corev1.Service{} diff --git a/internal/controller/modelservice_controller.go b/internal/controller/modelservice_controller.go index 4662b4a..0a3a648 100644 --- a/internal/controller/modelservice_controller.go +++ b/internal/controller/modelservice_controller.go @@ -20,6 +20,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" + lwsv1 "sigs.k8s.io/lws/api/leaderworkerset/v1" msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1" giev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" @@ -56,22 +57,26 @@ type ModelServiceReconciler struct { // Context is intended to be use for interpolating template variables // in BaseConfig type TemplateVars struct { - ModelServiceName string `json:"modelServiceName,omitempty"` - ModelServiceNamespace string `json:"modelServiceNamespace,omitempty"` - ModelName string `json:"modelName,omitempty"` - HFModelName string `json:"hfModelName,omitempty"` - SanitizedModelName string `json:"sanitizedModelName,omitempty"` - ModelPath string `json:"modelPath,omitempty"` - MountedModelPath string `json:"mountedModelPath,omitempty"` - AuthSecretName string `json:"authSecretName,omitempty"` - EPPServiceName string `json:"eppServiceName,omitempty"` - EPPDeploymentName string `json:"eppDeploymentName,omitempty"` - PrefillDeploymentName string `json:"prefillDeploymentName,omitempty"` - DecodeDeploymentName string `json:"decodeDeploymentName,omitempty"` - PrefillServiceName string `json:"prefillServiceName,omitempty"` - DecodeServiceName string `json:"decodeServiceName,omitempty"` - InferencePoolName string `json:"inferencePoolName,omitempty"` - InferenceModelName string `json:"inferenceModelName,omitempty"` + ModelServiceName string `json:"modelServiceName,omitempty"` + ModelServiceNamespace string `json:"modelServiceNamespace,omitempty"` + ModelName string `json:"modelName,omitempty"` + HFModelName string `json:"hfModelName,omitempty"` + SanitizedModelName string `json:"sanitizedModelName,omitempty"` + ModelPath string `json:"modelPath,omitempty"` + MountedModelPath string `json:"mountedModelPath,omitempty"` + AuthSecretName string `json:"authSecretName,omitempty"` + EPPServiceName string `json:"eppServiceName,omitempty"` + EPPDeploymentName 
string `json:"eppDeploymentName,omitempty"` + PrefillDeploymentName string `json:"prefillDeploymentName,omitempty"` + DecodeDeploymentName string `json:"decodeDeploymentName,omitempty"` + PrefillServiceName string `json:"prefillServiceName,omitempty"` + DecodeServiceName string `json:"decodeServiceName,omitempty"` + InferencePoolName string `json:"inferencePoolName,omitempty"` + InferenceModelName string `json:"inferenceModelName,omitempty"` + DecodeTensorParallelism string `json:"decodeTensorParallelSize,omitempty"` + DecodeDataParallelism string `json:"decodeDataParallelSize,omitempty"` + PrefillTensorParallelism string `json:"prefillTensorParallelSize,omitempty"` + PrefillDataParallelism string `json:"prefillDataParallelSize,omitempty"` } // from populates the field values for TemplateVars from the model service @@ -100,6 +105,26 @@ func (t *TemplateVars) from(ctx context.Context, msvc *msv1alpha1.ModelService) t.ModelName = msvc.Spec.Routing.ModelName t.SanitizedModelName = sanitizeModelName(msvc) + t.DecodeTensorParallelism = "1" + if msvc.Spec.Decode != nil && msvc.Spec.Decode.Parallelism != nil { + t.DecodeTensorParallelism = fmt.Sprintf("%d", *msvc.Spec.Decode.Parallelism.Tensor) + } + + t.DecodeDataParallelism = "1" + if msvc.Spec.Decode != nil && msvc.Spec.Decode.Parallelism != nil { + t.DecodeDataParallelism = fmt.Sprintf("%d", *msvc.Spec.Decode.Parallelism.Data) + } + + t.PrefillTensorParallelism = "1" + if msvc.Spec.Prefill != nil && msvc.Spec.Prefill.Parallelism != nil { + t.PrefillTensorParallelism = fmt.Sprintf("%d", *msvc.Spec.Prefill.Parallelism.Tensor) + } + + t.PrefillDataParallelism = "1" + if msvc.Spec.Prefill != nil && msvc.Spec.Prefill.Parallelism != nil { + t.PrefillDataParallelism = fmt.Sprintf("%d", *msvc.Spec.Prefill.Parallelism.Data) + } + if msvc.Spec.ModelArtifacts.AuthSecretName != nil { t.AuthSecretName = *msvc.Spec.ModelArtifacts.AuthSecretName } @@ -164,6 +189,7 @@ func (t *TemplateFuncs) from(ctx context.Context, msvc *msv1alpha1.ModelService) // +kubebuilder:rbac:groups="",resources=services,verbs=list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=rolebindings,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups="leaderworkerset.x-k8s.io",resources=leaderworkersets,verbs=get;list;watch;create;update;patch;delete // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.20.4/pkg/reconcile @@ -243,6 +269,7 @@ func (r *ModelServiceReconciler) SetupWithManager(mgr ctrl.Manager) error { Watches(&giev1alpha2.InferenceModel{}, handler.EnqueueRequestsFromMapFunc(r.inferenceModelMapFunc)). Watches(&giev1alpha2.InferencePool{}, handler.EnqueueRequestsFromMapFunc(r.inferencePoolMapFunc)). Watches(&corev1.ServiceAccount{}, handler.EnqueueRequestsFromMapFunc(r.serviceAccountMapFunc)). + Watches(&lwsv1.LeaderWorkerSet{}, handler.EnqueueRequestsFromMapFunc(r.lwsMapFunc)). 
Complete(r) } @@ -515,6 +542,19 @@ func (r *ModelServiceReconciler) inferenceModelMapFunc(ctx context.Context, obj return nil } +// lwsMapFunc maps leaderworkersets to ModelService owner +func (r *ModelServiceReconciler) lwsMapFunc(ctx context.Context, obj client.Object) []reconcile.Request { + lws, ok := obj.(*lwsv1.LeaderWorkerSet) + if !ok { + return nil + } + shouldReturn, result := requeueMsvcReq(ctx, lws) + if shouldReturn { + return result + } + return nil +} + func requeueMsvcReq(ctx context.Context, obj client.Object) (bool, []reconcile.Request) { for _, owner := range obj.GetOwnerReferences() { if owner.Kind == "ModelService" && owner.APIVersion == "llm-d.ai/v1alpha1" { diff --git a/samples/deepseek/deepseek-1t1d-manifest.yaml b/samples/deepseek/deepseek-1t1d-manifest.yaml new file mode 100644 index 0000000..9af8fd8 --- /dev/null +++ b/samples/deepseek/deepseek-1t1d-manifest.yaml @@ -0,0 +1,882 @@ +--- +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: decode + name: deepsk-ai-deepsk-coder-v2-lite-instruct-decode +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: decode + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. + rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "1" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG 
+ value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + initContainers: + - args: + - --port=8080 + - --vllm-port=8200 + - --connector=nixlv2 + - -v=6 + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + imagePullPolicy: Always + name: routing-proxy + ports: + - containerPort: 8080 + protocol: TCP + resources: {} + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + restartPolicy: RecreateGroupOnPodRestart + size: 1 + workerTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: decode + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. 
+ rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "1" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + replicas: 1 + 
rolloutStrategy: + type: "" + startupPolicy: LeaderCreated +status: {} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + creationTimestamp: null + labels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp +spec: + selector: + matchLabels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + strategy: {} + template: + metadata: + creationTimestamp: null + labels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + spec: + containers: + - args: + - -poolName + - deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool + - -poolNamespace + - llmd-kalantar + - -v + - "5" + - --zap-encoder + - json + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + env: + - name: PD_ENABLED + value: "true" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 + livenessProbe: + failureThreshold: 3 + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + periodSeconds: 10 + name: epp + ports: + - containerPort: 9002 + protocol: TCP + - containerPort: 9003 + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + readinessProbe: + failureThreshold: 3 + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + periodSeconds: 10 + resources: {} + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-sa +status: {} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: pod-read +subjects: +- kind: ServiceAccount + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-sa +--- +apiVersion: v1 +kind: Service +metadata: + creationTimestamp: null + labels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-service +spec: + ports: + - appProtocol: http2 + port: 9002 + protocol: TCP + targetPort: 9002 + selector: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + type: ClusterIP +status: + loadBalancer: {} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-sa +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-http-route +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool + port: 8080 +status: + parents: null +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + name: deepsk-ai-deepsk-coder-v2-lite-instruct +spec: + modelName: deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + poolRef: + name: deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool +status: {} +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool +spec: + extensionRef: + failureMode: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-service + selector: + 
leaderworkerset.sigs.k8s.io/worker-index: "0" + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + targetPortNumber: 8080 +status: {} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-sa +--- +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: prefill + name: deepsk-ai-deepsk-coder-v2-lite-instruct-prefill +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: prefill + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. + rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "1" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: 
+ key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + restartPolicy: RecreateGroupOnPodRestart + size: 1 + workerTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: prefill + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. + rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "1" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - 
name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + replicas: 1 + rolloutStrategy: + type: "" + startupPolicy: LeaderCreated +status: {} + diff --git a/samples/deepseek/deepseek-1t1d.yaml b/samples/deepseek/deepseek-1t1d.yaml new file mode 100644 index 0000000..a44ba70 --- /dev/null +++ b/samples/deepseek/deepseek-1t1d.yaml @@ -0,0 +1,47 @@ +apiVersion: llm-d.ai/v1alpha1 +kind: ModelService +metadata: + name: deepsk-ai-deepsk-coder-v2-lite-instruct +spec: + decoupleScaling: false + + baseConfigMapRef: + name: lws-baseconfig + + routing: + modelName: deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + ports: + - name: app_port + port: 8080 + - name: internal_port + port: 8200 + + modelArtifacts: + uri: hf://deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + authSecretName: hf-secret + + # describe decode pods + decode: + replicas: 1 + parallelism: + tensor: 1 + # tensor: 2 + data: 1 + # data: 4 + acceleratorTypes: + labelKey: gpu.nvidia.com/model + labelValues: + - H200 + + # describe the prefill pods + prefill: + replicas: 1 + parallelism: + tensor: 1 + # tensor: 2 + data: 1 + # data: 4 + acceleratorTypes: + labelKey: gpu.nvidia.com/model + labelValues: + - H200 diff --git a/samples/deepseek/deepseek-1t2d-manifest.yaml b/samples/deepseek/deepseek-1t2d-manifest.yaml new file mode 100644 index 0000000..682f6fe --- /dev/null +++ b/samples/deepseek/deepseek-1t2d-manifest.yaml @@ -0,0 +1,882 @@ +--- +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: decode + name: deepsk-ai-deepsk-coder-v2-lite-instruct-decode +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: decode + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + 
values: + - H200 + containers: + - args: + - | + # Squash a warning. + rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "2" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + initContainers: + - args: + - --port=8080 + - --vllm-port=8200 + - --connector=nixlv2 + - -v=6 + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + imagePullPolicy: Always + name: 
routing-proxy + ports: + - containerPort: 8080 + protocol: TCP + resources: {} + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + restartPolicy: RecreateGroupOnPodRestart + size: 2 + workerTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: decode + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. + rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "2" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - 
name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + replicas: 1 + rolloutStrategy: + type: "" + startupPolicy: LeaderCreated +status: {} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + creationTimestamp: null + labels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp +spec: + selector: + matchLabels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + strategy: {} + template: + metadata: + creationTimestamp: null + labels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + spec: + containers: + - args: + - -poolName + - deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool + - -poolNamespace + - llmd-kalantar + - -v + - "5" + - --zap-encoder + - json + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + env: + - name: PD_ENABLED + value: "true" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 + livenessProbe: + failureThreshold: 3 + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + periodSeconds: 10 + name: epp + ports: + - containerPort: 9002 + protocol: TCP + - containerPort: 9003 + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + readinessProbe: + failureThreshold: 3 + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + periodSeconds: 10 + resources: {} + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-sa +status: {} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: pod-read +subjects: +- kind: ServiceAccount + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-sa +--- +apiVersion: v1 +kind: Service +metadata: + creationTimestamp: null + labels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-service +spec: + ports: + - appProtocol: http2 + port: 9002 + protocol: TCP + targetPort: 9002 + selector: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + type: ClusterIP +status: + loadBalancer: {} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-sa +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-http-route +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: 
inference.networking.x-k8s.io + kind: InferencePool + name: deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool + port: 8080 +status: + parents: null +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + name: deepsk-ai-deepsk-coder-v2-lite-instruct +spec: + modelName: deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + poolRef: + name: deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool +status: {} +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool +spec: + extensionRef: + failureMode: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-service + selector: + leaderworkerset.sigs.k8s.io/worker-index: "0" + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + targetPortNumber: 8080 +status: {} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-sa +--- +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: prefill + name: deepsk-ai-deepsk-coder-v2-lite-instruct-prefill +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: prefill + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. 
+ rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "2" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + restartPolicy: 
RecreateGroupOnPodRestart + size: 2 + workerTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: prefill + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. + rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "2" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: 
"1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + replicas: 1 + rolloutStrategy: + type: "" + startupPolicy: LeaderCreated +status: {} + diff --git a/samples/deepseek/deepseek-1t2d.yaml b/samples/deepseek/deepseek-1t2d.yaml new file mode 100644 index 0000000..65e05b0 --- /dev/null +++ b/samples/deepseek/deepseek-1t2d.yaml @@ -0,0 +1,47 @@ +apiVersion: llm-d.ai/v1alpha1 +kind: ModelService +metadata: + name: deepsk-ai-deepsk-coder-v2-lite-instruct +spec: + decoupleScaling: false + + baseConfigMapRef: + name: lws-baseconfig + + routing: + modelName: deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + ports: + - name: app_port + port: 8080 + - name: internal_port + port: 8200 + + modelArtifacts: + uri: hf://deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + authSecretName: hf-secret + + # describe decode pods + decode: + replicas: 1 + parallelism: + tensor: 1 + # tensor: 2 + data: 2 + # data: 4 + acceleratorTypes: + labelKey: gpu.nvidia.com/model + labelValues: + - H200 + + # describe the prefill pods + prefill: + replicas: 1 + parallelism: + tensor: 1 + # tensor: 2 + data: 2 + # data: 4 + acceleratorTypes: + labelKey: gpu.nvidia.com/model + labelValues: + - H200 diff --git a/samples/deepseek/deepseek-2t2d-manifest.yaml b/samples/deepseek/deepseek-2t2d-manifest.yaml new file mode 100644 index 0000000..5d1f790 --- /dev/null +++ b/samples/deepseek/deepseek-2t2d-manifest.yaml @@ -0,0 +1,882 @@ +--- +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: decode + name: deepsk-ai-deepsk-coder-v2-lite-instruct-decode +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: decode + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. 
+ rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "2" + - name: TP_SIZE + value: "2" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "2" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "2" + rdma/ib: "1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + initContainers: + - args: + - --port=8080 + - --vllm-port=8200 + - --connector=nixlv2 + - -v=6 + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + imagePullPolicy: Always + name: routing-proxy + ports: + - containerPort: 8080 + protocol: TCP + resources: 
{} + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + restartPolicy: RecreateGroupOnPodRestart + size: 2 + workerTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: decode + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. + rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "2" + - name: TP_SIZE + value: "2" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: 
VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "2" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "2" + rdma/ib: "1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + replicas: 1 + rolloutStrategy: + type: "" + startupPolicy: LeaderCreated +status: {} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + creationTimestamp: null + labels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp +spec: + selector: + matchLabels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + strategy: {} + template: + metadata: + creationTimestamp: null + labels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + spec: + containers: + - args: + - -poolName + - deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool + - -poolNamespace + - llmd-kalantar + - -v + - "5" + - --zap-encoder + - json + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + env: + - name: PD_ENABLED + value: "true" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 + livenessProbe: + failureThreshold: 3 + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + periodSeconds: 10 + name: epp + ports: + - containerPort: 9002 + protocol: TCP + - containerPort: 9003 + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + readinessProbe: + failureThreshold: 3 + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + periodSeconds: 10 + resources: {} + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-sa +status: {} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: pod-read +subjects: +- kind: ServiceAccount + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-sa +--- +apiVersion: v1 +kind: Service +metadata: + creationTimestamp: null + labels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-service +spec: + ports: + - appProtocol: http2 + port: 9002 + protocol: TCP + targetPort: 9002 + selector: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + type: ClusterIP +status: + loadBalancer: {} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-sa +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-http-route +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: 
deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool + port: 8080 +status: + parents: null +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + name: deepsk-ai-deepsk-coder-v2-lite-instruct +spec: + modelName: deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + poolRef: + name: deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool +status: {} +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool +spec: + extensionRef: + failureMode: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-service + selector: + leaderworkerset.sigs.k8s.io/worker-index: "0" + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + targetPortNumber: 8080 +status: {} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-sa +--- +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: prefill + name: deepsk-ai-deepsk-coder-v2-lite-instruct-prefill +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: prefill + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. 
+ rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "2" + - name: TP_SIZE + value: "2" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "2" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "2" + rdma/ib: "1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + restartPolicy: 
RecreateGroupOnPodRestart + size: 2 + workerTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: prefill + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. + rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "2" + - name: TP_SIZE + value: "2" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "2" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "2" + rdma/ib: 
"1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + replicas: 1 + rolloutStrategy: + type: "" + startupPolicy: LeaderCreated +status: {} + diff --git a/samples/deepseek/deepseek-2t2d.yaml b/samples/deepseek/deepseek-2t2d.yaml new file mode 100644 index 0000000..97dcac5 --- /dev/null +++ b/samples/deepseek/deepseek-2t2d.yaml @@ -0,0 +1,47 @@ +apiVersion: llm-d.ai/v1alpha1 +kind: ModelService +metadata: + name: deepsk-ai-deepsk-coder-v2-lite-instruct +spec: + decoupleScaling: false + + baseConfigMapRef: + name: lws-baseconfig + + routing: + modelName: deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + ports: + - name: app_port + port: 8080 + - name: internal_port + port: 8200 + + modelArtifacts: + uri: hf://deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + authSecretName: hf-secret + + # describe decode pods + decode: + replicas: 1 + parallelism: + tensor: 2 + # tensor: 2 + data: 2 + # data: 4 + acceleratorTypes: + labelKey: gpu.nvidia.com/model + labelValues: + - H200 + + # describe the prefill pods + prefill: + replicas: 1 + parallelism: + tensor: 2 + # tensor: 2 + data: 2 + # data: 4 + acceleratorTypes: + labelKey: gpu.nvidia.com/model + labelValues: + - H200 diff --git a/samples/deepseek/deepseek-manifest.yaml b/samples/deepseek/deepseek-manifest.yaml new file mode 100644 index 0000000..9af8fd8 --- /dev/null +++ b/samples/deepseek/deepseek-manifest.yaml @@ -0,0 +1,882 @@ +--- +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: decode + name: deepsk-ai-deepsk-coder-v2-lite-instruct-decode +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: decode + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. 
+ rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "1" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + initContainers: + - args: + - --port=8080 + - --vllm-port=8200 + - --connector=nixlv2 + - -v=6 + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + imagePullPolicy: Always + name: routing-proxy + ports: + - containerPort: 8080 + protocol: TCP + resources: 
{} + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + restartPolicy: RecreateGroupOnPodRestart + size: 1 + workerTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: decode + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. + rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "1" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: 
VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + replicas: 1 + rolloutStrategy: + type: "" + startupPolicy: LeaderCreated +status: {} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + creationTimestamp: null + labels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp +spec: + selector: + matchLabels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + strategy: {} + template: + metadata: + creationTimestamp: null + labels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + spec: + containers: + - args: + - -poolName + - deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool + - -poolNamespace + - llmd-kalantar + - -v + - "5" + - --zap-encoder + - json + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + env: + - name: PD_ENABLED + value: "true" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 + livenessProbe: + failureThreshold: 3 + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + periodSeconds: 10 + name: epp + ports: + - containerPort: 9002 + protocol: TCP + - containerPort: 9003 + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + readinessProbe: + failureThreshold: 3 + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + periodSeconds: 10 + resources: {} + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-sa +status: {} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: pod-read +subjects: +- kind: ServiceAccount + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-sa +--- +apiVersion: v1 +kind: Service +metadata: + creationTimestamp: null + labels: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-service +spec: + ports: + - appProtocol: http2 + port: 9002 + protocol: TCP + targetPort: 9002 + selector: + llm-d.ai/epp: deepsk-ai-deepsk-coder-v2-lite-instruct-epp + type: ClusterIP +status: + loadBalancer: {} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-sa +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-http-route +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: 
deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool + port: 8080 +status: + parents: null +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + name: deepsk-ai-deepsk-coder-v2-lite-instruct +spec: + modelName: deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + poolRef: + name: deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool +status: {} +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-inference-pool +spec: + extensionRef: + failureMode: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-epp-service + selector: + leaderworkerset.sigs.k8s.io/worker-index: "0" + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + targetPortNumber: 8080 +status: {} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + creationTimestamp: null + name: deepsk-ai-deepsk-coder-v2-lite-instruct-sa +--- +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: prefill + name: deepsk-ai-deepsk-coder-v2-lite-instruct-prefill +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: prefill + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. 
+ rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "1" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + restartPolicy: 
RecreateGroupOnPodRestart + size: 1 + workerTemplate: + metadata: + creationTimestamp: null + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepseek-coder-v2-lite-instruct + llm-d.ai/role: prefill + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gpu.nvidia.com/model + operator: In + values: + - H200 + containers: + - args: + - | + # Squash a warning. + rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + command: + - /bin/sh + - -c + env: + - name: DP_SIZE + value: "1" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: https://github.com/vllm-project/vllm.git + - name: VLLM_BRANCH + value: main + - name: VLLM_ALL2ALL_BACKEND + value: pplx + - name: NVIDIA_GDRCOPY + value: enabled + - name: NCCL_DEBUG + value: INFO + - name: NVSHMEM_DEBUG + value: TRACE + - name: NVSHMEM_DEBUG_SUBSYS + value: TRANSPORT,INIT,MEM,COLL,BOOTSTRAP + - name: NVSHMEM_REMOTE_TRANSPORT + value: ibrc + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: eth0 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_HCA + value: ibp + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: HF_TOKEN + name: hf-secret + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + key: GH_TOKEN + name: gh-token-secret + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/tms/vllm-dev-base:0.0.15 + imagePullPolicy: Always + name: vllm-worker + resources: + limits: + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: "1" + requests: + cpu: "8" + ephemeral-storage: 256Gi + memory: 64Gi + nvidia.com/gpu: "1" + rdma/ib: 
"1" + securityContext: + capabilities: + add: + - IPC_LOCK + stdin: true + tty: true + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /init-scripts + name: init-scripts-volume + workingDir: /app + serviceAccountName: deepsk-ai-deepsk-coder-v2-lite-instruct-sa + volumes: + - configMap: + defaultMode: 493 + name: vllm-init-scripts-config + name: init-scripts-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: dshm + replicas: 1 + rolloutStrategy: + type: "" + startupPolicy: LeaderCreated +status: {} + diff --git a/samples/deepseek/deepseek.yaml b/samples/deepseek/deepseek.yaml new file mode 100644 index 0000000..a44ba70 --- /dev/null +++ b/samples/deepseek/deepseek.yaml @@ -0,0 +1,47 @@ +apiVersion: llm-d.ai/v1alpha1 +kind: ModelService +metadata: + name: deepsk-ai-deepsk-coder-v2-lite-instruct +spec: + decoupleScaling: false + + baseConfigMapRef: + name: lws-baseconfig + + routing: + modelName: deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + ports: + - name: app_port + port: 8080 + - name: internal_port + port: 8200 + + modelArtifacts: + uri: hf://deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + authSecretName: hf-secret + + # describe decode pods + decode: + replicas: 1 + parallelism: + tensor: 1 + # tensor: 2 + data: 1 + # data: 4 + acceleratorTypes: + labelKey: gpu.nvidia.com/model + labelValues: + - H200 + + # describe the prefill pods + prefill: + replicas: 1 + parallelism: + tensor: 1 + # tensor: 2 + data: 1 + # data: 4 + acceleratorTypes: + labelKey: gpu.nvidia.com/model + labelValues: + - H200 diff --git a/samples/deepseek/lws-base.yaml b/samples/deepseek/lws-base.yaml new file mode 100644 index 0000000..9581993 --- /dev/null +++ b/samples/deepseek/lws-base.yaml @@ -0,0 +1,812 @@ +# A universal baseconfig for models that can be downloaded from Hugging Face + +# Requirements: +# Any consuming ModelService should define ports labeled: +# - app_port - the external port number for the prefill and decode pods +# - internal_port - the port number used by the sidecar to communicate with a vllm container +apiVersion: v1 +kind: ConfigMap +metadata: + name: lws-baseconfig +immutable: true +data: + decodeLeaderWorkerSet: | + apiVersion: leaderworkerset.x-k8s.io/v1 + kind: LeaderWorkerSet + metadata: + name: vllm-decode + spec: + startupPolicy: LeaderCreated + leaderWorkerTemplate: + size: {{ .DecodeDataParallelism }} + restartPolicy: RecreateGroupOnPodRestart + + leaderTemplate: + spec: + initContainers: + - name: routing-proxy + args: + - "--port={{ "app_port" | getPort }}" + - "--vllm-port={{ "internal_port" | getPort }}" + - --connector=nixlv2 + - -v=6 + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + imagePullPolicy: Always + ports: + - containerPort: {{ "app_port" | getPort }} + protocol: TCP + resources: {} + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + containers: + - name: vllm-worker + image: "quay.io/tms/vllm-dev-base:0.0.15" + imagePullPolicy: Always + workingDir: /app + stdin: true + tty: true + command: ["/bin/sh","-c"] + args: + - | + # Squash a warning. 
+ rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + {{ .HFModelName }} \ + --port {{ "internal_port" | getPort }} \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + {{ .HFModelName }} \ + --port {{ "internal_port" | getPort }} \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + env: + - name: DP_SIZE + value: "{{ .DecodeDataParallelism }}" + - name: TP_SIZE + value: "{{ .DecodeTensorParallelism }}" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: "https://github.com/vllm-project/vllm.git" + - name: VLLM_BRANCH + value: "main" + - name: VLLM_ALL2ALL_BACKEND + # value: "naive" + value: "pplx" + # value: "deepep_high_throughput" + # value: "deepep_low_latency" + # + # Needed for GDRCOPY to be used. 
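+              # GDRCopy provides a low-latency GPU/host copy path that NVSHMEM can use when available; NVIDIA_GDRCOPY=enabled asks the NVIDIA container toolkit (v1.15.0 or newer) to expose the gdrcopy device inside the container.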
+ # See: https://github.com/NVIDIA/nvidia-container-toolkit/releases/tag/v1.15.0 + - name: NVIDIA_GDRCOPY + value: "enabled" + # - name: NVIDIA_NVSWITCH + # value: "enabled" + # - name: NVIDIA_GDS + # value: "enabled" + + # NVIDIA_MOFED is likely needed for using IBGDA but causes crashes + # - name: NVIDIA_MOFED + # value: "enabled" + # + - name: NCCL_DEBUG + value: "INFO" + - name: NVSHMEM_DEBUG + value: "TRACE" + - name: NVSHMEM_DEBUG_SUBSYS + value: "TRANSPORT,INIT,MEM,COLL,BOOTSTRAP" + - name: NVSHMEM_REMOTE_TRANSPORT + value: "ibrc" + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: "ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1" + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: "eth0" + - name: GLOO_SOCKET_IFNAME + value: "eth0" + - name: NCCL_SOCKET_IFNAME + value: "eth0" + - name: NCCL_IB_HCA + value: "ibp" + - name: VLLM_LOGGING_LEVEL + value: "DEBUG" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + name: gh-token-secret + key: GH_TOKEN + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + + securityContext: + capabilities: + add: [ "IPC_LOCK" ] + resources: + limits: + nvidia.com/gpu: "{{ .DecodeTensorParallelism }}" + memory: 64Gi + ephemeral-storage: 256Gi + rdma/ib: 1 + requests: + cpu: 8 + memory: 64Gi + ephemeral-storage: 256Gi + nvidia.com/gpu: "{{ .DecodeTensorParallelism }}" + rdma/ib: 1 + volumeMounts: + - mountPath: /dev/shm + name: dshm + - name: init-scripts-volume + mountPath: /init-scripts + volumes: + # Volume for the init script from ConfigMap + - name: init-scripts-volume + configMap: + name: vllm-init-scripts-config + defaultMode: 0755 # Set execute permissions for the script + # Needed for NCCL to function + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 1Gi + + workerTemplate: + spec: + containers: + - name: vllm-worker + image: "quay.io/tms/vllm-dev-base:0.0.15" + imagePullPolicy: Always + workingDir: /app + stdin: true + tty: true + command: ["/bin/sh","-c"] + args: + - | + # Squash a warning. 
+ rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + {{ .HFModelName }} \ + --port {{ "internal_port" | getPort }} \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + {{ .HFModelName }} \ + --port {{ "internal_port" | getPort }} \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + env: + - name: DP_SIZE + value: "{{ .DecodeDataParallelism }}" + - name: TP_SIZE + value: "{{ .DecodeTensorParallelism }}" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: "https://github.com/vllm-project/vllm.git" + - name: VLLM_BRANCH + value: "main" + - name: VLLM_ALL2ALL_BACKEND + # value: "naive" + value: "pplx" + # value: "deepep_high_throughput" + # value: "deepep_low_latency" + # + # Needed for GDRCOPY to be used. 
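+              # GDRCopy provides a low-latency GPU/host copy path that NVSHMEM can use when available; NVIDIA_GDRCOPY=enabled asks the NVIDIA container toolkit (v1.15.0 or newer) to expose the gdrcopy device inside the container.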
+ # See: https://github.com/NVIDIA/nvidia-container-toolkit/releases/tag/v1.15.0 + - name: NVIDIA_GDRCOPY + value: "enabled" + # - name: NVIDIA_NVSWITCH + # value: "enabled" + # - name: NVIDIA_GDS + # value: "enabled" + + # NVIDIA_MOFED is likely needed for using IBGDA but causes crashes + # - name: NVIDIA_MOFED + # value: "enabled" + # + - name: NCCL_DEBUG + value: "INFO" + - name: NVSHMEM_DEBUG + value: "TRACE" + - name: NVSHMEM_DEBUG_SUBSYS + value: "TRANSPORT,INIT,MEM,COLL,BOOTSTRAP" + - name: NVSHMEM_REMOTE_TRANSPORT + value: "ibrc" + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: "ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1" + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: "eth0" + - name: GLOO_SOCKET_IFNAME + value: "eth0" + - name: NCCL_SOCKET_IFNAME + value: "eth0" + - name: NCCL_IB_HCA + value: "ibp" + - name: VLLM_LOGGING_LEVEL + value: "DEBUG" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + name: gh-token-secret + key: GH_TOKEN + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + + securityContext: + capabilities: + add: [ "IPC_LOCK" ] + resources: + limits: + nvidia.com/gpu: "{{ .DecodeTensorParallelism }}" + memory: 64Gi + ephemeral-storage: 256Gi + rdma/ib: 1 + requests: + cpu: 8 + memory: 64Gi + ephemeral-storage: 256Gi + nvidia.com/gpu: "{{ .DecodeTensorParallelism }}" + rdma/ib: 1 + volumeMounts: + - mountPath: /dev/shm + name: dshm + - name: init-scripts-volume + mountPath: /init-scripts + volumes: + # Volume for the init script from ConfigMap + - name: init-scripts-volume + configMap: + name: vllm-init-scripts-config + defaultMode: 0755 # Set execute permissions for the script + # Needed for NCCL to function + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 1Gi + + prefillLeaderWorkerSet: | + apiVersion: leaderworkerset.x-k8s.io/v1 + kind: LeaderWorkerSet + metadata: + name: vllm-prefill + spec: + startupPolicy: LeaderCreated + leaderWorkerTemplate: + size: {{ .PrefillDataParallelism }} + restartPolicy: RecreateGroupOnPodRestart + + leaderTemplate: + + spec: + + containers: + - name: vllm-worker + image: "quay.io/tms/vllm-dev-base:0.0.15" + imagePullPolicy: Always + workingDir: /app + stdin: true + tty: true + command: ["/bin/sh","-c"] + args: + - | + # Squash a warning. 
+ rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + {{ .HFModelName }} \ + --port {{ "app_port" | getPort }} \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + {{ .HFModelName }} \ + --port {{ "app_port" | getPort }} \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + env: + - name: DP_SIZE + value: "{{ .PrefillDataParallelism }}" + - name: TP_SIZE + value: "{{ .PrefillTensorParallelism }}" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: "https://github.com/vllm-project/vllm.git" + - name: VLLM_BRANCH + value: "main" + - name: VLLM_ALL2ALL_BACKEND + # value: "naive" + value: "pplx" + # value: "deepep_high_throughput" + # value: "deepep_low_latency" + # + # Needed for GDRCOPY to be used. 
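+              # GDRCopy provides a low-latency GPU/host copy path that NVSHMEM can use when available; NVIDIA_GDRCOPY=enabled asks the NVIDIA container toolkit (v1.15.0 or newer) to expose the gdrcopy device inside the container.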
+ # See: https://github.com/NVIDIA/nvidia-container-toolkit/releases/tag/v1.15.0 + - name: NVIDIA_GDRCOPY + value: "enabled" + # - name: NVIDIA_NVSWITCH + # value: "enabled" + # - name: NVIDIA_GDS + # value: "enabled" + + # NVIDIA_MOFED is likely needed for using IBGDA but causes crashes + # - name: NVIDIA_MOFED + # value: "enabled" + # + - name: NCCL_DEBUG + value: "INFO" + - name: NVSHMEM_DEBUG + value: "TRACE" + - name: NVSHMEM_DEBUG_SUBSYS + value: "TRANSPORT,INIT,MEM,COLL,BOOTSTRAP" + - name: NVSHMEM_REMOTE_TRANSPORT + value: "ibrc" + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: "ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1" + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: "eth0" + - name: GLOO_SOCKET_IFNAME + value: "eth0" + - name: NCCL_SOCKET_IFNAME + value: "eth0" + - name: NCCL_IB_HCA + value: "ibp" + - name: VLLM_LOGGING_LEVEL + value: "DEBUG" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + name: gh-token-secret + key: GH_TOKEN + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + + securityContext: + capabilities: + add: [ "IPC_LOCK" ] + resources: + limits: + nvidia.com/gpu: "{{ .PrefillTensorParallelism }}" + memory: 64Gi + ephemeral-storage: 256Gi + rdma/ib: 1 + requests: + cpu: 8 + memory: 64Gi + ephemeral-storage: 256Gi + nvidia.com/gpu: "{{ .PrefillTensorParallelism }}" + rdma/ib: 1 + volumeMounts: + - mountPath: /dev/shm + name: dshm + - name: init-scripts-volume + mountPath: /init-scripts + volumes: + # Volume for the init script from ConfigMap + - name: init-scripts-volume + configMap: + name: vllm-init-scripts-config + defaultMode: 0755 # Set execute permissions for the script + # Needed for NCCL to function + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 1Gi + + workerTemplate: + + spec: + + containers: + - name: vllm-worker + image: "quay.io/tms/vllm-dev-base:0.0.15" + imagePullPolicy: Always + workingDir: /app + stdin: true + tty: true + command: ["/bin/sh","-c"] + args: + - | + # Squash a warning. 
+ rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + {{ .HFModelName }} \ + --port {{ "app_port" | getPort }} \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + {{ .HFModelName }} \ + --port {{ "app_port" | getPort }} \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + env: + - name: DP_SIZE + value: "{{ .PrefillDataParallelism }}" + - name: TP_SIZE + value: "{{ .PrefillTensorParallelism }}" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: "https://github.com/vllm-project/vllm.git" + - name: VLLM_BRANCH + value: "main" + - name: VLLM_ALL2ALL_BACKEND + # value: "naive" + value: "pplx" + # value: "deepep_high_throughput" + # value: "deepep_low_latency" + # + # Needed for GDRCOPY to be used. 
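+              # GDRCopy provides a low-latency GPU/host copy path that NVSHMEM can use when available; NVIDIA_GDRCOPY=enabled asks the NVIDIA container toolkit (v1.15.0 or newer) to expose the gdrcopy device inside the container.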
+ # See: https://github.com/NVIDIA/nvidia-container-toolkit/releases/tag/v1.15.0 + - name: NVIDIA_GDRCOPY + value: "enabled" + # - name: NVIDIA_NVSWITCH + # value: "enabled" + # - name: NVIDIA_GDS + # value: "enabled" + + # NVIDIA_MOFED is likely needed for using IBGDA but causes crashes + # - name: NVIDIA_MOFED + # value: "enabled" + # + - name: NCCL_DEBUG + value: "INFO" + - name: NVSHMEM_DEBUG + value: "TRACE" + - name: NVSHMEM_DEBUG_SUBSYS + value: "TRANSPORT,INIT,MEM,COLL,BOOTSTRAP" + - name: NVSHMEM_REMOTE_TRANSPORT + value: "ibrc" + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: "ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1" + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: "eth0" + - name: GLOO_SOCKET_IFNAME + value: "eth0" + - name: NCCL_SOCKET_IFNAME + value: "eth0" + - name: NCCL_IB_HCA + value: "ibp" + - name: VLLM_LOGGING_LEVEL + value: "DEBUG" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + name: gh-token-secret + key: GH_TOKEN + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + + securityContext: + capabilities: + add: [ "IPC_LOCK" ] + resources: + limits: + nvidia.com/gpu: "{{ .PrefillTensorParallelism }}" + memory: 64Gi + ephemeral-storage: 256Gi + rdma/ib: 1 + requests: + cpu: 8 + memory: 64Gi + ephemeral-storage: 256Gi + nvidia.com/gpu: "{{ .PrefillTensorParallelism }}" + rdma/ib: 1 + volumeMounts: + - mountPath: /dev/shm + name: dshm + - name: init-scripts-volume + mountPath: /init-scripts + + volumes: + # Volume for the init script from ConfigMap + - name: init-scripts-volume + configMap: + name: vllm-init-scripts-config + defaultMode: 0755 # Set execute permissions for the script + # Needed for NCCL to function + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 1Gi + + eppService: | + apiVersion: v1 + kind: Service + spec: + ports: + # Needs to match the port of the eppDeployment + - port: 9002 + protocol: TCP + targetPort: 9002 + appProtocol: http2 + type: ClusterIP + + eppDeployment: | + apiVersion: apps/v1 + kind: Deployment + spec: + template: + spec: + containers: + - name: "epp" + args: + - -poolName + - {{ .InferencePoolName }} + - -poolNamespace + - llmd-kalantar + - -v + - "5" + - --zap-encoder + - json + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + env: + - name: PD_ENABLED + value: "true" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 + livenessProbe: + failureThreshold: 3 + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + periodSeconds: 10 + ports: + - containerPort: 9002 + protocol: TCP + - containerPort: 9003 + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + readinessProbe: + failureThreshold: 3 + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + periodSeconds: 10 + + inferencePool: | + apiVersion: inference.networking.x-k8s.io/v1alpha2 + kind: InferencePool + spec: + targetPortNumber: {{ "app_port" | getPort }} + selector: + leaderworkerset.sigs.k8s.io/worker-index: "0" + + inferenceModel: | + apiVersion: inference.networking.x-k8s.io/v1alpha2 + kind: InferenceModel + + httpRoute: | + apiVersion: gateway.networking.k8s.io/v1 + kind: 
HTTPRoute + spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: {{ .InferencePoolName }} + port: {{ "app_port" | getPort }} \ No newline at end of file diff --git a/samples/nixl-xpyd/universal-baseconfig-hf.yaml b/samples/nixl-xpyd/universal-baseconfig-hf.yaml index d7d9662..50a4a46 100644 --- a/samples/nixl-xpyd/universal-baseconfig-hf.yaml +++ b/samples/nixl-xpyd/universal-baseconfig-hf.yaml @@ -162,7 +162,6 @@ data: - {{ .InferencePoolName }} - -poolNamespace - {{ .ModelServiceNamespace }} - - serverless-workstream - -v - "4" - --zap-encoder @@ -214,6 +213,10 @@ data: apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway rules: - backendRefs: - group: inference.networking.x-k8s.io