InftyAI · InftyAI-Agent · Feb 26, 2025 · Feb 26, 2025 · Feb 26, 2025 · Feb 26, 2025
diff --git a/api/inference/v1alpha1/backendruntime_types.go b/api/inference/v1alpha1/backendruntime_types.go
@@ -50,16 +50,6 @@ type ScaleTrigger struct {
 	HPA *HPATrigger `json:"hpa,omitempty"`
 }
 
-// MultiHostCommands represents leader & worker commands for multiple nodes scenarios.
-type MultiHostCommands struct {
-	// Leader commands.
-	// +optional
-	Leader []string `json:"leader,omitempty"`
-	// Worker commands.
-	// +optional
-	Worker []string `json:"worker,omitempty"`
-}
-
 // RecommendedConfig represents the recommended configurations for the backendRuntime,
 // user can choose one of them to apply.
 type RecommendedConfig struct {
@@ -89,10 +79,6 @@ type BackendRuntimeSpec struct {
 	// Commands represents the default commands for the backendRuntime.
 	// +optional
 	Commands []string `json:"commands,omitempty"`
-	// MultiHostCommands represents leader and worker commands for nodes with
-	// different roles.
-	// +optional
-	MultiHostCommands *MultiHostCommands `json:"multiHostCommands,omitempty"`
 	// Image represents the default image registry of the backendRuntime.
 	// It will work together with version to make up a real image.
 	Image string `json:"image"`

diff --git a/api/inference/v1alpha1/config_types.go b/api/inference/v1alpha1/config_types.go
@@ -41,12 +41,10 @@ type BackendRuntimeConfig struct {
 	Envs []corev1.EnvVar `json:"envs,omitempty"`
 	// ConfigName represents the recommended configuration name for the backend,
 	// It will be inferred from the models in the runtime if not specified, e.g. default,
-	// speculative-decoding or model-parallelism.
+	// speculative-decoding.
 	ConfigName *string `json:"configName,omitempty"`
-	// Args represents all the arguments for the command.
-	// Argument around with {{ .CONFIG }} is a configuration waiting for render.
-	// +optional
-	// Args defined here will "append" the args in the recommendedConfig.
+	// Args defined here will "append" the args defined in the recommendedConfig,
+	// either explicitly configured in configName or inferred in the runtime.
 	// +optional
 	Args []string `json:"args,omitempty"`
 	// Resources represents the resource requirements for backend, like cpu/mem,
@@ -60,11 +58,6 @@ type BackendRuntimeConfig struct {
 	// SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig.
 	// +optional
 	SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"`
-	// ScaleTrigger defines the rules to scale the workloads.
-	// Only one trigger cloud work at a time, mostly used in Playground.
-	// ScaleTrigger defined here will "overwrite" the scaleTrigger in the recommendedConfig.
-	// +optional
-	ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
 }
 
 // TODO: Do not support DRA yet, we can support that once needed.

diff --git a/api/inference/v1alpha1/playground_types.go b/api/inference/v1alpha1/playground_types.go
@@ -58,6 +58,11 @@ type ElasticConfig struct {
 	// Default to nil means there's no limit for the instance number.
 	// +optional
 	MaxReplicas *int32 `json:"maxReplicas,omitempty"`
+	// ScaleTrigger defines the rules to scale the workloads.
+	// Only one trigger can work at a time, mostly used in Playground.
+	// ScaleTrigger defined here will "overwrite" the scaleTrigger in the recommendedConfig.
+	// +optional
+	ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
 }
 
 const (

diff --git a/api/inference/v1alpha1/service_types.go b/api/inference/v1alpha1/service_types.go
@@ -35,12 +35,16 @@ const (
 type ServiceSpec struct {
 	// ModelClaims represents multiple claims for different models.
 	ModelClaims coreapi.ModelClaims `json:"modelClaims,omitempty"`
-	// WorkloadTemplate defines the underlying workload layout and configuration.
-	// Note: the LWS spec might be twisted with various LWS instances to support
-	// accelerator fungibility or other cutting-edge researches.
-	// LWS supports both single-host and multi-host scenarios, for single host
-	// cases, only need to care about replicas, rolloutStrategy and workerTemplate.
-	WorkloadTemplate lws.LeaderWorkerSetSpec `json:"workloadTemplate"`
+	// Replicas represents the replica number of inference workloads.
+	// +kubebuilder:default=1
+	// +optional
+	Replicas *int32 `json:"replicas,omitempty"`
+	// WorkloadTemplate defines the template for leader/worker pods.
+	WorkloadTemplate lws.LeaderWorkerTemplate `json:"workloadTemplate"`
+	// RolloutStrategy defines the strategy that will be applied to update replicas
+	// when a revision is made to the workloadTemplate.
+	// +optional
+	RolloutStrategy lws.RolloutStrategy `json:"rolloutStrategy,omitempty"`
 }
 
 const (

diff --git a/api/inference/v1alpha1/zz_generated.deepcopy.go b/api/inference/v1alpha1/zz_generated.deepcopy.go
diff --git a/chart/crds/backendruntime-crd.yaml b/chart/crds/backendruntime-crd.yaml
@@ -322,22 +322,6 @@ spec:
                     format: int32
                     type: integer
                 type: object
-              multiHostCommands:
-                description: |-
-                  MultiHostCommands represents leader and worker commands for nodes with
-                  different roles.
-                properties:
-                  leader:
-                    description: Leader commands.
-                    items:
-                      type: string
-                    type: array
-                  worker:
-                    description: Worker commands.
-                    items:
-                      type: string
-                    type: array
-                type: object
               readinessProbe:
                 description: |-
                   Periodic probe of backend readiness.

diff --git a/chart/crds/playground-crd.yaml b/chart/crds/playground-crd.yaml
@@ -47,9 +47,8 @@ spec:
                 properties:
                   args:
                     description: |-
-                      Args represents all the arguments for the command.
-                      Argument around with {{ .CONFIG }} is a configuration waiting for render.
-                      Args defined here will "append" the args in the recommendedConfig.
+                      Args defined here will "append" the args defined in the recommendedConfig,
+                      either explicitly configured in configName or inferred in the runtime.
                     items:
                       type: string
                     type: array
@@ -62,7 +61,7 @@ spec:
                     description: |-
                       ConfigName represents the recommended configuration name for the backend,
                       It will be inferred from the models in the runtime if not specified, e.g. default,
-                      speculative-decoding or model-parallelism.
+                      speculative-decoding.
                     type: string
                   envs:
                     description: Envs represents the environments set to the container.
@@ -216,6 +215,41 @@ spec:
                           More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
                         type: object
                     type: object
+                  sharedMemorySize:
+                    anyOf:
+                    - type: integer
+                    - type: string
+                    description: |-
+                      SharedMemorySize represents the size of /dev/shm required in the runtime of
+                      inference workload.
+                      SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig.
+                    pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                    x-kubernetes-int-or-string: true
+                  version:
+                    description: |-
+                      Version represents the backend version if you want a different one
+                      from the default version.
+                    type: string
+                type: object
+              elasticConfig:
+                description: |-
+                  ElasticConfig defines the configuration for elastic usage,
+                  e.g. the max/min replicas.
+                properties:
+                  maxReplicas:
+                    description: |-
+                      MaxReplicas indicates the maximum number of inference workloads based on the traffic.
+                      Default to nil means there's no limit for the instance number.
+                    format: int32
+                    type: integer
+                  minReplicas:
+                    default: 1
+                    description: |-
+                      MinReplicas indicates the minimum number of inference workloads based on the traffic.
+                      Default to 1.
+                      MinReplicas couldn't be 0 now, will support serverless in the future.
+                    format: int32
+                    type: integer
                   scaleTrigger:
                     description: |-
                       ScaleTrigger defines the rules to scale the workloads.
@@ -829,41 +863,6 @@ spec:
                             type: array
                         type: object
                     type: object
-                  sharedMemorySize:
-                    anyOf:
-                    - type: integer
-                    - type: string
-                    description: |-
-                      SharedMemorySize represents the size of /dev/shm required in the runtime of
-                      inference workload.
-                      SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig.
-                    pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
-                    x-kubernetes-int-or-string: true
-                  version:
-                    description: |-
-                      Version represents the backend version if you want a different one
-                      from the default version.
-                    type: string
-                type: object
-              elasticConfig:
-                description: |-
-                  ElasticConfig defines the configuration for elastic usage,
-                  e.g. the max/min replicas.
-                properties:
-                  maxReplicas:
-                    description: |-
-                      MaxReplicas indicates the maximum number of inference workloads based on the traffic.
-                      Default to nil means there's no limit for the instance number.
-                    format: int32
-                    type: integer
-                  minReplicas:
-                    default: 1
-                    description: |-
-                      MinReplicas indicates the minimum number of inference workloads based on the traffic.
-                      Default to 1.
-                      MinReplicas couldn't be 0 now, will support serverless in the future.
-                    format: int32
-                    type: integer
                 type: object
               modelClaim:
                 description: |-