API refactor by kerthcet · Pull Request #285 · InftyAI/llmaz · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

API refactor #285

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 0 additions & 14 deletions api/inference/v1alpha1/backendruntime_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,6 @@ type ScaleTrigger struct {
HPA *HPATrigger `json:"hpa,omitempty"`
}

// MultiHostCommands represents leader & worker commands for multiple nodes scenarios.
// Each role carries its own command list; both lists are optional and are
// omitted from the serialized JSON when empty.
type MultiHostCommands struct {
// Leader holds the commands for the leader role (JSON key "leader").
// +optional
Leader []string `json:"leader,omitempty"`
// Worker holds the commands for the worker role (JSON key "worker").
// +optional
Worker []string `json:"worker,omitempty"`
}

// RecommendedConfig represents the recommended configurations for the backendRuntime,
// user can choose one of them to apply.
type RecommendedConfig struct {
Expand Down Expand Up @@ -89,10 +79,6 @@ type BackendRuntimeSpec struct {
// Commands represents the default commands for the backendRuntime.
// +optional
Commands []string `json:"commands,omitempty"`
// MultiHostCommands represents leader and worker commands for nodes with
// different roles.
// +optional
MultiHostCommands *MultiHostCommands `json:"multiHostCommands,omitempty"`
// Image represents the default image registry of the backendRuntime.
// It will work together with version to make up a real image.
Image string `json:"image"`
Expand Down
13 changes: 3 additions & 10 deletions api/inference/v1alpha1/config_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,10 @@ type BackendRuntimeConfig struct {
Envs []corev1.EnvVar `json:"envs,omitempty"`
// ConfigName represents the recommended configuration name for the backend.
// It will be inferred from the models in the runtime if not specified, e.g. default,
// speculative-decoding or model-parallelism.
// speculative-decoding.
ConfigName *string `json:"configName,omitempty"`
// Args represents all the arguments for the command.
// An argument wrapped in {{ .CONFIG }} is a configuration placeholder waiting to be rendered.
// +optional
// Args defined here will "append" the args in the recommendedConfig.
// Args defined here will "append" the args defined in the recommendedConfig,
// either explicitly configured in configName or inferred in the runtime.
// +optional
Args []string `json:"args,omitempty"`
// Resources represents the resource requirements for backend, like cpu/mem,
Expand All @@ -60,11 +58,6 @@ type BackendRuntimeConfig struct {
// SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig.
// +optional
SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"`
// ScaleTrigger defines the rules to scale the workloads.
// Only one trigger could work at a time, mostly used in Playground.
// ScaleTrigger defined here will "overwrite" the scaleTrigger in the recommendedConfig.
// +optional
ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
}

// TODO: Do not support DRA yet, we can support that once needed.
Expand Down
5 changes: 5 additions & 0 deletions api/inference/v1alpha1/playground_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ type ElasticConfig struct {
// Default to nil means there's no limit for the instance number.
// +optional
MaxReplicas *int32 `json:"maxReplicas,omitempty"`
// ScaleTrigger defines the rules to scale the workloads.
// Only one trigger could work at a time, mostly used in Playground.
// ScaleTrigger defined here will "overwrite" the scaleTrigger in the recommendedConfig.
// +optional
ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
}

const (
Expand Down
16 changes: 10 additions & 6 deletions api/inference/v1alpha1/service_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,16 @@ const (
// ServiceSpec groups the model claims of a service together with the replica
// count, pod template and rollout strategy of its underlying workload.
// NOTE(review): this rendered diff hunk shows both the removed and the added
// WorkloadTemplate declarations without +/- markers; presumably only the
// lws.LeaderWorkerTemplate variant remains after the change — confirm against
// the repository.
type ServiceSpec struct {
// ModelClaims represents multiple claims for different models.
ModelClaims coreapi.ModelClaims `json:"modelClaims,omitempty"`
// WorkloadTemplate defines the underlying workload layout and configuration.
// Note: the LWS spec might be twisted with various LWS instances to support
// accelerator fungibility or other cutting-edge researches.
// LWS supports both single-host and multi-host scenarios, for single host
// cases, only need to care about replicas, rolloutStrategy and workerTemplate.
WorkloadTemplate lws.LeaderWorkerSetSpec `json:"workloadTemplate"`
// Replicas represents the replica number of inference workloads.
// +kubebuilder:default=1
// +optional
Replicas *int32 `json:"replicas,omitempty"`
// WorkloadTemplate defines the template for leader/worker pods.
WorkloadTemplate lws.LeaderWorkerTemplate `json:"workloadTemplate"`
// RolloutStrategy defines the strategy that will be applied to update replicas
// when a revision is made to the leaderWorkerTemplate.
// +optional
RolloutStrategy lws.RolloutStrategy `json:"rolloutStrategy,omitempty"`
}

const (
Expand Down
46 changes: 11 additions & 35 deletions api/inference/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 0 additions & 16 deletions chart/crds/backendruntime-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -322,22 +322,6 @@ spec:
format: int32
type: integer
type: object
multiHostCommands:
description: |-
MultiHostCommands represents leader and worker commands for nodes with
different roles.
properties:
leader:
description: Leader commands.
items:
type: string
type: array
worker:
description: Worker commands.
items:
type: string
type: array
type: object
readinessProbe:
description: |-
Periodic probe of backend readiness.
Expand Down
77 changes: 38 additions & 39 deletions chart/crds/playground-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,8 @@ spec:
properties:
args:
description: |-
Args represents all the arguments for the command.
An argument wrapped in {{ .CONFIG }} is a configuration placeholder waiting to be rendered.
Args defined here will "append" the args in the recommendedConfig.
Args defined here will "append" the args defined in the recommendedConfig,
either explicitly configured in configName or inferred in the runtime.
items:
type: string
type: array
Expand All @@ -62,7 +61,7 @@ spec:
description: |-
ConfigName represents the recommended configuration name for the backend.
It will be inferred from the models in the runtime if not specified, e.g. default,
speculative-decoding or model-parallelism.
speculative-decoding.
type: string
envs:
description: Envs represents the environments set to the container.
Expand Down Expand Up @@ -216,6 +215,41 @@ spec:
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
type: object
type: object
sharedMemorySize:
anyOf:
- type: integer
- type: string
description: |-
SharedMemorySize represents the size of /dev/shm required in the runtime of
inference workload.
SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig.
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
version:
description: |-
Version represents the backend version if you want a different one
from the default version.
type: string
type: object
elasticConfig:
description: |-
ElasticConfig defines the configuration for elastic usage,
e.g. the max/min replicas.
properties:
maxReplicas:
description: |-
MaxReplicas indicates the maximum number of inference workloads based on the traffic.
Default to nil means there's no limit for the instance number.
format: int32
type: integer
minReplicas:
default: 1
description: |-
MinReplicas indicates the minimum number of inference workloads based on the traffic.
Default to 1.
MinReplicas can't be 0 for now; serverless will be supported in the future.
format: int32
type: integer
scaleTrigger:
description: |-
ScaleTrigger defines the rules to scale the workloads.
Expand Down Expand Up @@ -829,41 +863,6 @@ spec:
type: array
type: object
type: object
sharedMemorySize:
anyOf:
- type: integer
- type: string
description: |-
SharedMemorySize represents the size of /dev/shm required in the runtime of
inference workload.
SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig.
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
version:
description: |-
Version represents the backend version if you want a different one
from the default version.
type: string
type: object
elasticConfig:
description: |-
ElasticConfig defines the configuration for elastic usage,
e.g. the max/min replicas.
properties:
maxReplicas:
description: |-
MaxReplicas indicates the maximum number of inference workloads based on the traffic.
Default to nil means there's no limit for the instance number.
format: int32
type: integer
minReplicas:
default: 1
description: |-
MinReplicas indicates the minimum number of inference workloads based on the traffic.
Default to 1.
MinReplicas can't be 0 for now; serverless will be supported in the future.
format: int32
type: integer
type: object
modelClaim:
description: |-
Expand Down
Loading
Loading
0