8000 docs: update dynamo serve trtllm agg example yaml files by ziqif-nv · Pull Request #600 · ai-dynamo/dynamo · GitHub
[go: up one dir, main page]
Skip to content

docs: update dynamo serve trtllm agg example yaml files #600

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits on Apr 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions examples/tensorrt_llm/configs/agg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,12 @@ Frontend:

Processor:
engine_args: "configs/llm_api_config.yaml"
block-size: 64
router: round-robin

TensorRTLLMWorker:
engine_args: "configs/llm_api_config.yaml"
router: random
router: round-robin
ServiceArgs:
workers: 1
resources:
gpu: 1
gpu: 1
5 changes: 2 additions & 3 deletions examples/tensorrt_llm/configs/agg_router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,16 @@ Frontend:

Processor:
engine_args: "configs/llm_api_config.yaml"
block-size: 64
router: kv

Router:
model-name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
min-workers: 1

TensorRTLLMWorker:
engine_args: "configs/llm_api_config.yaml"
engine_args: "configs/llm_api_config_router.yaml"
router: kv
ServiceArgs:
workers: 1
resources:
gpu: 1
gpu: 1
12 changes: 4 additions & 8 deletions examples/tensorrt_llm/configs/llm_api_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,15 @@ model_path: null
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 10240
max_num_tokens: 8192
max_batch_size: 16
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true

kv_cache_config:
free_gpu_memory_fraction: 0.95
# Uncomment to enable kv cache event collection
#event_buffer_max_size: 1024
#enable_block_reuse: true

pytorch_backend_config:
enable_overlap_scheduler: false
use_cuda_graph: false
# Uncomment to enable iter perf stats
#enable_iter_perf_stats: true
enable_overlap_scheduler: true
use_cuda_graph: true
39 changes: 39 additions & 0 deletions examples/tensorrt_llm/configs/llm_api_config_router.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# In the case of disaggregated deployment, this config will apply to each server
# and will be overwritten by the disaggregated config file

model_name: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
model_path: null
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 8192
max_batch_size: 16
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true

kv_cache_config:
free_gpu_memory_fraction: 0.95
event_buffer_max_size: 1024
enable_block_reuse: true

pytorch_backend_config:
enable_overlap_scheduler: true
use_cuda_graph: true
enable_iter_perf_stats: true
Loading
0