8000 docs: update dynamo serve trtllm agg example yaml files by ziqif-nv · Pull Request #600 · ai-dynamo/dynamo · GitHub
[go: up one dir, main page]
Skip to content

docs: update dynamo serve trtllm agg example yaml files #600

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits on Apr 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions examples/tensorrt_llm/configs/agg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,12 @@ Frontend:

Processor:
engine_args: "configs/llm_api_config.yaml"
block-size: 64
router: round-robin

TensorRTLLMWorker:
engine_args: "configs/llm_api_config.yaml"
router: random
router: round-robin
ServiceArgs:
workers: 1
resources:
gpu: 1
gpu: 1
5 changes: 2 additions & 3 deletions examples/tensorrt_llm/configs/agg_router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,16 @@ Frontend:

Processor:
engine_args: "configs/llm_api_config.yaml"
block-size: 64
router: kv

Router:
model-name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
min-workers: 1

TensorRTLLMWorker:
engine_args: "configs/llm_api_config.yaml"
engine_args: "configs/llm_api_config_router.yaml"
router: kv
ServiceArgs:
workers: 1
resources:
gpu: 1
gpu: 1
12 changes: 4 additions & 8 deletions examples/tensorrt_llm/configs/llm_api_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,15 @@ model_path: null
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 10240
max_num_tokens: 8192
max_batch_size: 16
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true

kv_cache_config:
free_gpu_memory_fraction: 0.95
# Uncomment to enable kv cache event collection
#event_buffer_max_size: 1024
#enable_block_reuse: true

pytorch_backend_config:
enable_overlap_scheduler: false
use_cuda_graph: false
# Uncomment to enable iter perf stats
#enable_iter_perf_stats: true
enable_overlap_scheduler: true
use_cuda_graph: true
39 changes: 39 additions & 0 deletions examples/tensorrt_llm/configs/llm_api_config_router.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# In the case of disaggregated deployment, this config will apply to each server
# and will be overwritten by the disaggregated config file

model_name: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
model_path: null
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 8192
max_batch_size: 16
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true

kv_cache_config:
free_gpu_memory_fraction: 0.95
event_buffer_max_size: 1024
enable_block_reuse: true

pytorch_backend_config:
enable_overlap_scheduler: true
use_cuda_graph: true
enable_iter_perf_stats: true
Loading
0