feat: Qwen3, Gemma3 and Llama4 support by grahamking · Pull Request #1002 · ai-dynamo/dynamo · GitHub

feat: Qwen3, Gemma3 and Llama4 support #1002

Merged · 1 commit merged on May 8, 2025

418 changes: 382 additions & 36 deletions Cargo.lock

Large diffs are not rendered by default.

37 changes: 24 additions & 13 deletions docs/guides/dynamo_run.md
@@ -26,7 +26,7 @@ Usage:
dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin]
```

Example: `dynamo run Qwen/Qwen2.5-3B-Instruct`.
Example: `dynamo run Qwen/Qwen3-0.6B`

Set environment variable `DYN_LOG` to adjust logging level, e.g. `export DYN_LOG=debug`. It has the same syntax as `RUST_LOG`, ask AI for details.

@@ -38,9 +38,9 @@ The vllm and sglang engines require [etcd](https://etcd.io/) and [nats](https://

### Use model from Hugging Face

This will automatically download Qwen2.5 3B from Hugging Face (6 GiB download) and start it in interactive text mode:
This will automatically download Qwen3 4B from Hugging Face (16 GiB download) and start it in interactive text mode:
```
dynamo run out=vllm Qwen/Qwen2.5-3B-Instruct
dynamo run out=vllm Qwen/Qwen3-4B
```

General format for HF download:
@@ -65,12 +65,12 @@ curl -L -o Llama-3.2-3B-Instruct-Q4_K_M.gguf "https://huggingface.co/bartowski/L
#### Run model from local file
**Text interface**
```
dynamo run out=vllm Llama-3.2-3B-Instruct-Q4_K_M.gguf # or path to a Hugging Face repo checkout instead of the GGUF
dynamo run Llama-3.2-3B-Instruct-Q4_K_M.gguf # or path to a Hugging Face repo checkout instead of the GGUF
```

**HTTP interface**
```
dynamo run in=http out=vllm Llama-3.2-3B-Instruct-Q4_K_M.gguf
dynamo run in=http out=mistralrs Llama-3.2-3B-Instruct-Q4_K_M.gguf
```

**List the models**
@@ -94,7 +94,7 @@ You will need [etcd](https://etcd.io/) and [nats](https://nats.io) with jetstrea
OpenAI compliant HTTP server, optional pre-processing, worker discovery.

```
dynamo run in=http out=dyn://llama3B_pool
dynamo-run in=http out=dyn://llama3B_pool
```

**Node 2:**
@@ -109,11 +109,11 @@ This will use etcd to auto-discover the model and NATS to talk to it. You can ru

The `llama3B_pool` name is purely symbolic, pick anything as long as it matches the other node.

Run `dynamo run --help` for more options.
Run `dynamo-run --help` for more options.

## Full usage details

`dynamo-run` is what `dynamo run` executes. It is an example of what you can build in Rust with the `dynamo-llm` and `dynamo-runtime`. The following guide demonstrates how you can build from source with all the features.
`dynamo-run` is what `dynamo run` executes. It is also an example of what you can build in Rust with the `dynamo-llm` and `dynamo-runtime` crates. The following guide demonstrates how you can build from source with all the features.
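
To make that concrete, here is a minimal sketch of the kind of wiring such a binary performs: prepare a local model, build an engine for it, and hand both to the runtime. Only the three calls shown (`LocalModel::prepare`, `dynamo_engine_mistralrs::make_engine`, `EngineConfig::StaticFull`) are taken from the `launch/dynamo-run/src/lib.rs` diff further down this page; the imports, parameter types and surrounding plumbing are assumptions, not the crates' exact public API.

```
// Hedged sketch only: the three dynamo calls below appear in this PR's
// lib.rs diff; everything around them (imports, parameter types, where
// EngineConfig and LocalModel are defined) is assumed for illustration.
use std::path::Path;

use anyhow::Context;

async fn static_mistralrs_config(
    model_path: &Path,           // local HF checkout or GGUF file
    model_config: Option<&Path>, // optional --model-config folder (type assumed)
    model_name: Option<String>,  // optional --model-name override
) -> anyhow::Result<EngineConfig> {
    // Resolve the weights plus pre-processor files (tokenizer.json, etc.).
    let local_model = LocalModel::prepare(
        model_path.to_str().context("Invalid UTF-8 in model path")?,
        model_config,
        model_name,
    )
    .await?;

    // Build the in-process mistral.rs engine and pair it with the model.
    Ok(EngineConfig::StaticFull {
        engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
        model: Box::new(local_model),
    })
}
```

The real `run` function in `lib.rs` does the equivalent behind a `match` on the selected `Output` engine.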

### Setup

@@ -181,13 +181,13 @@ Build with `--release` for a smaller binary and better performance, but longer b
[mistral.rs](https://github.com/EricLBuehler/mistral.rs) is a pure Rust engine that is fast to run, fast to load, supports GGUF as well as safetensors, and runs well on CPU as well as GPU. For those reasons it is the default engine.

```
dynamo-run Qwen/Qwen2.5-3B-Instruct
dynamo-run Qwen/Qwen3-4B
```

is equivalent to

```
dynamo-run in=text out=mistralrs Qwen/Qwen2.5-3B-Instruct
dynamo-run in=text out=mistralrs Qwen/Qwen3-4B
```

If you have multiple GPUs, mistral.rs does automatic tensor parallelism. You do not need to pass any extra flags to dynamo-run to enable it.
@@ -204,7 +204,14 @@ cargo build --features llamacpp[,cuda|metal|vulkan] -p dynamo-run
dynamo-run out=llamacpp ~/llms/Llama-3.2-3B-Instruct-Q6_K.gguf
```

llamacpp is best for single-GPU inference with a quantized GGUF model file.
Note that in some cases we are unable to extract the tokenizer from the GGUF, and so a Hugging Face checkout of a matching model must also be passed. Dynamo will use the weights from the GGUF and the pre-processor (`tokenizer.json`, etc) from the `--model-config`:
```
dynamo-run out=llamacpp ~/llms/gemma-3-1b-it-q4_0.gguf --model-config ~/llms/gemma-3-1b-it

dynamo-run out=llamacpp ~/llms/Llama-4-Scout-17B-16E-Instruct-UD-IQ1_S.gguf --model-config ~/llms/Llama-4-Scout-17B-16E-Instruct
```

If you have multiple GPUs, llama.cpp does automatic tensor parallelism. You do not need to pass any extra flags to dynamo-run to enable it.

### sglang

@@ -233,9 +240,13 @@ To pass extra arguments to the sglang engine see *Extra engine arguments* below.

**Multi-GPU**

Pass `--tensor-parallel-size <NUM-GPUS>` to `dynamo-run`. To specify which GPU to start from pass `--base-gpu-id <num>`.
Pass `--tensor-parallel-size <NUM-GPUS>` to `dynamo-run`.

```
dynamo-run out=sglang ~/llms/Llama-4-Scout-17B-16E-Instruct/ --tensor-parallel-size 8
```

For example on a shared eight GPU machine where GPUs 0-3 are already in use:
To specify which GPU to start from pass `--base-gpu-id <num>`, for example on a shared eight GPU machine where GPUs 0-3 are already in use:
```
dynamo-run out=sglang <model> --tensor-parallel-size 4 --base-gpu-id 4
```
13 changes: 2 additions & 11 deletions launch/dynamo-run/src/lib.rs
@@ -16,10 +16,6 @@ pub use dynamo_llm::request_template::RequestTemplate;
pub use opt::{Input, Output};
mod subprocess;

/// When `in=text` the user doesn't need to know the model name, and doesn't need to provide it on
/// the command line. Hence it's optional, and defaults to this.
const INVISIBLE_MODEL_NAME: &str = "dynamo-run";

const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);

/// How we identify a python string endpoint
@@ -64,15 +60,10 @@ pub async fn run(
_ => {
match &maybe_path {
Some(model_path) => {
let maybe_model_name = if in_opt == Input::Text {
Some(INVISIBLE_MODEL_NAME.to_string())
} else {
flags.model_name.clone()
};
LocalModel::prepare(
model_path.to_str().context("Invalid UTF-8 in model path")?,
flags.model_config.as_deref(),
maybe_model_name,
flags.model_name.clone(),
)
.await?
}
@@ -121,7 +112,7 @@
}
#[cfg(feature = "mistralrs")]
Output::MistralRs => EngineConfig::StaticFull {
engine: dynamo_engine_mistralrs::make_engine(local_model.path()).await?,
engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
model: Box::new(local_model),
},
Output::SgLang => {
8 changes: 4 additions & 4 deletions lib/bindings/python/Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion lib/engines/llamacpp/Cargo.toml
@@ -38,4 +38,4 @@ async-stream = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }

llama-cpp-2 = { version = "0.1.102" }
llama-cpp-2 = { version = "0.1.103" }
3 changes: 1 addition & 2 deletions lib/engines/llamacpp/src/lib.rs
@@ -228,9 +228,8 @@ fn run_request(
limit,
);

// create a llama_batch with size 512
// we use this object to submit token data for decoding
let mut batch = LlamaBatch::new(512, 1);
let mut batch = LlamaBatch::new(std::cmp::max(512, max_output_tokens as usize), 1);
let last_index: i32 = (tokens_list.len() - 1) as i32;
for (i, token) in (0_i32..).zip(tokens_list.into_iter()) {
// llama_decode will output logits only for the last token of the prompt
2 changes: 1 addition & 1 deletion lib/engines/mistralrs/Cargo.toml
@@ -40,7 +40,7 @@ async-trait = { workspace = true }
candle-core = { version = "0.8.0" }
either = { workspace = true }
indexmap = { version = "2.6" }
mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git", rev = "aaafc2ef" }
mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git", rev = "ebd50e35e" }
serde_json = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }