fix: should route based on waiting requests, not active by PeaBrane · Pull Request #989 · ai-dynamo/dynamo · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

fix: should route based on waiting requests, not active #989

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 8, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 15 additions & 11 deletions lib/llm/src/kv_router/scheduler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,8 +215,10 @@ pub fn process_worker_selection(
.get_mut(&selection.worker_id)
.expect("worker not found");

// Update worker state
worker.data.request_active_slots += 1;
// Update worker state predictively
// Will be overwritten on next polling of metrics
worker.data.num_requests_waiting += 1;
// Assumes radix attention so KV load is only incremented by uncached blocks
worker.data.kv_active_blocks += selection.required_blocks - selection.overlap_blocks as u64;

// Emit event
Expand Down Expand Up @@ -245,7 +247,7 @@ impl WorkerSelector for DefaultWorkerSelector {
assert!(request.isl_tokens > 0);

let mut worker_scores = HashMap::new();
let mut max_active = 0.0;
let mut max_waiting = 0.0;

// Calculate worker scores and find max waiting requests
for (worker_id, ep) in workers.endpoints.iter() {
Expand All @@ -256,16 +258,16 @@ impl WorkerSelector for DefaultWorkerSelector {
}

// Track max waiting requests
max_active = f64::max(max_active, ep.data.request_active_slots as f64);
max_waiting = f64::max(max_waiting, ep.data.num_requests_waiting as f64);
}

if max_active == 0.0 {
if max_waiting == 0.0 {
return Err(KvSchedulerError::NoEndpoints);
}

// make immutable
let worker_scores = worker_scores;
let max_active = max_active;
let max_waiting = max_waiting;

// Calculate logits for each worker
let mut best_logit = f64::NEG_INFINITY;
Expand All @@ -280,22 +282,22 @@ impl WorkerSelector for DefaultWorkerSelector {
// Calculate normalized metrics
assert!(ep.data.kv_total_blocks > 0);
let gpu_cache_usage = ep.data.kv_active_blocks as f64 / ep.data.kv_total_blocks as f64;
let normalized_active = if max_active > 0.0 {
ep.data.request_active_slots as f64 / max_active
let normalized_waiting = if max_waiting > 0.0 {
ep.data.num_requests_waiting as f64 / max_waiting
} else {
0.0
};

// Calculate logit using same formula as Python
let logit = 2.0 * score - gpu_cache_usage - normalized_active;
let logit = 2.0 * score - gpu_cache_usage - normalized_waiting;

tracing::info!(
"Formula for {}: {:.3} = 2.0 * {:.3} - {:.3} - {:.3}",
worker_id,
logit,
score,
gpu_cache_usage,
normalized_active
normalized_waiting
);

// Track best workers
Expand All @@ -313,8 +315,10 @@ impl WorkerSelector for DefaultWorkerSelector {
}

// Return early if no valid workers found
if best_workers.is_empty() || best_logit == 0.0 {
if best_workers.is_empty() {
return Err(KvSchedulerError::NoEndpoints);
} else if best_logit == 0.0 {
tracing::warn!("best worker logit is 0");
}

let worker_id = if best_workers.len() == 1 {
Expand Down
Loading
0