fix: fix missing num_remote_prefill_groups in vLLM patch by ptarasiewiczNV · Pull Request #981 · ai-dynamo/dynamo · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

fix: fix missing num_remote_prefill_groups in vLLM patch #981

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 7, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 35 additions & 28 deletions container/deps/vllm/vllm_v0.8.4-dynamo-kv-disagg-patch.patch
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,7 @@ index 000000000..79eb8db67
+
+ self.event_id_counter += 1
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index cf85a2135..f9087b5c3 100644
index cf85a2135..f157aa231 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -1,16 +1,30 @@
Expand Down Expand Up @@ -702,15 +702,21 @@ index cf85a2135..f9087b5c3 100644
running_queue = self.running
assert len(self._async_stopped) == 0
while running_queue:
@@ -1073,6 +1138,7 @@ class Scheduler:
@@ -1068,11 +1133,13 @@ class Scheduler:
ignored_seq_groups=[],
num_lookahead_slots=self._get_num_lookahead_slots(
is_prefill=True, enable_chunking=enable_chunking),
+ num_remote_prefill_groups=0
)
ignored_seq_groups: List[SequenceGroup] = []
seq_groups: List[ScheduledSequenceGroup] = []

waiting_queue = self.waiting
+ num_remote_prefill_groups = 0

leftover_waiting_sequences: Deque[SequenceGroup] = deque()
while self._passed_delay(time.time()) and waiting_queue:
@@ -1121,8 +1187,10 @@ class Scheduler:
@@ -1121,8 +1188,10 @@ class Scheduler:
True, enable_chunking)

# If the sequence group cannot be allocated, stop.
Expand All @@ -722,7 +728,7 @@ index cf85a2135..f9087b5c3 100644
if can_allocate == AllocStatus.LATER:
break
elif can_allocate == AllocStatus.NEVER:
@@ -1170,7 +1238,18 @@ class Scheduler:
@@ -1170,7 +1239,18 @@ class Scheduler:
if curr_loras is not None and lora_int_id > 0:
curr_loras.add(lora_int_id)
waiting_queue.popleft()
Expand All @@ -742,7 +748,7 @@ index cf85a2135..f9087b5c3 100644

if partial_prefill_metadata is not None:
partial_prefill_metadata.maybe_increment_partial_prefills(
@@ -1214,9 +1293,10 @@ class Scheduler:
@@ -1214,9 +1294,10 @@ class Scheduler:
ignored_seq_groups=ignored_seq_groups,
num_lookahead_slots=self._get_num_lookahead_slots(
is_prefill=True, enable_chunking=enable_chunking),
Expand All @@ -754,7 +760,7 @@ index cf85a2135..f9087b5c3 100644
"""Schedule queued requests.

The current policy is designed to optimize the throughput. First,
@@ -1234,6 +1314,9 @@ class Scheduler:
@@ -1234,6 +1315,9 @@ class Scheduler:
for seq_group in self.running:
budget.add_num_seqs(seq_group.request_id,
seq_group.get_max_num_running_seqs())
Expand All @@ -764,7 +770,7 @@ index cf85a2135..f9087b5c3 100644
curr_loras = (set(
seq_group.lora_int_id for seq_group in self.running
if seq_group.lora_int_id > 0) if self.lora_enabled else None)
@@ -1258,7 +1341,9 @@ class Scheduler:
@@ -1258,7 +1342,9 @@ class Scheduler:
if len(prefills.seq_groups) == 0:
running_scheduled = self._schedule_running(budget,
curr_loras,
Expand All @@ -775,7 +781,7 @@ index cf85a2135..f9087b5c3 100644

# If any sequence group is preempted, do not swap in any sequence
# group. because it means there's no slot for new running requests.
@@ -1275,7 +1360,12 @@ class Scheduler:
@@ -1275,7 +1361,12 @@ class Scheduler:
self.waiting.extendleft(running_scheduled.preempted)
# Update new running requests.
if len(prefills.seq_groups) > 0:
Expand All @@ -789,7 +795,7 @@ index cf85a2135..f9087b5c3 100644

self.running.extend(running_scheduled.decode_seq_groups_list)

@@ -1452,12 +1542,14 @@ class Scheduler:
@@ -1452,12 +1543,14 @@ class Scheduler:
]
return finishing + not_finishing

Expand All @@ -806,7 +812,7 @@ index cf85a2135..f9087b5c3 100644

def _can_append_slots(self, seq_group: SequenceGroup,
enable_chunking: bool) -> bool:
@@ -1491,14 +1583,16 @@ class Scheduler:
@@ -1491,14 +1584,16 @@ class Scheduler:
return no_single_seq

def schedule(
Expand All @@ -826,7 +832,7 @@ index cf85a2135..f9087b5c3 100644
now = time.time()

if not self.cache_config.enable_prefix_caching:
@@ -1537,7 +1631,8 @@ class Scheduler:
@@ -1537,7 +1632,8 @@ class Scheduler:
encoder_seq_data = None
cross_block_table = None

Expand All @@ -836,7 +842,7 @@ index cf85a2135..f9087b5c3 100644
seq_id = seq.seq_id
seq_data[seq_id] = seq.data
block_tables[seq_id] = self.block_manager.get_block_table(seq)
@@ -1546,7 +1641,9 @@ class Scheduler:
@@ -1546,7 +1642,9 @@ class Scheduler:
if self.cache_config.enable_prefix_caching:
common_computed_block_nums = (
self.block_manager.get_common_computed_block_ids(
Expand All @@ -847,7 +853,7 @@ index cf85a2135..f9087b5c3 100644

do_sample = True
is_prompt = seq_group.is_prefill()
@@ -1568,9 +1665,30 @@ class Scheduler:
@@ -1568,9 +1666,30 @@ class Scheduler:
< seqs[0].data.get_len()):
do_sample = False

Expand Down Expand Up @@ -878,15 +884,15 @@ index cf85a2135..f9087b5c3 100644
seq_group_metadata = SequenceGroupMetadata(
request_id=seq_group.request_id,
is_prompt=is_prompt,
@@ -1598,6 +1716,7 @@ class Scheduler:
@@ -1598,6 +1717,7 @@ class Scheduler:
if scheduler_outputs.num_prefill_groups > 0 else None),
mm_processor_kwargs=seq_group.mm_processor_kwargs,
prompt_adapter_request=seq_group.prompt_adapter_request,
+ do_remote_prefill=is_remote_prefill,
)
else:
# When SPMD mode is enabled, we only send delta data except for
@@ -1696,10 +1815,16 @@ class Scheduler:
@@ -1696,10 +1816,16 @@ class Scheduler:

self._async_stopped.clear()

Expand Down Expand Up @@ -1039,10 +1045,10 @@ index 000000000..a2f9ce99e
\ No newline at end of file
diff --git a/vllm/distributed/device_communicators/nixl.py b/vllm/distributed/device_communicators/nixl.py
new file mode 100644
index 000000000..4c5ed707f
index 000000000..bd4ac984e
--- /dev/null
+++ b/vllm/distributed/device_communicators/nixl.py
@@ -0,0 +1,447 @@
@@ -0,0 +1,445 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
Expand Down Expand Up @@ -1487,8 +1493,6 @@ index 000000000..4c5ed707f
+ done_req_ids.append(req_id)
+ else:
+ self._transfers[req_id] = running_reqs
+ for req_id in done_req_ids:
+ del self._transfers[req_id]
+ return done_req_ids
diff --git a/vllm/distributed/kv_transfer/kv_connector/dynamo_connector.py b/vllm/distributed/kv_transfer/kv_connector/dynamo_connector.py
new file mode 100644
Expand Down Expand Up @@ -2892,7 +2896,7 @@ index 975afe5ad..2208abea0 100644
use_v1 = True

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 54f7b8fb6..0559f9db2 100644
index 54f7b8fb6..9c1c2635f 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1,11 +1,28 @@
Expand Down Expand Up @@ -3135,7 +3139,7 @@ index 54f7b8fb6..0559f9db2 100644

# Skip the scheduler if there are any remaining steps in the seq groups.
# This ensures that the scheduler is only called again when the current
@@ -1372,7 +1452,41 @@ class LLMEngine:
@@ -1372,7 +1452,43 @@ class LLMEngine:
# Schedule iteration
(seq_group_metadata_list, scheduler_outputs,
allow_async_output_proc
Expand Down Expand Up @@ -3165,6 +3169,7 @@ index 54f7b8fb6..0559f9db2 100644
+ logger.debug("No blocks to prefill")
+ self._finished_prefills.add(seq_group_metadata.request_id)
+ continue
+
+ remote_prefill_request = RemotePrefillRequest(
+ request_id=seq_group_metadata.request_id,
+ # prompt_token_ids=scheduled_seq_group.seq_group.seqs[0].inputs.prompt_token_ids[:-1], # last one will be decoded on decode for sampling anyway
Expand All @@ -3173,12 +3178,13 @@ index 54f7b8fb6..0559f9db2 100644
+ block_ids=block_table,
+ engine_id=self.engine_id,
+ computed_block_ids=seq_group_metadata.computed_block_nums,
+ multimodal_data_source=scheduled_seq_group.seq_group.remote_prefill_params.multimodal_data_source
+ )
+ scheduled_seq_group.seq_group.remote_prefill_params.remote_prefill_request_callback(remote_prefill_request)

ctx.seq_group_metadata_list = seq_group_metadata_list
ctx.scheduler_outputs = scheduler_outputs
@@ -1427,8 +1541,46 @@ class LLMEngine:
@@ -1427,8 +1543,46 @@ class LLMEngine:
execute_model_req.async_callback = self.async_callbacks[
virtual_engine]

Expand Down Expand Up @@ -3226,15 +3232,15 @@ index 54f7b8fb6..0559f9db2 100644
execute_model_req=execute_model_req)
self._skip_scheduling_next_step = False
except InputProcessingError as e:
@@ -1444,7 +1596,6 @@ class LLMEngine:
@@ -1444,7 +1598,6 @@ class LLMEngine:
allow_async_output_proc=allow_async_output_proc)
# Raise so the caller is notified that this request failed
raise
-
# We need to do this here so that last step's sampled_token_ids can
# be passed to the next iteration for PP.
if self.scheduler_config.is_multi_step:
@@ -1455,7 +1606,26 @@ class LLMEngine:
@@ -1455,7 +1608,26 @@ class LLMEngine:
if len(ctx.output_queue) > 0:
self._process_model_outputs(ctx=ctx)
# No outputs in this case
Expand Down Expand Up @@ -3262,7 +3268,7 @@ index 54f7b8fb6..0559f9db2 100644

# Finish the current step for all the sequence groups.
if self.scheduler_config.is_multi_step:
@@ -1515,7 +1685,7 @@ class LLMEngine:
@@ -1515,7 +1687,7 @@ class LLMEngine:
# queued control plane messages, such as add/remove lora adapters.
logger.debug("Stopping remote worker execution loop.")
self.model_executor.stop_remote_worker_execution_loop()
Expand Down Expand Up @@ -4174,10 +4180,10 @@ index 0ed221043..08dbc0e78 100644
"The vLLM package was not found, so its version could not be "
diff --git a/vllm/remote_prefill.py b/vllm/remote_prefill.py
new file mode 100644
index 000000000..83f6cd575
index 000000000..0a063f1ca
--- /dev/null
+++ b/vllm/remote_prefill.py
@@ -0,0 +1,82 @@
@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
Expand Down Expand Up @@ -4223,6 +4229,7 @@ index 000000000..83f6cd575
+ sampling_params: SamplingParams
+ block_ids: List[int]
+ computed_block_ids: List[int]
+ multimodal_data_source: Optional[dict[str, str]] = None
+
+
+class MemoryOpType(str, Enum):
Expand Down Expand Up @@ -4260,7 +4267,7 @@ index 000000000..83f6cd575
+ decode_computed_block_ids: Optional[List[int]] = None
+ decode_engine_id: Optional[str] = None
+ remote_prefill_request_callback: Optional[RemotePrefillRequestCallback] = None
\ No newline at end of file
+ multimodal_data_source: Optional[dict[str, str]] = None
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 68ed99664..5b0b7e6dc 100644
--- a/vllm/sampling_params.py
Expand Down
Loading
0