first commit

2026-03-10 13:31:25 +08:00
parent ba974cecfa
commit b62b889355
2604 changed files with 438977 additions and 0 deletions
--- a/vllm_br/v1/engine/init.py
+++ b/vllm_br/v1/engine/init.py
@@ -0,0 +1,19 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+from . import async_llm  # noqa
+from . import core  # noqa: F401
+from . import llm_engine  # noqa
--- a/vllm_br/v1/engine/pycache/init.cpython-310.pyc
+++ b/vllm_br/v1/engine/pycache/init.cpython-310.pyc
--- a/vllm_br/v1/engine/pycache/async_llm.cpython-310.pyc
+++ b/vllm_br/v1/engine/pycache/async_llm.cpython-310.pyc
--- a/vllm_br/v1/engine/pycache/core.cpython-310.pyc
+++ b/vllm_br/v1/engine/pycache/core.cpython-310.pyc
--- a/vllm_br/v1/engine/pycache/llm_engine.cpython-310.pyc
+++ b/vllm_br/v1/engine/pycache/llm_engine.cpython-310.pyc
--- a/vllm_br/v1/engine/async_llm.py
+++ b/vllm_br/v1/engine/async_llm.py
@@ -0,0 +1,179 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+import asyncio
+import os
+import socket
+from typing import Optional
+
+import torch
+from fastcore.basics import patch_to
+
+import vllm.envs as envs
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+from vllm.tracing import init_tracer
+from vllm.transformers_utils.config import (
+    maybe_register_config_serialize_by_value)
+from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
+from vllm.usage.usage_lib import UsageContext
+from vllm.v1.engine.async_llm import AsyncLLM
+from vllm.v1.engine.core_client import EngineCoreClient
+from vllm.v1.engine.output_processor import OutputProcessor
+from vllm.v1.engine.processor import Processor
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager
+from vllm_br import envs as envs_br
+from vllm_br.utils import (create_cpu_all_reduce_shared_mem,
+                           get_cpu_all_reduce_shared_mem)
+
+logger = init_logger(__name__)
+
+
+@patch_to(AsyncLLM)
+def __init__(
+    self,
+    vllm_config: VllmConfig,
+    executor_class: type[Executor],
+    log_stats: bool,
+    usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+    mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+    use_cached_outputs: bool = False,
+    log_requests: bool = True,
+    start_engine_loop: bool = True,
+    stat_loggers: Optional[list[StatLoggerFactory]] = None,
+    client_addresses: Optional[dict[str, str]] = None,
+    client_count: int = 1,
+    client_index: int = 0,
+) -> None:
+    """
+    Create an AsyncLLM.
+
+    This function is installed on ``AsyncLLM`` through ``patch_to``. In
+    addition to building the tokenizer, processors, engine-core client,
+    stat loggers and optional profiler, it calls
+    ``create_cpu_all_reduce_shared_mem()`` when
+    ``envs_br.VLLM_BR_USE_CPU_ALL_REDUCE`` is non-zero.
+
+    Args:
+        vllm_config: global configuration.
+        executor_class: an Executor impl, e.g. MultiprocExecutor.
+        log_stats: Whether to log stats.
+        usage_context: Usage context of the LLM. Not referenced in this
+            body.
+        mm_registry: Multi-modal registry, passed to the Processor.
+        use_cached_outputs: Whether to use cached outputs. Not referenced
+            in this body.
+        log_requests: Whether to log requests.
+        start_engine_loop: Whether to start the engine loop. Not referenced
+            in this body.
+        stat_loggers: customized stat loggers for the engine.
+            If not provided, default stat loggers will be used.
+            PLEASE BE AWARE THAT STAT LOGGER IS NOT STABLE
+            IN V1, AND ITS BASE CLASS INTERFACE MIGHT CHANGE.
+        client_addresses: Forwarded unchanged to
+            ``EngineCoreClient.make_async_mp_client``.
+        client_count: Forwarded to ``make_async_mp_client`` and to
+            ``StatLoggerManager``.
+        client_index: Forwarded to ``make_async_mp_client``.
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If ``envs.VLLM_USE_V1`` is falsy.
+    """
+    if not envs.VLLM_USE_V1:
+        raise ValueError(
+            "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. "
+            "This should not happen. As a workaround, try using "
+            "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
+            "VLLM_USE_V1=0 or 1 and report this issue on Github.")
+    # Set up the CPU all-reduce shared memory before the engine core is
+    # created further down.
+    if envs_br.VLLM_BR_USE_CPU_ALL_REDUCE != 0:
+        create_cpu_all_reduce_shared_mem()
+    # Ensure we can serialize custom transformer configs
+    maybe_register_config_serialize_by_value()
+
+    self.model_config = vllm_config.model_config
+    self.vllm_config = vllm_config
+    self.observability_config = vllm_config.observability_config
+    self.log_requests = log_requests
+
+    # Supplying custom stat loggers turns stats logging on even when
+    # log_stats is False.
+    self.log_stats = log_stats or (stat_loggers is not None)
+    if not log_stats and stat_loggers is not None:
+        logger.info(
+            "AsyncLLM created with log_stats=False and non-empty custom "
+            "logger list; enabling logging without default stat loggers")
+
+    if self.model_config.skip_tokenizer_init:
+        self.tokenizer = None
+    else:
+        # Tokenizer (+ ensure liveness if running in another process).
+        self.tokenizer = init_tokenizer_from_configs(
+            model_config=vllm_config.model_config)
+
+    # Processor (converts Inputs --> EngineCoreRequests).
+    self.processor = Processor(
+        vllm_config=vllm_config,
+        tokenizer=self.tokenizer,
+        mm_registry=mm_registry,
+    )
+
+    # OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
+    self.output_processor = OutputProcessor(self.tokenizer,
+                                            log_stats=self.log_stats)
+    # Attach a tracer only when an OTLP traces endpoint is configured.
+    if self.observability_config.otlp_traces_endpoint is not None:
+        tracer = init_tracer("vllm.llm_engine",
+                             self.observability_config.otlp_traces_endpoint)
+        self.output_processor.tracer = tracer
+
+    # EngineCore (starts the engine in background process).
+    self.engine_core = EngineCoreClient.make_async_mp_client(
+        vllm_config=vllm_config,
+        executor_class=executor_class,
+        log_stats=self.log_stats,
+        client_addresses=client_addresses,
+        client_count=client_count,
+        client_index=client_index,
+    )
+
+    # Loggers.
+    self.logger_manager: Optional[StatLoggerManager] = None  # type: ignore
+    if self.log_stats:
+        self.logger_manager = StatLoggerManager(
+            vllm_config=vllm_config,
+            engine_idxs=self.engine_core.engine_ranks_managed,
+            custom_stat_loggers=stat_loggers,
+            # The raw ``log_stats`` argument (not ``self.log_stats``) is
+            # used here, so default loggers stay disabled when logging was
+            # enabled only because custom loggers were supplied.
+            enable_default_loggers=log_stats,
+            client_count=client_count,
+        )
+        self.logger_manager.log_engine_initialized()
+
+    self.output_handler: Optional[asyncio.Task] = None  # type: ignore
+    try:
+        # Start output handler eagerly if we are in the asyncio eventloop.
+        asyncio.get_running_loop()
+        self._run_output_handler()
+    except RuntimeError:
+        # get_running_loop() raises RuntimeError when no event loop is
+        # running; in that case the handler is not started here.
+        pass
+
+    # The CPU profiler is only built when a trace output directory is set;
+    # otherwise ``self.profiler`` is None.
+    if envs.VLLM_TORCH_PROFILER_DIR:
+        logger.info(
+            "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s",  # noqa: E501
+            envs.VLLM_TORCH_PROFILER_DIR)
+        # Trace files are named after the host and the current process id.
+        worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm"
+        self.profiler = torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+            ],
+            with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
+            on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                envs.VLLM_TORCH_PROFILER_DIR,
+                worker_name=worker_name,
+                use_gzip=True))
+    else:
+        self.profiler = None
+
+
+@patch_to(AsyncLLM)
+def __del__(self):
+    if get_cpu_all_reduce_shared_mem() is not None:
+        get_cpu_all_reduce_shared_mem()._cleanup()
+    self.shutdown()
--- a/vllm_br/v1/engine/core.py
+++ b/vllm_br/v1/engine/core.py
@@ -0,0 +1,157 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+import os
+import time
+from typing import Optional
+
+from fastcore.basics import patch_to
+
+from vllm.config import ParallelConfig, VllmConfig
+from vllm.logger import logger
+from vllm.v1.core.kv_cache_utils import (generate_scheduler_kv_cache_config,
+                                         get_kv_cache_configs)
+from vllm.v1.engine import EngineCoreOutputs
+from vllm.v1.engine.core import EngineCore, EngineCoreProc
+from vllm.v1.kv_cache_interface import KVCacheConfig
+
+
+@patch_to(EngineCore)
+def _initialize_kv_caches(
+        self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
+    start = time.time()
+
+    # Get all kv cache needed by the model
+    kv_cache_specs = self.model_executor.get_kv_cache_specs()
+
+    has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
+    if has_kv_cache:
+        if os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1":
+            dp_group = getattr(self, "dp_group", None)
+            assert dp_group is not None
+            self.available_gpu_memory_for_kv_cache = \
+                ParallelConfig.sync_kv_cache_memory_size(dp_group, -1)
+            available_gpu_memory = [self.available_gpu_memory_for_kv_cache
+                                    ] * len(kv_cache_specs)
+        else:
+            # Profiles the peak memory usage of the model to determine how
+            # much memory can be allocated for kv cache.
+            available_gpu_memory = (
+                self.model_executor.determine_available_memory())
+            self.available_gpu_memory_for_kv_cache = \
+                available_gpu_memory[0]
+    else:
+        # Attention free models don't need memory for kv cache
+        available_gpu_memory = [0] * len(kv_cache_specs)
+    available_gpu_memory = self.model_executor.determine_available_memory()
+    assert len(kv_cache_specs) == len(available_gpu_memory)
+
+    kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs,
+                                            available_gpu_memory)
+    scheduler_kv_cache_config = generate_scheduler_kv_cache_config(
+        kv_cache_configs)
+    num_gpu_blocks = scheduler_kv_cache_config.num_blocks
+    num_cpu_blocks = 0
+
+    # Initialize kv cache and warmup the execution
+    self.model_executor.initialize_from_config(kv_cache_configs)
+
+    elapsed = time.time() - start
+    logger.info(("init engine (profile, create kv cache, "
+                 "warmup model) took %.2f seconds"), elapsed)
+    return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config
+
+
+@patch_to(EngineCore)
+def step_with_batch_queue(
+        self) -> tuple[Optional[dict[int, EngineCoreOutputs]], bool]:
+    """Schedule and execute batches with the batch queue.
+    Note that if nothing to output in this step, None is returned.
+
+    The execution flow is as follows:
+    1. Try to schedule a new batch if the batch queue is not full.
+    If a new batch is scheduled, directly return an empty engine core
+    output. In other words, fulfilling the batch queue has a higher priority
+    than getting model outputs.
+    2. If there is no new scheduled batch, meaning that the batch queue
+    is full or no other requests can be scheduled, we block until the first
+    batch in the job queue is finished.
+    3. Update the scheduler from the output.
+    """
+    batch_queue = self.batch_queue
+    assert batch_queue is not None
+
+    # Try to schedule a new batch if the batch queue is not full, but
+    # the scheduler may return an empty batch if all requests are scheduled.
+    # Note that this is not blocking.
+    assert len(batch_queue) < self.batch_queue_size
+
+    model_executed = False
+    if self.scheduler.has_requests():
+        scheduler_output = self.scheduler.schedule()
+        future = self.model_executor.execute_model(scheduler_output,
+                                                   non_block=True)
+        batch_queue.appendleft(
+            (future, scheduler_output))  # type: ignore[arg-type]
+
+        model_executed = scheduler_output.total_num_scheduled_tokens > 0
+        if model_executed and len(batch_queue) < self.batch_queue_size \
+            and not batch_queue[-1][0].done():
+            # Don't block on next worker response unless the queue is full
+            # or there are no more requests to schedule.
+            return None, True
+
+    elif not batch_queue:
+        # Queue is empty. We should not reach here since this method should
+        # only be called when the scheduler contains requests or the queue
+        # is non-empty.
+        return None, False
+
+    # Block until the next result is available.
+    future, scheduler_output = batch_queue.pop()
+    model_output = self.execute_model_with_error_logging(
+        lambda _: future.result(), scheduler_output)
+    if scheduler_output.total_num_scheduled_tokens != 0:
+        engine_core_outputs = self.scheduler.update_from_output(
+            scheduler_output, model_output)
+        if self.use_spec_decode:
+            # Take the draft token ids.
+            # draft_token_ids = self.model_executor.take_draft_token_ids()
+            if model_output.draft_token_ids is not None:
+                model_output.draft_token_ids.req_ids = model_output.req_ids
+                self.scheduler.update_draft_token_ids(
+                    model_output.draft_token_ids)
+        else:
+            pass
+        return engine_core_outputs, model_executed
+    else:
+        return None, False
+
+
+@patch_to(EngineCoreProc)
+def _process_engine_step(self) -> bool:
+    """Called only when there are unfinished local requests."""
+
+    # Step the engine core.
+    outputs, model_executed = self.step_fn()
+    # Put EngineCoreOutputs into the output queue.
+    for output in (outputs.items() if outputs else ()):
+        self.output_queue.put_nowait(output)
+    # Post-step hook.
+    # if outputs is not None:
+    #     self.post_step(model_executed)
+
+    return model_executed
--- a/vllm_br/v1/engine/llm_engine.py
+++ b/vllm_br/v1/engine/llm_engine.py
@@ -0,0 +1,143 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+from typing import Optional
+
+from fastcore.basics import patch_to
+
+import vllm.envs as envs
+from vllm.config import VllmConfig
+from vllm.distributed import stateless_destroy_torch_distributed_process_group
+from vllm.distributed.parallel_state import get_dp_group
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+from vllm.tracing import init_tracer
+from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
+from vllm.usage.usage_lib import UsageContext
+from vllm.v1.engine.core_client import EngineCoreClient
+from vllm.v1.engine.llm_engine import LLMEngine
+from vllm.v1.engine.output_processor import OutputProcessor
+from vllm.v1.engine.processor import Processor
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager
+from vllm_br import envs as envs_br
+from vllm_br.utils import (create_cpu_all_reduce_shared_mem,
+                           get_cpu_all_reduce_shared_mem)
+
+
+@patch_to(LLMEngine)
+def __init__(
+    self,
+    vllm_config: VllmConfig,
+    executor_class: type[Executor],
+    log_stats: bool,
+    usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+    stat_loggers: Optional[list[StatLoggerFactory]] = None,
+    mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+    use_cached_outputs: bool = False,
+    multiprocess_mode: bool = False,
+) -> None:
+    """
+    Create an LLMEngine.
+
+    This function is installed on ``LLMEngine`` through ``patch_to``. In
+    addition to building the tokenizer, processors, engine-core client and
+    stat logger manager, it calls ``create_cpu_all_reduce_shared_mem()``
+    when ``envs_br.VLLM_BR_USE_CPU_ALL_REDUCE`` is non-zero.
+
+    Args:
+        vllm_config: global configuration; the model, cache, parallel and
+            observability sub-configs are read from it.
+        executor_class: passed to ``EngineCoreClient.make_client``.
+        log_stats: Whether to log stats; when truthy a
+            ``StatLoggerManager`` is created.
+        usage_context: Not referenced in this body.
+        stat_loggers: Must be ``None``; any other value raises
+            ``NotImplementedError``.
+        mm_registry: Multi-modal registry, passed to the Processor.
+        use_cached_outputs: Not referenced in this body.
+        multiprocess_mode: passed to ``EngineCoreClient.make_client``.
+            When False, a stateless DP group may be created here and
+            ``self.model_executor`` is exposed.
+
+    Raises:
+        ValueError: If ``envs.VLLM_USE_V1`` is falsy.
+        NotImplementedError: If ``stat_loggers`` is not ``None``.
+    """
+    if not envs.VLLM_USE_V1:
+        raise ValueError("Using V1 LLMEngine, but envs.VLLM_USE_V1=False. "
+                         "This should not happen. As a workaround, try using "
+                         "LLMEngine.from_vllm_config(...) or explicitly set "
+                         "VLLM_USE_V1=0 or 1 and report this issue on Github.")
+
+    if stat_loggers is not None:
+        raise NotImplementedError(
+            "Passing StatLoggers to LLMEngine in V1 is not yet supported. "
+            "Set VLLM_USE_V1=0 and file and issue on Github.")
+    # Set up the CPU all-reduce shared memory before the engine core is
+    # created further down.
+    if envs_br.VLLM_BR_USE_CPU_ALL_REDUCE != 0:
+        create_cpu_all_reduce_shared_mem()
+
+    self.vllm_config = vllm_config
+    self.observability_config = vllm_config.observability_config
+    self.model_config = vllm_config.model_config
+    self.cache_config = vllm_config.cache_config
+
+    self.log_stats = log_stats
+
+    executor_backend = (
+        self.vllm_config.parallel_config.distributed_executor_backend)
+    parallel_config = vllm_config.parallel_config
+    # True when data parallelism is in use and the executor backend is
+    # "external_launcher".
+    self.external_launcher_dp = (parallel_config.data_parallel_size > 1
+                                 and executor_backend == "external_launcher")
+    # important: init dp group before init the engine_core
+    # In the decoupled engine case this is handled in EngineCoreProc.
+    if not multiprocess_mode and parallel_config.data_parallel_size > 1 \
+        and not self.external_launcher_dp:
+        self.dp_group = parallel_config.stateless_init_dp_group()
+    else:
+        self.dp_group = None
+    self.should_execute_dummy_batch = False
+
+    if self.model_config.skip_tokenizer_init:
+        self.tokenizer = None
+    else:
+        # Tokenizer (+ ensure liveness if running in another process).
+        self.tokenizer = init_tokenizer_from_configs(
+            model_config=vllm_config.model_config)
+
+    # Processor (convert Inputs --> EngineCoreRequests)
+    self.processor = Processor(vllm_config=vllm_config,
+                               tokenizer=self.tokenizer,
+                               mm_registry=mm_registry)
+
+    # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
+    self.output_processor = OutputProcessor(self.tokenizer,
+                                            log_stats=self.log_stats)
+    # Attach a tracer only when an OTLP traces endpoint is configured.
+    if self.observability_config.otlp_traces_endpoint is not None:
+        tracer = init_tracer("vllm.llm_engine",
+                             self.observability_config.otlp_traces_endpoint)
+        self.output_processor.tracer = tracer
+
+    # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
+    self.engine_core = EngineCoreClient.make_client(
+        multiprocess_mode=multiprocess_mode,
+        asyncio_mode=False,
+        vllm_config=vllm_config,
+        executor_class=executor_class,
+        log_stats=self.log_stats,
+    )
+
+    self.logger_manager: Optional[StatLoggerManager] = None  # type: ignore
+    if self.log_stats:
+        # stat_loggers is always None here (checked above), so only the
+        # default loggers can be enabled.
+        self.logger_manager = StatLoggerManager(
+            vllm_config=vllm_config,
+            custom_stat_loggers=stat_loggers,
+            enable_default_loggers=log_stats,
+        )
+        self.logger_manager.log_engine_initialized()
+
+    if not multiprocess_mode:
+        # for v0 compatibility
+        self.model_executor = self.engine_core.engine_core.model_executor  # type: ignore
+
+    if self.external_launcher_dp:
+        # If we use DP in external launcher mode, we reuse the
+        # existing DP group used for data communication.
+        self.dp_group = get_dp_group().cpu_group
+
+    # Don't keep the dummy data in memory
+    self.reset_mm_cache()
+
+
+@patch_to(LLMEngine)
+def __del__(self):
+    if dp_group := getattr(self, "dp_group",
+                           None) and not self.external_launcher_dp:
+        stateless_destroy_torch_distributed_process_group(dp_group)
+    if get_cpu_all_reduce_shared_mem() is not None:
+        get_cpu_all_reduce_shared_mem()._cleanup()