init

2026-01-09 13:34:11 +08:00
parent dfa6476b58
commit b2ef04d792
538 changed files with 105693 additions and 2 deletions
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -0,0 +1,91 @@
+from typing import List, Set, Tuple
+
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.utils import make_async
+
+logger = init_logger(__name__)
+
+
+class NeuronExecutor(ExecutorBase):
+
+    def _init_executor(self) -> None:
+        assert (self.lora_config is
+                None), "LoRA is not supported for Neuron backend."
+        assert (not self.speculative_config
+                ), "Speculative decoding not yet supported for Neuron backend."
+
+        # Instantiate the worker and load the model to the device.
+        self._init_worker()
+
+    def _init_worker(self):
+        from vllm.worker.neuron_worker import NeuronWorker
+
+        self.driver_worker = NeuronWorker(
+            self.model_config,
+            self.parallel_config,
+            self.scheduler_config,
+            self.device_config,
+            self.cache_config,
+        )
+        self.driver_worker.init_device()
+        self.driver_worker.load_model()
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available KV blocks by invoking the
+        underlying worker.
+        """
+        return self.driver_worker.determine_num_available_blocks()
+
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """Initialize the KV cache by invoking the underlying worker.
+        """
+        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+
+    def execute_model(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        assert (execute_model_req.blocks_to_swap_in == {}
+                and execute_model_req.blocks_to_swap_out == {}
+                and execute_model_req.blocks_to_copy == {}), (
+                    "Cache operations are not supported for Neuron backend.")
+        assert execute_model_req.num_lookahead_slots == 0, (
+            "lookahead not supported for Neuron backend.")
+
+        output = self.driver_worker.execute_model(
+            execute_model_req.seq_group_metadata_list)
+        return output
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        return self.driver_worker.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        return self.driver_worker.remove_lora(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        return self.driver_worker.list_loras()
+
+    def check_health(self) -> None:
+        # NeuronExecutor will always be healthy as long as
+        # it's running.
+        return
+
+
+class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+        self,
+        execute_model_req: ExecuteModelRequest,
+    ) -> List[SamplerOutput]:
+        output = await make_async(
+            self.driver_worker.execute_model
+        )(seq_group_metadata_list=execute_model_req.seq_group_metadata_list, )
+        return output
+
+    async def check_health_async(self) -> None:
+        # NeuronExecutor will always be healthy as long as
+        # it's running.
+        return