port deepseekv2 and mtp to main branch (#429)

### What this PR does / why we need it? This PR ports all the deepseek graph mode code and mtp code from v0.7.3 to the main branch --------- Signed-off-by: SidaoY <1024863041@qq.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: Yizhou Liu <liuyizhou5@h-partners.com> Signed-off-by: mengwei805 <mengwei25@huawei.com> Signed-off-by: libaokui <libaokui@huawei.com> Signed-off-by: q00832892 <qiaoyang19@huawei.com> Signed-off-by: ganyi <pleaplusone.gy@gmail.com> Co-authored-by: SidaoY <1024863041@qq.com> Co-authored-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: Yizhou Liu <liuyizhou5@h-partners.com> Co-authored-by: mengwei805 <mengwei25@huawei.com> Co-authored-by: libaokui <libaokui@huawei.com>
2025-04-19 17:38:18 +08:00
parent 086423dc35
commit 1a1f9a6d89
33 changed files with 3361 additions and 315 deletions
--- a/examples/disaggregated_prefill_hccl.py
+++ b/examples/disaggregated_prefill_hccl.py
@@ -0,0 +1,128 @@
+"""
+ This file demonstrates the example usage of disaggregated prefilling
+ We will launch 2 vllm instances (NPU 0,1 for prefill and NPU 2,3 for decode),
+ and then transfer the KV cache between them.
+ """
+import multiprocessing as mp
+import os
+import time
+from multiprocessing import Event, Process
+
+
+def clean_up():  # Destroy vLLM parallel state, run GC, empty the NPU cache.
+    import gc
+
+    import torch
+    from vllm.distributed.parallel_state import (
+        destroy_distributed_environment, destroy_model_parallel)
+    destroy_model_parallel()
+    destroy_distributed_environment()
+    gc.collect()
+    torch.npu.empty_cache()
+
+
+def run_prefill(prefill_done, process_close):  # KV producer on NPUs 0 and 1.
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
+
+    from vllm import LLM, SamplingParams
+    from vllm.config import KVTransferConfig
+
+    prompts = [
+        "Hello, how are you today?", "Hi, what is your name?",
+        "Tell me a very long story.", "what is your favourite book?"
+    ]
+    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
+
+    ktc = KVTransferConfig.from_cli(
+        '{"kv_connector":"AscendHcclConnector","kv_buffer_device":"npu","kv_role":"kv_producer", "kv_parallel_size":2}'
+    )
+
+    # Set memory utilization to 0.8 for the two NPUs selected above.
+    # You may need to adjust the value to fit your device.
+    llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+              kv_transfer_config=ktc,
+              max_model_len=2000,
+              gpu_memory_utilization=0.8,
+              tensor_parallel_size=2)
+
+    llm.generate(prompts, sampling_params)
+    print("Prefill node is finished.")
+    prefill_done.set()  # unblocks run_decode, which waits on this event
+
+    # To keep the prefill node running in case the decode node is not done;
+    # otherwise, the script might exit prematurely, causing incomplete decoding.
+    try:
+        while not process_close.is_set():  # set by the parent after decode ends
+            time.sleep(1)
+    except KeyboardInterrupt:
+        print("Script stopped by user.")
+    finally:
+        print("Cleanup prefill resources")
+        del llm
+        clean_up()
+
+
+def run_decode(prefill_done):  # KV consumer on NPUs 2 and 3.
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "2,3"
+
+    from vllm import LLM, SamplingParams
+    from vllm.config import KVTransferConfig
+
+    prompts = [
+        "Hello, how are you today?", "Hi, what is your name?",
+        "Tell me a very long story.", "what is your favourite book?"
+    ]
+    sampling_params = SamplingParams(temperature=0, top_p=0.95)
+
+    ktc = KVTransferConfig.from_cli(
+        '{"kv_connector":"AscendHcclConnector","kv_buffer_device":"npu","kv_role":"kv_consumer","kv_parallel_size":2}'
+    )
+
+    llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+              kv_transfer_config=ktc,
+              max_model_len=2000,
+              gpu_memory_utilization=0.8,
+              tensor_parallel_size=2)
+
+    # Wait until the producer (prefill node) signals that it has finished.
+    print("Waiting for prefill node to finish...")
+    prefill_done.wait()
+
+    # At this point when the prefill_done is set, the kv-cache should have been
+    # transferred to this decode node, so we can start decoding.
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    del llm
+    clean_up()
+
+
+if __name__ == "__main__":
+    mp.set_start_method('spawn')  # applies to the Event/Process created below
+
+    prefill_done = Event()
+    process_close = Event()
+    prefill_process = Process(target=run_prefill,
+                              args=(
+                                  prefill_done,
+                                  process_close,
+                              ))
+    decode_process = Process(target=run_decode, args=(prefill_done, ))
+
+    # Start prefill node
+    prefill_process.start()
+
+    # Start decode node
+    decode_process.start()
+
+    # Wait for the decode node to finish
+    decode_process.join()
+
+    # Tell the prefill node to clean up and exit, then wait for it
+    process_close.set()
+    prefill_process.join()
+    prefill_process.terminate()  # process has already exited; kept as a safeguard
+    print("All process done!")
--- a/examples/dp_offline/data_parallel.py
+++ b/examples/dp_offline/data_parallel.py
@@ -0,0 +1,86 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm-project/vllm/examples/offline_inference/data_parallel.py
+# SPDX-License-Identifier: Apache-2.0
+# usage:
+# cd examples/dp_offline && bash run_dp.sh  (launches this file via torchrun)
+# we need to have a launcher to create multiple data parallel
+# ranks. And each rank will create a vLLM instance to process its own prompts.
+
+import gc
+import os
+
+VLLM_ENABLE_GRAPH_MODE = os.environ.get("VLLM_ENABLE_GRAPH_MODE") == "1"
+
+
+def main():  # One DP rank; reads RANK, LOCAL_RANK, WORLD_SIZE, MASTER_* env.
+    dp_rank = int(os.environ['RANK'])
+    local_rank = int(os.environ['LOCAL_RANK'])
+    dp_size = int(os.environ['WORLD_SIZE'])
+    master_addr = os.environ['MASTER_ADDR']
+    master_port = os.environ['MASTER_PORT']
+    tp_size = 4  # each rank claims tp_size consecutive NPUs (see below)
+    etp_size = 2
+
+    os.environ["VLLM_DP_RANK"] = str(dp_rank)
+    os.environ["VLLM_DP_SIZE"] = str(dp_size)
+    os.environ["VLLM_DP_MASTER_IP"] = master_addr
+    os.environ["VLLM_DP_MASTER_PORT"] = master_port
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = ",".join(
+        str(i)
+        for i in range(local_rank * tp_size, (local_rank + 1) * tp_size))
+
+    import torch
+    import torch_npu  # noqa
+    from vllm import LLM, SamplingParams
+    from vllm.distributed.parallel_state import (
+        destroy_distributed_environment, destroy_model_parallel)
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ] * 4
+
+    prompts_per_rank = len(prompts) // dp_size
+    start = dp_rank * prompts_per_rank
+    end = start + prompts_per_rank
+    prompts = prompts[start:end]
+    if not prompts:  # this rank got no share of the prompts
+        prompts = ["Placeholder"]
+    print(f"DP rank {dp_rank} needs to process {len(prompts)} prompts")
+    num_seqs = len(prompts)
+
+    sampling_params = SamplingParams(temperature=0.8,
+                                     top_p=0.95,
+                                     max_tokens=4,
+                                     min_tokens=4)
+    # Create an LLM.
+    llm = LLM(
+        model="deepseek-ai/DeepSeek-V2-Lite-Chat",
+        tensor_parallel_size=tp_size,
+        trust_remote_code=True,
+        expert_tensor_parallel_size=etp_size,
+        max_model_len=4096,
+        max_num_seqs=num_seqs,
+        compilation_config=1 if VLLM_ENABLE_GRAPH_MODE else 0,
+    )
+
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"DP rank {dp_rank}, Prompt: {prompt!r}, "
+              f"Generated text: {generated_text!r}")
+
+    del llm
+    destroy_model_parallel()
+    destroy_distributed_environment()
+    gc.collect()
+    torch.npu.empty_cache()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/dp_offline/run_dp.sh
+++ b/examples/dp_offline/run_dp.sh
@@ -0,0 +1,21 @@
+export HCCL_IF_IP=${local_ip}  # local_ip is not defined here; set it first
+export GLOO_SOCKET_IFNAME=${ifname}  # ifname is not defined here; set it first
+export TP_SOCKET_IFNAME=${ifname}
+export HCCL_SOCKET_IFNAME=${ifname}
+
+# dp_size = node_size * dp_per_node
+node_size=1
+node_rank=0
+dp_per_node=2
+master_addr=127.0.0.1
+master_port=12345
+
+rm -rf ./.torchair_cache/  # remove leftovers in the working directory
+rm -rf ./dynamo_*
+rm -rf /root/ascend/log/debug/plog/*
+export VLLM_ENABLE_GRAPH_MODE=0  # data_parallel.py treats only "1" as enabled
+export VLLM_ENABLE_MC2=0
+
+torchrun --nproc_per_node ${dp_per_node} --nnodes ${node_size} \
+    --node_rank ${node_rank} --master_addr ${master_addr} --master_port ${master_port} \
+    data_parallel.py
--- a/examples/offline_inference_npu_v1.py
+++ b/examples/offline_inference_npu_v1.py
@@ -0,0 +1,49 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+
+from vllm import LLM, SamplingParams
+
+os.environ["VLLM_USE_V1"] = "1"  # NOTE(review): set after `from vllm import` — confirm it takes effect
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+if __name__ == "__main__":
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Sampling params: temperature 0.0 and at most 100 tokens per prompt.
+    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+    # Create an LLM from a local checkpoint path; change it to match your setup.
+    llm = LLM(model="/data/weights/deepseek-ai/deepseekv3-lite-base-latest",
+              tensor_parallel_size=2,
+              enforce_eager=True,
+              trust_remote_code=True,
+              max_model_len=1024)
+
+    # Generate texts from the prompts.
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")