xc-llm-ascend/examples/offline_inference_npu_long_seq.py

import os
import time
import argparse

from vllm import LLM, SamplingParams

# Process-wide settings for this example, applied at import time so they are
# already present in os.environ when the __main__ section below runs.
# NOTE(review): these are assigned after `from vllm import ...`; confirm that
# vLLM / vllm-ascend read them lazily rather than at import time.
os.environ.update({
    "VLLM_USE_MODELSCOPE": "True",
    "VLLM_ASCEND_ENABLE_CONTEXT_PARALLEL": "1",
    "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
})

def main():
    """Run the long-sequence offline inference example and print timings.

    Parses the command-line options, builds an ``LLM`` whose tensor-parallel,
    prefill-context-parallel and decode-context-parallel sizes come from
    ``--tp``, ``--pcp`` and ``--dcp``, runs ``llm.generate`` ``--iter_times``
    times over four fixed prompts, then prints an averaged latency figure and
    the generated text of the last iteration.

    ``--input_len`` is accepted but not read anywhere in this script, and
    ``--bs`` is only used as a divisor for the reported latency; neither one
    changes the prompts that are submitted.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--input_len', type=int, default=1024)
    parser.add_argument('--output_len', type=int, default=128)
    parser.add_argument('--bs', type=int, default=1)
    parser.add_argument('--model_path', type=str, default="deepseek-ai/DeepSeek-V2-Lite")
    parser.add_argument('--tp', type=int, default=2)
    parser.add_argument('--pcp', type=int, default=2)
    parser.add_argument('--dcp', type=int, default=1)
    parser.add_argument('--iter_times', type=int, default=1)

    args = parser.parse_args()

    # Both values are divisors of the latency figure, and iter_times also
    # bounds the generation loop. Reject unusable values here, before the
    # model is loaded, instead of failing afterwards with ZeroDivisionError
    # or an unbound `outputs`.
    if args.iter_times < 1:
        parser.error("--iter_times must be >= 1")
    if args.bs < 1:
        parser.error("--bs must be >= 1")

    prompts = [
        "The capital of France is",
        "Hello, my name is Tom, I am",
        "The president of United States is",
        "AI future is",
    ]

    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=args.output_len,
    )
    llm = LLM(
        model=args.model_path,
        trust_remote_code=True,
        enforce_eager=True,
        tensor_parallel_size=args.tp,
        prefill_context_parallel_size=args.pcp,
        decode_context_parallel_size=args.dcp,
        enable_prefix_caching=False,
        enable_expert_parallel=True,
        enable_chunked_prefill=False,
        max_num_batched_tokens=2048,
        max_model_len=1024,
        additional_config={"ascend_scheduler_config": {"enabled": False}},
        max_num_seqs=1,
        block_size=128,
        gpu_memory_utilization=0.9,
    )

    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    for _ in range(args.iter_times):
        outputs = llm.generate(prompts, sampling_params)
    t1 = time.perf_counter()

    # NOTE: the timer spans the complete generate() calls, so the value
    # printed under the "TTFT" label is total wall time divided by
    # iter_times * bs.
    elapsed_ms = (t1 - t0) * 1000
    print(f"TTFT: {elapsed_ms / (args.iter_times * args.bs)} ms")

    # Only the outputs of the final iteration are still bound here.
    for i, output in enumerate(outputs):
        generated_text = output.outputs[0].text
        print(f"req_num: {i}\nGenerated text: {generated_text!r}")


if __name__ == "__main__":
    main()
support cp&dcp (#3260)

### What this PR does / why we need it?
This PR adds the Prefill Context Parallelism (PCP) feature, which corresponds to DCP. For specific implementation details, please refer to the RFC https://github.com/vllm-project/vllm/issues/25749.

TL;DR: PCP enhances long-sequence inference capabilities by partitioning the sequence dimension during the prefill stage.

### Does this PR introduce _any_ user-facing change?
The current implementation primarily includes the following changes:
- Modified ModelRunner.py to add the CP partitioning logic for tokens;
- Modified attention_v1.py and mla_v1.py to adapt the GQA/MLA backends to PCP;
- Modified block_tables.py to extend the KV cache storage based on DCP & PCP;
- Added the necessary command-line arguments to control parallelism for PCP.

### How was this patch tested?
- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: LookAround <lixushi@huawei.com>
Signed-off-by: chenjie <chenjie137@huawei.com>
Signed-off-by: Delphine-Nic <tanwenqin@huawei.com>
Signed-off-by: zhangsicheng5 <zhangsicheng5@huawei.com>
Signed-off-by: Feng Liu <liufeng248@huawei.com>
Signed-off-by: gaojc <1055866782@qq.com>
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
Signed-off-by: z50049692 <zhangmingwei11@huawei.com>
Co-authored-by: chenjie <chenjie137@huawei.com>
Co-authored-by: Delphine-Nic <tanwenqin@huawei.com>
Co-authored-by: zhangsicheng5 <zhangsicheng5@huawei.com>
Co-authored-by: Feng Liu <liufeng248@huawei.com>
Co-authored-by: gaojc <1055866782@qq.com>
Co-authored-by: weiguihua2 <weiguihua2@huawei.com>
Co-authored-by: z50049692 <zhangmingwei11@huawei.com>
Co-authored-by: w00896881 <wangzixuan40@huawei.com>

2025-10-24 10:32:01 +08:00			`import os`
			`import time`
			`import argparse`

			`from vllm import LLM, SamplingParams`

			`os.environ["VLLM_USE_MODELSCOPE"] = "True"`
			`os.environ["VLLM_ASCEND_ENABLE_CONTEXT_PARALLEL"] = "1"`
			`os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"`

			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser()`

			`parser.add_argument('--input_len', type=int, default=1024)`
			`parser.add_argument('--output_len', type=int, default=128)`
			`parser.add_argument('--bs', type=int, default=1)`
			`parser.add_argument('--model_path', type=str, default="deepseek-ai/DeepSeek-V2-Lite")`
			`parser.add_argument('--tp', type=int, default=2)`
			`parser.add_argument('--pcp', type=int, default=2)`
			`parser.add_argument('--dcp', type=int, default=1)`
			`parser.add_argument('--iter_times', type=int, default=1)`

			`args = parser.parse_args()`

			`prompts = [`
			`"The capital of France is",`
			`"Hello, my name is Tom, I am",`
			`"The president of United States is",`
			`"AI future is"`
			`]`

			`sampling_params = SamplingParams(temperature = 0.8, top_p = 0.95, max_tokens=args.output_len)`
			`llm = LLM(`
			`model=args.model_path,`
			`trust_remote_code=True,`
			`enforce_eager=True,`
			`tensor_parallel_size=args.tp,`
			`prefill_context_parallel_size=args.pcp,`
			`decode_context_parallel_size=args.dcp,`
			`enable_prefix_caching=False,`
			`enable_expert_parallel=True,`
			`enable_chunked_prefill=False,`
			`max_num_batched_tokens=2048,`
			`max_model_len=1024,`
			`additional_config={"ascend_scheduler_config": {"enabled": False}},`
			`max_num_seqs=1,`
			`block_size=128,`
			`gpu_memory_utilization=0.9`
			`)`

			`t0 = time.time()`
			`for _ in range(args.iter_times):`
			`outputs = llm.generate(prompts, sampling_params)`
			`t1 = time.time()`
			`print(f"TTFT: {(t1 - t0) * 1000 / (args.iter_times * args.bs)} ms")`

			`for i, output in enumerate(outputs):`
			`prompt = output.prompt`
			`generated_text = output.outputs[0].text`
			`print(f"req_num: {i}\nGenerated text: {generated_text!r}")`