diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index 213793d2..b665e5fd 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -93,6 +93,7 @@ jobs: pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py pytest -sv tests/e2e/singlecard/test_aclgraph_accuracy.py pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py + pytest -sv tests/e2e/singlecard/test_async_scheduling.py pytest -sv tests/e2e/singlecard/test_camem.py pytest -sv tests/e2e/singlecard/test_guided_decoding.py # torch 2.8 doesn't work with lora, fix me diff --git a/tests/e2e/singlecard/test_async_scheduling.py b/tests/e2e/singlecard/test_async_scheduling.py index 4f4eb05f..aab24911 100644 --- a/tests/e2e/singlecard/test_async_scheduling.py +++ b/tests/e2e/singlecard/test_async_scheduling.py @@ -17,8 +17,12 @@ MTP_MODEL = "wemaster/deepseek_mtp_main_random_bf16" first_prompt = ("The following numbers of the sequence " + ", ".join(str(i) for i in range(10)) + " are:") -example_prompts = [first_prompt, "In one word, the capital of France is " - ] + [f"Tell me about the number {i}: " for i in range(32)] +example_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] default_params = dict( temperature=0.0, # greedy diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 20f9badf..02a5acae 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1875,8 +1875,10 @@ class NPUModelRunner(GPUModelRunner): return AsyncGPUModelRunnerOutput( model_runner_output=model_runner_output, sampled_token_ids=sampled_token_ids, + logprobs_tensors=sampler_output.logprobs_tensors, invalid_req_indices=invalid_req_indices, async_output_copy_stream=self.async_output_copy_stream, + vocab_size=self.input_batch.vocab_size, ) def _build_dummy_attn_metadata( @@ -3472,7 +3474,7 @@ def _torch_cuda_wrapper(): try: # replace cuda APIs with xpu APIs, this should work by default - torch.cuda.Event = _EventPlaceholder + torch.cuda.Event = torch.npu.Event torch.cuda.Stream = torch.npu.Stream torch.cuda.default_stream = torch.npu.default_stream torch.cuda.current_stream = torch.npu.current_stream