[Bugfix] Follow vLLM Qwen-Moe/VL and KV Connector change to fix broken CI (#2181)
### What this PR does / why we need it? This pr fix broken CI: 1. Fix theee2eb6ecd8changes, in this commit, they fused the gate and up projections in the vision MLP, This can improve performance by reducing one matrix multiplication. so, this pr do the following things: - Specify that the two linear layers are fused as `mlp.gate_up_proj` when loading the weights. - Use a SiluAndMul activation function. 2. Fixaefeea0fde, Update ModelRunnerOutput parameters to adapt to its changes 3. Fix [vllm-commit](https://github.com/vllm-project/vllm/pull/20815/files#diff-3ffb829a39ab2b3e4706aa28f5e476815f36c3a87b98d6a66514ebedc8f3ffb4R354-R356), fix qwen moe ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.10.0 - vLLM main:fed5849d3f--------- Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -25,6 +25,7 @@ from tests.ut.kv_connector.utils import (assert_scheduler_empty,
|
||||
create_model_runner_output,
|
||||
create_request, create_scheduler,
|
||||
create_vllm_config)
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
def test_basic_lifecycle():
|
||||
@@ -102,7 +103,13 @@ def test_basic_lifecycle():
|
||||
|
||||
# (3b): execute_model()
|
||||
model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
|
||||
model_runner_output.finished_sending = [request_id]
|
||||
if vllm_version_is("0.10.0"):
|
||||
model_runner_output.finished_sending = [request_id]
|
||||
else:
|
||||
from vllm.v1.worker.kv_connector_model_runner_mixin import \
|
||||
KVConnectorOutput # type: ignore # noqa
|
||||
model_runner_output.kv_connector_output = KVConnectorOutput(
|
||||
finished_sending=[request_id])
|
||||
|
||||
# (3c): update_from_output()
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
@@ -157,7 +164,13 @@ def test_prefix_cache_lifecycle():
|
||||
scheduler_output = scheduler.schedule()
|
||||
scheduler.schedule()
|
||||
model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
|
||||
model_runner_output.finished_sending = [request_remote.request_id]
|
||||
if vllm_version_is("0.10.0"):
|
||||
model_runner_output.finished_sending = [request_remote.request_id]
|
||||
else:
|
||||
from vllm.v1.worker.kv_connector_model_runner_mixin import \
|
||||
KVConnectorOutput # noqa
|
||||
model_runner_output.kv_connector_output = KVConnectorOutput(
|
||||
finished_sending=[request_remote.request_id])
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
_ = scheduler.schedule()
|
||||
assert_scheduler_empty(scheduler)
|
||||
|
||||
@@ -19,7 +19,7 @@
|
||||
import copy
|
||||
|
||||
from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT
|
||||
from vllm.v1.request import FinishReason, RequestStatus
|
||||
from vllm.v1.request import RequestStatus
|
||||
|
||||
from tests.ut.kv_connector.utils import (assert_scheduler_empty,
|
||||
create_model_runner_output,
|
||||
@@ -55,10 +55,7 @@ def test_basic_lifecycle():
|
||||
# Nothing running and empty scheduler output.
|
||||
assert len(scheduler.running) == 0
|
||||
assert len(scheduler_output.scheduled_new_reqs) == 0
|
||||
if vllm_version_is("0.9.1"):
|
||||
assert len(scheduler_output.scheduled_cached_reqs) == 0
|
||||
else:
|
||||
assert scheduler_output.scheduled_cached_reqs.num_reqs == 0
|
||||
assert scheduler_output.scheduled_cached_reqs.num_reqs == 0
|
||||
assert len(scheduler_output.num_scheduled_tokens) == 0
|
||||
assert scheduler_output.total_num_scheduled_tokens == 0
|
||||
|
||||
@@ -94,7 +91,13 @@ def test_basic_lifecycle():
|
||||
|
||||
# (2b): forward(): request finishes recv.
|
||||
model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
|
||||
model_runner_output.finished_recving = [request_id]
|
||||
if vllm_version_is("0.10.0"):
|
||||
model_runner_output.finished_recving = [request_id]
|
||||
else:
|
||||
from vllm.v1.worker.kv_connector_model_runner_mixin import \
|
||||
KVConnectorOutput # type: ignore # noqa
|
||||
model_runner_output.kv_connector_output = KVConnectorOutput(
|
||||
finished_recving=[request_id])
|
||||
|
||||
# (2c): update_from_output():
|
||||
engine_core_outputs = scheduler.update_from_output(scheduler_output,
|
||||
@@ -135,11 +138,6 @@ def test_basic_lifecycle():
|
||||
model_runner_output)
|
||||
scheduler.schedule()
|
||||
|
||||
if vllm_version_is("0.9.1"):
|
||||
outputs = engine_core_outputs[0].outputs
|
||||
assert len(outputs) == 1
|
||||
output = outputs[0]
|
||||
assert output.finish_reason == FinishReason.STOP
|
||||
assert_scheduler_empty(scheduler)
|
||||
|
||||
|
||||
@@ -213,7 +211,13 @@ def test_full_block_prompt():
|
||||
# # STEP (2): Recv.
|
||||
scheduler_output = scheduler.schedule()
|
||||
model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
|
||||
model_runner_output.finished_recving = [request_id]
|
||||
if vllm_version_is("0.10.0"):
|
||||
model_runner_output.finished_recving = [request_id]
|
||||
else:
|
||||
from vllm.v1.worker.kv_connector_model_runner_mixin import \
|
||||
KVConnectorOutput # type: ignore # noqa
|
||||
model_runner_output.kv_connector_output = KVConnectorOutput(
|
||||
finished_recving=[request_id])
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
assert len(scheduler.waiting) == 1
|
||||
assert (request_id in scheduler.finished_recving_kv_req_ids)
|
||||
@@ -236,13 +240,6 @@ def test_full_block_prompt():
|
||||
# # Step (4): Hit EOS.
|
||||
scheduler_output = scheduler.schedule()
|
||||
model_runner_output = create_model_runner_output([request], use_eos=True)
|
||||
engine_core_outputs = scheduler.update_from_output(scheduler_output,
|
||||
model_runner_output)
|
||||
scheduler.schedule()
|
||||
|
||||
if vllm_version_is("0.9.1"):
|
||||
outputs = engine_core_outputs[0].outputs
|
||||
assert len(outputs) == 1
|
||||
output = outputs[0]
|
||||
assert output.finish_reason == FinishReason.STOP
|
||||
assert_scheduler_empty(scheduler)
|
||||
|
||||
@@ -186,6 +186,20 @@ def create_model_runner_output(
|
||||
sampled_token_ids = [[sampled_token] for _ in req_ids]
|
||||
|
||||
# Make output data structure.
|
||||
extra_args = {}
|
||||
if not vllm_version_is("0.10.0"):
|
||||
from vllm.v1.worker.kv_connector_model_runner_mixin import \
|
||||
KVConnectorOutput # type: ignore # noqa
|
||||
kv_connector_output = KVConnectorOutput(
|
||||
finished_sending=finished_sending,
|
||||
finished_recving=finished_recving)
|
||||
extra_args = {"kv_connector_output": kv_connector_output}
|
||||
else:
|
||||
extra_args = {
|
||||
"finished_sending": finished_sending,
|
||||
"finished_recving": finished_recving,
|
||||
}
|
||||
|
||||
return ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_id_to_index,
|
||||
@@ -193,9 +207,6 @@ def create_model_runner_output(
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
**({
|
||||
"pooler_output": []
|
||||
} if not vllm_version_is("0.9.1") else {}),
|
||||
finished_sending=finished_sending,
|
||||
finished_recving=finished_recving,
|
||||
pooler_output=[],
|
||||
**extra_args,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user