Add patch_qwen3_5 for triton ops fused_recurrent_gated_delta_rule (#7109)
### What this PR does / why we need it? The op `torch_npu.npu_recurrent_gated_delta_rule` currently does not support `ssm_state` inputs in float32 format, so we temporarily retain the Triton-based `_forward_core` implementation for Qwen3_5. --------- Signed-off-by: pppeng <zepengliu912@qq.com> Signed-off-by: pppeng <60355449+ppppeng@users.noreply.github.com>
This commit is contained in:
@@ -17,9 +17,14 @@
|
||||
|
||||
from vllm.triton_utils import HAS_TRITON
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if HAS_TRITON:
|
||||
import vllm_ascend.patch.worker.patch_triton
|
||||
|
||||
if not vllm_version_is("v0.16.0"):
|
||||
import vllm_ascend.patch.worker.patch_qwen3_5 # noqa
|
||||
|
||||
# isort: off
|
||||
import vllm_ascend.patch.platform.patch_sched_yield # noqa
|
||||
import vllm_ascend.patch.worker.patch_unquantized_gemm # noqa
|
||||
|
||||
Reference in New Issue
Block a user