xc-llm-ascend/vllm_ascend/patch/worker/patch_unquantized_gemm.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import torch
import vllm.model_executor.layers.utils
from vllm.utils.torch_utils import direct_register_custom_op


def unquantized_gemm(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor | None = None,
) -> torch.Tensor:
    return torch.nn.functional.linear(x, weight, bias)


def unquantized_gemm_fake(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor | None = None,
) -> torch.Tensor:
    output_shape = (x.shape[0], weight.shape[0])
    return torch.empty(output_shape, dtype=x.dtype, device=x.device)


# Register ``unquantized_gemm`` as a custom op at import time, pairing the real
# implementation with ``unquantized_gemm_fake`` as its fake implementation.
# No arguments are declared as mutated, and the op is registered under the
# "PrivateUse1" dispatch key.
# NOTE(review): default_unquantized_gemm in this file invokes the op as
# ``torch.ops.vllm.unquantized_gemm`` -- presumably the helper registers into
# the ``vllm`` op namespace; confirm against direct_register_custom_op.
direct_register_custom_op(
    op_name="unquantized_gemm",
    op_func=unquantized_gemm,
    fake_impl=unquantized_gemm_fake,
    mutates_args=[],
    dispatch_key="PrivateUse1",
)


def default_unquantized_gemm(
    layer: torch.nn.Module,
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor | None = None,
) -> torch.Tensor:
    if x.device.type == "npu":
        return torch.ops.vllm.unquantized_gemm(x, weight, bias)
    else:
        return torch.nn.functional.linear(x, weight, bias)


# Monkey-patch: rebind the ``default_unquantized_gemm`` attribute of
# ``vllm.model_executor.layers.utils`` to the device-aware version defined in
# this file. This runs as an import-time side effect of this module.
# NOTE(review): code that bound the original function by name before this
# assignment would presumably keep the old reference -- confirm import order.
vllm.model_executor.layers.utils.default_unquantized_gemm = default_unquantized_gemm
[Fusion] [Graph]Add Matmul Allreduce Rmsnorm fusion Pass (#5034) This PR adds the `MatmulAllreduceRmsnorm` operator and introduces a graph fusion pass for `matmul_allreduce_rmsnorm` operations. The implementation includes a new configuration flag and a pattern-matching pass using `torch._inductor.pattern_matcher`. Co-authored-by: Trunrain [270250579@qq.com](mailto:270250579@qq.com) - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: wxsIcey <1790571317@qq.com> Signed-off-by: tongrunze <t00574058@china.huawei.com> 2026-01-19 09:28:07 +08:00			`#`
			`# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.`
			`# This file is a part of the vllm-ascend project.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`
			`import torch`
			`import vllm.model_executor.layers.utils`
			`from vllm.utils.torch_utils import direct_register_custom_op`


			`def unquantized_gemm(`
			`x: torch.Tensor,`
			`weight: torch.Tensor,`
			`bias: torch.Tensor \| None = None,`
			`) -> torch.Tensor:`
			`return torch.nn.functional.linear(x, weight, bias)`


			`def unquantized_gemm_fake(`
			`x: torch.Tensor,`
			`weight: torch.Tensor,`
			`bias: torch.Tensor \| None = None,`
			`) -> torch.Tensor:`
			`output_shape = (x.shape[0], weight.shape[0])`
			`return torch.empty(output_shape, dtype=x.dtype, device=x.device)`


[Lint]Style: Convert `vllm-ascend/` to ruff format(Batch #10) (#6173) ### What this PR does / why we need it? Scope of Changes: \| File Path \| \| :--- \| \|`vllm_ascend/ops/layer_shard_linear.py`\| \|`vllm_ascend/ops/linear.py`\| \|`vllm_ascend/ops/linear_op.py`\| \|`vllm_ascend/worker/worker.py`\| \| ` vllm_ascend/patch/worker/patch_bert.py` \| \| ` vllm_ascend/patch/worker/patch_deepseek.py` \| \| ` vllm_ascend/patch/worker/patch_distributed.py` \| \| ` vllm_ascend/patch/worker/patch_module.py` \| \| ` vllm_ascend/patch/worker/patch_multimodal_merge.py` \| \| ` vllm_ascend/patch/worker/patch_qwen3_next.py` \| \| ` vllm_ascend/patch/worker/patch_qwen3_next_mtp.py` \| \| ` vllm_ascend/patch/worker/patch_rejection_sampler.py` \| \| ` vllm_ascend/patch/worker/patch_rope.py` \| \| ` vllm_ascend/patch/worker/patch_triton.py` \| \| ` vllm_ascend/patch/worker/patch_unquantized_gemm.py` \| \| ` vllm_ascend/patch/worker/patch_v2_egale.py` \| \|` vllm_ascend/worker/npu_input_batch.py`\| \|` vllm_ascend/worker/v2/aclgraph_utils.py`\| \|` vllm_ascend/worker/v2/attn_utils.py`\| \|` vllm_ascend/worker/v2/model_runner.py`\| \|` vllm_ascend/worker/v2/sample/gumbel.py`\| \|` vllm_ascend/worker/v2/sample/penalties.py`\| \|` vllm_ascend/worker/v2/sample/sampler.py`\| \|` vllm_ascend/worker/v2/spec_decode/__init__.py`\| \|` vllm_ascend/worker/v2/spec_decode/eagle.py`\| \|` vllm_ascend/worker/v2/states.py`\| ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.14.0 - vLLM main: https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60 Signed-off-by: MrZ20 <2609716663@qq.com> Signed-off-by: SILONG ZENG <2609716663@qq.com> Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com> 2026-02-06 15:35:06 +08:00			`direct_register_custom_op(`
			`op_name="unquantized_gemm",`
			`op_func=unquantized_gemm,`
			`fake_impl=unquantized_gemm_fake,`
			`mutates_args=[],`
			`dispatch_key="PrivateUse1",`
			`)`

[Fusion] [Graph]Add Matmul Allreduce Rmsnorm fusion Pass (#5034) This PR adds the `MatmulAllreduceRmsnorm` operator and introduces a graph fusion pass for `matmul_allreduce_rmsnorm` operations. The implementation includes a new configuration flag and a pattern-matching pass using `torch._inductor.pattern_matcher`. Co-authored-by: Trunrain [270250579@qq.com](mailto:270250579@qq.com) - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: wxsIcey <1790571317@qq.com> Signed-off-by: tongrunze <t00574058@china.huawei.com> 2026-01-19 09:28:07 +08:00
			`def default_unquantized_gemm(`
			`layer: torch.nn.Module,`
			`x: torch.Tensor,`
			`weight: torch.Tensor,`
			`bias: torch.Tensor \| None = None,`
			`) -> torch.Tensor:`
			`if x.device.type == "npu":`
			`return torch.ops.vllm.unquantized_gemm(x, weight, bias)`
			`else:`
			`return torch.nn.functional.linear(x, weight, bias)`


			`vllm.model_executor.layers.utils.default_unquantized_gemm = default_unquantized_gemm`