Revert "[Feat] Unquantized linear nz support (#2619)" (#2896)

### What this PR does / why we need it?
This reverts commit 7b2ecc1e9a.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

- vLLM version: main
- vLLM main:
64d90c3e4f

Closes: https://github.com/vllm-project/vllm-ascend/issues/2890
Closes: https://github.com/vllm-project/vllm-ascend/issues/2887
Closes: https://github.com/vllm-project/vllm-ascend/issues/2885

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
This commit is contained in:
Yikun Jiang
2025-09-12 20:51:12 +08:00
committed by GitHub
parent fc2bcbe21c
commit 756b8a1946
4 changed files with 10 additions and 111 deletions

View File

@@ -36,36 +36,12 @@ from vllm.model_executor.utils import set_weight_attrs
from vllm_ascend.distributed.parallel_state import (get_mlp_tp_group,
get_otp_group)
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, dense_optim_enable,
matmul_allreduce_enable, mlp_tp_enable,
oproj_tp_enable)
from vllm_ascend.utils import (dense_optim_enable, matmul_allreduce_enable,
mlp_tp_enable, oproj_tp_enable)
_HCOMM_INFO = None
class AscendUnquantizedLinearMethod(UnquantizedLinearMethod):
"""Linear method without quantization."""
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
super().process_weights_after_loading(layer)
if torch.version.cann.startswith("8.3"):
layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
layer.weight.data = torch_npu.npu_format_cast(
layer.weight.data, ACL_FORMAT_FRACTAL_NZ)
def apply(self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
if torch.version.cann.startswith("8.3"):
if bias is None:
return torch.matmul(x, layer.weight)
else:
return torch.matmul(x, layer.weight) + bias
else:
return torch.nn.functional.linear(x, layer.weight, bias)
class AscendColumnParallelLinear(ColumnParallelLinear):
"""Linear layer with column parallelism.
@@ -642,7 +618,7 @@ class AscendLinearBase(LinearBase):
self.prefix = prefix
if quant_config is None:
self.quant_method: Optional[
QuantizeMethodBase] = AscendUnquantizedLinearMethod()
QuantizeMethodBase] = UnquantizedLinearMethod()
else:
self.quant_method = quant_config.get_quant_method(self,
prefix=prefix)

View File

@@ -23,7 +23,8 @@ from vllm.distributed import get_tensor_model_parallel_rank
from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
FusedMoeWeightScaleSupported)
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
RowParallelLinear)
RowParallelLinear,
UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization import \
register_quantization_config
from vllm.model_executor.layers.quantization.base_config import (
@@ -37,7 +38,6 @@ from vllm.model_executor.utils import set_weight_attrs
from vllm_ascend.distributed.parallel_state import (get_mlp_tp_group,
get_otp_group)
from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, mlp_tp_enable,
oproj_tp_enable)
@@ -95,7 +95,7 @@ class AscendQuantConfig(QuantizationConfig):
if isinstance(layer, LinearBase):
if self.is_layer_skipped_ascend(prefix,
self.packed_modules_mapping):
return AscendUnquantizedLinearMethod()
return UnquantizedLinearMethod()
return AscendLinearMethod(self, prefix,
self.packed_modules_mapping)
elif isinstance(layer, Attention) and \