[Performance] Disable JIT and nd2nz to improve performance for Atlas 300I series (#1591)
### What this PR does / why we need it? Since running on Atlas 300I Duo was initially supported after #1333 , this PR disables the JIT compiler for the 310P and changes the data format to NZ for the weight in the vocabulary embedding and QKV projection layers, which helps improve performance. See #1563 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Tested manually: https://github.com/vllm-project/vllm-ascend/pull/1591#issuecomment-3028352339 Signed-off-by: Vincent Yuan <farawayboat@gmail.com>
This commit is contained in:
@@ -89,6 +89,7 @@ if TYPE_CHECKING:
|
||||
else:
|
||||
xgr = LazyLoader("xgr", globals(), "xgrammar")
|
||||
|
||||
import torch_npu
|
||||
import vllm.envs as envs_vllm
|
||||
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
@@ -96,6 +97,9 @@ import vllm_ascend.envs as envs_ascend
|
||||
if vllm_version_is("0.9.1"):
|
||||
from vllm.v1.spec_decode.utils import is_spec_decode_supported
|
||||
|
||||
if is_310p():
|
||||
torch_npu.npu.set_compile_mode(jit_compile=False)
|
||||
|
||||
|
||||
@dataclass
|
||||
class GraphCaptureContext:
|
||||
@@ -2007,6 +2011,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
with DeviceMemoryProfiler() as m: # noqa: SIM117
|
||||
self.model = get_model(vllm_config=self.vllm_config)
|
||||
|
||||
if is_310p():
|
||||
from vllm.model_executor.layers.linear import (
|
||||
MergedColumnParallelLinear, QKVParallelLinear,
|
||||
RowParallelLinear)
|
||||
for module in self.model.modules():
|
||||
if isinstance(module,
|
||||
(MergedColumnParallelLinear,
|
||||
QKVParallelLinear, RowParallelLinear)):
|
||||
module.weight.data = torch_npu.npu_format_cast(
|
||||
module.weight.data, ACL_FORMAT_FRACTAL_NZ)
|
||||
|
||||
try:
|
||||
# For version compatibility, remove this after we abort vllm v0.9.1 support
|
||||
from vllm.model_executor.models.interfaces import \
|
||||
|
||||
Reference in New Issue
Block a user