[Perf] Deepseekv3 performance optimization for eager mode (#598)

### What this PR does / why we need it? Deepseek v3 currently adopts vanilla chunked prefill in the MLA part, which is computationally inefficient but necessary for chunked prefill. Since PR https://github.com/vllm-project/vllm-ascend/pull/543 brought the v0 scheduler into vllm-ascend, we can now adopt torch_npu._npu_flash_attention inside the MLA backend for a further performance boost. There was also some redundant computation inside the rope, which this PR removes. This PR should bring some performance gain for Deepseek eager mode inference. --------- Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-04-29 17:12:03 +08:00
parent 87975fa058
commit 0329fad927
4 changed files with 180 additions and 102 deletions
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -31,15 +31,12 @@ try:
    # register custom ops into torch_library here
    import vllm_ascend.vllm_ascend_C  # type: ignore  # noqa: F401

-except ImportError as e:
-    if not str(
-            e
-    ) == "dynamic module does not define module export function (PyInit_vllm_ascend_C)":
-        logging.warning(
-            "Warning: Failed to register custom ops, all custom ops will be disabled"
-        )
-    else:
-        CUSTOM_OP_ENABLED = True
+except ImportError:
+    logging.warning(
+        "Warning: Failed to register custom ops, all custom ops will be disabled"
+    )
+else:
+    CUSTOM_OP_ENABLED = True

 if TYPE_CHECKING:
    from vllm.config import ModelConfig, VllmConfig
@@ -180,9 +177,10 @@ class NPUPlatform(Platform):
        if envs.VLLM_USE_V1:
            # Activate custom ops for v1.
            vllm_config.compilation_config.custom_ops = ["all"]
-            additional_config = vllm_config.additional_config
            # If ascend_scheduler_config exists in additional_config,
            # extents original scheduler_config to use AscendScheduler.
+
+            additional_config = vllm_config.additional_config
            if additional_config and additional_config.get(
                    "ascend_scheduler_config", None) is not None:
                additional_scheduler_config = additional_config.get(