[Ops] Fix bug in register_custom_ops without forward_context (#2883)

### What this PR does / why we need it?
This PR fixes a bug in register_custom_ops that occurs when it runs without
a forward_context. We add a try-except to handle this situation.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with newly added and existing tests.

- vLLM version: main
- vLLM main:
7920de0a2a

Signed-off-by: rjg-lyh <1318825571@qq.com>
This commit is contained in:
rjg-lyh
2025-09-12 16:58:08 +08:00
committed by GitHub
parent 6d8bc38c7b
commit fc2bcbe21c
2 changed files with 54 additions and 18 deletions

View File

@@ -139,11 +139,13 @@ env_variables: Dict[str, Callable[[], Any]] = {
"VLLM_ASCEND_ENABLE_PREFETCH_MLP":
lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_PREFETCH_MLP", '0'))),
# buffer size for gate up prefetch
"MLP_GATE_UP_PREFETCH_SIZE":
lambda: int(os.getenv("MLP_GATE_UP_PREFETCH_SIZE", 18 * 1024 * 1024)),
"VLLM_ASCEND_MLP_GATE_UP_PREFETCH_SIZE":
lambda: int(
os.getenv("VLLM_ASCEND_MLP_GATE_UP_PREFETCH_SIZE", 18 * 1024 * 1024)),
# buffer size for down proj prefetch
"MLP_DOWN_PREFETCH_SIZE":
lambda: int(os.getenv("MLP_DOWN_PREFETCH_SIZE", 18 * 1024 * 1024)),
"VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE":
lambda: int(
os.getenv("VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE", 18 * 1024 * 1024)),
# Whether to enable dense model and general optimizations for better performance.
# Since we modified the base parent class `linear`, this optimization is also applicable to other model types.
# However, there might be hidden issues, and it is currently recommended to prioritize its use with dense models.