2025-07-28 14:06:20 +08:00
|
|
|
import math
|
|
|
|
|
from contextlib import contextmanager
|
|
|
|
|
from enum import Enum
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python executes the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)` at import time, it
eagerly evaluates the annotation expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
from typing import Any
|
2025-07-28 14:06:20 +08:00
|
|
|
|
|
|
|
|
import torch
|
2026-03-13 09:11:46 +08:00
|
|
|
import vllm.envs as envs_vllm
|
2025-08-20 09:01:04 +08:00
|
|
|
from vllm.config import CUDAGraphMode, VllmConfig
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
from vllm.distributed import get_dp_group, get_ep_group, get_tensor_model_parallel_world_size
|
|
|
|
|
from vllm.forward_context import BatchDescriptor, get_forward_context, set_forward_context
|
2025-07-28 14:06:20 +08:00
|
|
|
|
2025-08-14 09:33:39 +08:00
|
|
|
import vllm_ascend.envs as envs_ascend
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
from vllm_ascend.utils import (
|
|
|
|
|
AscendDeviceType,
|
2026-03-01 20:22:50 +08:00
|
|
|
enable_sp,
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
flashcomm2_enable,
|
|
|
|
|
get_ascend_device_type,
|
|
|
|
|
has_layer_idx,
|
|
|
|
|
is_drafter_moe_model,
|
|
|
|
|
is_moe_model,
|
|
|
|
|
speculative_enable_dispatch_gmm_combine_decode,
|
2026-03-23 16:25:57 +08:00
|
|
|
vllm_version_is,
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
)
|
2025-07-28 14:06:20 +08:00
|
|
|
|
|
|
|
|
|
2025-11-24 17:32:37 +08:00
|
|
|
# Communication strategies available for MoE token dispatch/combine on Ascend.
# Built via the Enum functional API; member names, values, and iteration order
# are identical to the equivalent class-statement definition.
MoECommType = Enum(
    "MoECommType",
    [
        ("ALLGATHER", 0),  # all-gather based dispatch
        ("MC2", 1),        # MC2 fused communication
        ("ALLTOALL", 2),   # all-to-all based dispatch
        ("FUSED_MC2", 3),  # fused MC2 variant
    ],
)
|
2025-11-24 17:32:37 +08:00
|
|
|
|
|
|
|
|
|
2025-07-28 14:06:20 +08:00
|
|
|
@contextmanager
|
|
|
|
|
def set_ascend_forward_context(
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
attn_metadata: Any,
|
|
|
|
|
vllm_config: VllmConfig,
|
|
|
|
|
virtual_engine: int = 0,
|
|
|
|
|
num_tokens: int = 0,
|
|
|
|
|
num_tokens_across_dp: torch.Tensor | None = None,
|
|
|
|
|
in_profile_run: bool = False,
|
|
|
|
|
num_actual_tokens: int | None = None,
|
|
|
|
|
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
|
|
|
|
batch_descriptor: BatchDescriptor | None = None,
|
|
|
|
|
model_instance: torch.nn.Module = None,
|
|
|
|
|
is_draft_model=False,
|
2026-01-27 08:44:36 +08:00
|
|
|
skip_compiled: bool = False,
|
2026-02-28 21:44:08 +08:00
|
|
|
max_tokens_across_pcp: int = 0,
|
2026-02-02 19:15:31 +08:00
|
|
|
draft_attn_metadatas=None,
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
):
|
2025-07-28 14:06:20 +08:00
|
|
|
"""A context manager that stores the current forward context,
|
|
|
|
|
can be attention metadata, etc.
|
|
|
|
|
We add some additional param into forward_context.
|
|
|
|
|
"""
|
2026-01-27 08:44:36 +08:00
|
|
|
forward_context_kwargs = {
|
|
|
|
|
"attn_metadata": attn_metadata,
|
|
|
|
|
"vllm_config": vllm_config,
|
|
|
|
|
"num_tokens": num_tokens,
|
|
|
|
|
"num_tokens_across_dp": num_tokens_across_dp,
|
|
|
|
|
"cudagraph_runtime_mode": aclgraph_runtime_mode,
|
|
|
|
|
"batch_descriptor": batch_descriptor,
|
2026-02-02 15:57:55 +08:00
|
|
|
"skip_compiled": skip_compiled,
|
2026-01-27 08:44:36 +08:00
|
|
|
}
|
2026-03-23 16:25:57 +08:00
|
|
|
if vllm_version_is("0.18.0"):
|
|
|
|
|
forward_context_kwargs["virtual_engine"] = virtual_engine
|
2026-01-27 08:44:36 +08:00
|
|
|
|
|
|
|
|
with set_forward_context(**forward_context_kwargs):
|
2025-07-28 14:06:20 +08:00
|
|
|
forward_context = get_forward_context()
|
2026-02-02 19:15:31 +08:00
|
|
|
forward_context.draft_attn_metadatas = draft_attn_metadatas
|
2025-09-22 19:12:58 +08:00
|
|
|
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
from vllm_ascend.ops.fused_moe.moe_comm_method import get_moe_comm_method
|
|
|
|
|
|
2026-03-23 17:05:02 +08:00
|
|
|
max_num_tokens = int(num_tokens_across_dp.max().item()) if num_tokens_across_dp is not None else num_tokens
|
|
|
|
|
moe_comm_type = select_moe_comm_method(max_num_tokens, vllm_config, is_draft_model)
|
|
|
|
|
|
2025-09-22 19:12:58 +08:00
|
|
|
forward_context.moe_comm_type = moe_comm_type
|
|
|
|
|
forward_context.moe_comm_method = get_moe_comm_method(moe_comm_type)
|
|
|
|
|
|
2025-09-08 22:52:24 +08:00
|
|
|
tp_world_size = get_tensor_model_parallel_world_size()
|
2025-07-28 14:06:20 +08:00
|
|
|
|
|
|
|
|
forward_context.in_profile_run = in_profile_run
|
|
|
|
|
|
|
|
|
|
# NOTE: This cannot be set using set_forward_context
|
|
|
|
|
# due to multiple warmups before actual capturing
|
|
|
|
|
forward_context.capturing = False
|
|
|
|
|
|
2026-01-23 21:12:23 +08:00
|
|
|
# TODO: remove it when torch_npu.npu_mm_reduce_scatter_base supports tp_size >= 16.
|
|
|
|
|
mmrs_fusion = tp_world_size <= 8
|
|
|
|
|
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
# set for sequence parallelism, 1000 is the batch size concurrency threshold
|
|
|
|
|
# for enabling the flashcomm_v1 or sequence_parallelism feature.
|
|
|
|
|
# Currently, it is an empirical value. In normal scenarios, if the concurrency
|
|
|
|
|
# exceeds this threshold, the performance benefits can be maximized.
|
|
|
|
|
# Conversely, if the concurrency is below the threshold,
|
2025-09-08 22:52:24 +08:00
|
|
|
# the performance may degrade due to the switching of communication methods.
|
2026-01-23 21:12:23 +08:00
|
|
|
|
2026-01-14 09:00:37 +08:00
|
|
|
# main model and drafter model may have different architecture
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
is_context_moe_model = is_drafter_moe_model(vllm_config) if is_draft_model else is_moe_model(vllm_config)
|
2026-01-14 09:00:37 +08:00
|
|
|
if is_context_moe_model:
|
2026-03-01 20:22:50 +08:00
|
|
|
flash_comm_v1_enabled = enable_sp(vllm_config) and num_tokens is not None
|
2025-10-28 23:30:27 +08:00
|
|
|
mmrs_fusion = False
|
[Bugfix] Avoided a bug of drafter when `dp` and `sp` are enabled (#6226)
### What this PR does / why we need it?
Avoided a bug of drafter when `dp` and `sp` are enabled.
Specifically, disable `sp` when drafter is dense.
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
An aisbench test:
```shell
python3 aisbench_test.py --input_len 3500 --output_len 1000 --data_num 100 --concurrency 320 --request_rate 8
```
The result is okay.
```text
[2026-01-24 22:38:20,256] [ais_bench.benchmark.openicl.icl_inferencer.icl_gen_inferencer] [INFO] Calculate global interval offsets time: 0.5922 s
01/24 22:38:20 - AISBench - INFO - Process 0 using precomputed sleep offsets with 100 requests
Process-0 pid:220279: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [09:40<00:00, 5.81s/it]
Pid: 220279 | Post: 100 | Received: 100 | Failed: 0 | Post Time:12.51s | Receive Time:580.92s:
Encoding output text...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 93.75it/s]
01/24 22:48:02 - AISBench - INFO - Start converting origin data to detailed data ...
01/24 22:48:02 - AISBench - INFO - Finish converting origin data to detailed data█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 95.08it/s]
01/24 22:48:02 - AISBench - INFO - Added 'Actual RPS: After Excluding Anomalies' to group 'Time - RPS: ' in legend explanation table
01/24 22:48:02 - AISBench - INFO - Successfully merged chart into position (1, 1)
01/24 22:48:02 - AISBench - INFO - RPS distribution charts saved to outputs/default/20260124_223809/performances/vllm-api-stream-chat/gsm8kdataset_rps_distribution_plot_with_actual_rps.html
01/24 22:48:02 - AISBench - INFO - Updated chart with actual RPS saved to outputs/default/20260124_223809/performances/vllm-api-stream-chat/gsm8kdataset_rps_distribution_plot_with_actual_rps.html
[2026-01-24 22:48:02,557] [ais_bench.benchmark.openicl.icl_inferencer.icl_gen_perf_inferencer] [INFO] Start extracting pref datas ...
[2026-01-24 22:48:02,558] [ais_bench.benchmark.openicl.icl_inferencer.icl_gen_perf_inferencer] [INFO] Finish extracting pref datas!
[2026-01-24 22:48:02,558] [ais_bench.benchmark.openicl.icl_inferencer.icl_gen_perf_inferencer] [INFO] Dumping detail perf data ...
Dumping data to h5: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 75.31it/s]
[2026-01-24 22:48:02,588] [ais_bench.benchmark.openicl.icl_inferencer.icl_gen_perf_inferencer] [INFO] Dump detail perf data cost: 0.02995561994612217(s)
[2026-01-24 22:48:02,588] [ais_bench.benchmark.openicl.icl_inferencer.icl_gen_perf_inferencer] [INFO] Performance task finished, results saved in outputs/default/20260124_223809/performances/vllm-api-stream-chat
01/24 22:48:02 - AISBench - INFO - time elapsed: 586.32s
Running tasks: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [09:55<00:00, 595.91s/it]
01/24 22:48:05 - AISBench - INFO - Performance evaluation tasks completed.
01/24 22:48:05 - AISBench - INFO - Loading detail perf data of model='vllm-api-stream-chat' dataset='gsm8kdataset' ...
01/24 22:48:05 - AISBench - INFO - Starting request timeline processing...
01/24 22:48:05 - AISBench - INFO - Data preprocessing completed in 0.0004s
01/24 22:48:05 - AISBench - INFO - Generating timeline traces for 100 requests...
01/24 22:48:05 - AISBench - INFO - Generated timeline trace chunks in 0.0441s
01/24 22:48:05 - AISBench - INFO - Generating concurrency traces...
01/24 22:48:05 - AISBench - INFO - Generated concurrency trace chunks in 0.0011s
01/24 22:48:05 - AISBench - INFO - Creating figure layout...
01/24 22:48:05 - AISBench - INFO - Figure layout created in 0.0504s
01/24 22:48:05 - AISBench - INFO - Writing to outputs/default/20260124_223809/performances/vllm-api-stream-chat/gsm8kdataset_plot.html...
01/24 22:48:05 - AISBench - INFO - HTML written in 0.0181s
01/24 22:48:05 - AISBench - INFO - Completed! Total execution time: 0.1148s
01/24 22:48:05 - AISBench - INFO - The gsm8kdataset_plot has been saved in outputs/default/20260124_223809/performances/vllm-api-stream-chat/gsm8kdataset_plot.html
01/24 22:48:05 - AISBench - INFO - Converting perf results of stage ...
01/24 22:48:05 - AISBench - INFO - Finish Converting!
01/24 22:48:05 - AISBench - INFO - Start calculating metrics ...
01/24 22:48:05 - AISBench - INFO - Start calculating common metrics ...
01/24 22:48:05 - AISBench - INFO - Start calculating add units ...
01/24 22:48:05 - AISBench - INFO - Finish calculating perf data!
01/24 22:48:05 - AISBench - INFO - Summarizing performance results...
01/24 22:48:05 - AISBench - INFO - Performance Results of task: vllm-api-stream-chat/gsm8kdataset:
╒══════════════════════════╤═════════╤════════════════╤════════════════╤════════════════╤════════════════╤════════════════╤════════════════╤════════════════╤═════╕
│ Performance Parameters │ Stage │ Average │ Min │ Max │ Median │ P75 │ P90 │ P99 │ N │
╞══════════════════════════╪═════════╪════════════════╪════════════════╪════════════════╪════════════════╪════════════════╪════════════════╪════════════════╪═════╡
│ E2EL │ total │ 300806.1781 ms │ 189326.0489 ms │ 568345.5121 ms │ 380629.6785 ms │ 384208.3527 ms │ 385363.7709 ms │ 566871.7684 ms │ 100 │
├──────────────────────────┼─────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼─────┤
│ TTFT │ total │ 107441.2231 ms │ 343.8054 ms │ 378132.3979 ms │ 188817.4877 ms │ 190985.8451 ms │ 192547.6847 ms │ 378008.356 ms │ 100 │
├──────────────────────────┼─────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼─────┤
│ TPOT │ total │ 193.5585 ms │ 185.1008 ms │ 197.262 ms │ 193.8146 ms │ 195.0803 ms │ 196.0323 ms │ 196.9688 ms │ 100 │
├──────────────────────────┼─────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼─────┤
│ ITL │ total │ 194.2067 ms │ 0.0108 ms │ 2782.7124 ms │ 184.9998 ms │ 194.2631 ms │ 221.2895 ms │ 304.363 ms │ 100 │
├──────────────────────────┼─────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼─────┤
│ InputTokens │ total │ 3506.86 │ 3431.0 │ 3508.0 │ 3508.0 │ 3508.0 │ 3508.0 │ 3508.0 │ 100 │
├──────────────────────────┼─────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼─────┤
│ OutputTokens │ total │ 1000.0 │ 1000.0 │ 1000.0 │ 1000.0 │ 1000.0 │ 1000.0 │ 1000.0 │ 100 │
├──────────────────────────┼─────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼─────┤
│ OutputTokenThroughput │ total │ 3.7745 token/s │ 1.7595 token/s │ 5.2819 token/s │ 2.6272 token/s │ 5.1028 token/s │ 5.1502 token/s │ 5.2754 token/s │ 100 │
╘══════════════════════════╧═════════╧════════════════╧════════════════╧════════════════╧════════════════╧════════════════╧════════════════╧════════════════╧═════╛
╒══════════════════════════╤═════════╤══════════════════╕
│ Common Metric │ Stage │ Value │
╞══════════════════════════╪═════════╪══════════════════╡
│ Benchmark Duration │ total │ 580456.2704 ms │
├──────────────────────────┼─────────┼──────────────────┤
│ Total Requests │ total │ 100 │
├──────────────────────────┼─────────┼──────────────────┤
│ Failed Requests │ total │ 0 │
├──────────────────────────┼─────────┼──────────────────┤
│ Success Requests │ total │ 100 │
├──────────────────────────┼─────────┼──────────────────┤
│ Concurrency │ total │ 51.8224 │
├──────────────────────────┼─────────┼──────────────────┤
│ Max Concurrency │ total │ 320 │
├──────────────────────────┼─────────┼──────────────────┤
│ Request Throughput │ total │ 0.1723 req/s │
├──────────────────────────┼─────────┼──────────────────┤
│ Total Input Tokens │ total │ 350686 │
├──────────────────────────┼─────────┼──────────────────┤
│ Prefill Token Throughput │ total │ 32.6398 token/s │
├──────────────────────────┼─────────┼──────────────────┤
│ Total generated tokens │ total │ 100000 │
├──────────────────────────┼─────────┼──────────────────┤
│ Input Token Throughput │ total │ 604.1558 token/s │
├──────────────────────────┼─────────┼──────────────────┤
│ Output Token Throughput │ total │ 172.2783 token/s │
├──────────────────────────┼─────────┼──────────────────┤
│ Total Token Throughput │ total │ 776.434 token/s │
╘══════════════════════════╧═════════╧══════════════════╛
01/24 22:48:05 - AISBench - INFO - Performance Result files locate in outputs/default/20260124_223809/performances/vllm-api-stream-chat.
```
- vLLM version: v0.14.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60
Signed-off-by: drslark <slarksblood@qq.com>
2026-01-25 17:45:29 +08:00
|
|
|
elif is_draft_model:
|
|
|
|
|
# TODO: for dense drafter, `sp` is redundant and is not compatible with `dp` and `graph`.
|
|
|
|
|
# Disable it to avoid more problems.
|
2026-02-27 08:27:41 +08:00
|
|
|
flash_comm_v1_enabled = False
|
2025-10-15 19:36:32 +08:00
|
|
|
else:
|
2026-03-01 20:22:50 +08:00
|
|
|
flash_comm_v1_enabled = enable_sp(vllm_config) and num_tokens is not None and num_tokens > 1000
|
2025-10-28 23:30:27 +08:00
|
|
|
forward_context.mmrs_fusion = mmrs_fusion
|
2025-11-10 11:01:45 +08:00
|
|
|
forward_context.num_tokens = num_tokens
|
2026-02-27 08:27:41 +08:00
|
|
|
forward_context.flash_comm_v1_enabled = flash_comm_v1_enabled
|
2025-12-18 23:34:31 +08:00
|
|
|
# TODO(Levi-JQ): another PR to normalize the enabling logic for sp/fc2
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
forward_context.flashcomm_v2_enabled = flashcomm2_enable() and tp_world_size > 1 and num_tokens is not None
|
2025-09-08 22:52:24 +08:00
|
|
|
|
2026-02-27 08:27:41 +08:00
|
|
|
forward_context.pad_size = 0
|
|
|
|
|
if forward_context.flash_comm_v1_enabled or forward_context.flashcomm_v2_enabled:
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
pad_size = (tp_world_size - (num_tokens % tp_world_size)) % tp_world_size
|
2025-09-08 22:52:24 +08:00
|
|
|
forward_context.pad_size = pad_size
|
|
|
|
|
|
2025-09-09 14:28:14 +08:00
|
|
|
# set this for rope forward_oot using
|
|
|
|
|
forward_context.is_first_layer = True
|
|
|
|
|
|
2025-09-11 21:20:09 +08:00
|
|
|
# set layer_idx to enable optimization features that depend on this information.
|
|
|
|
|
# This is only applicable to models that contain these necessary attributes.
|
|
|
|
|
forward_context.layer_idx = None
|
2025-10-29 15:59:55 +08:00
|
|
|
if has_layer_idx(model_instance):
|
2025-09-11 21:20:09 +08:00
|
|
|
forward_context.layer_idx = model_instance.model.start_layer
|
|
|
|
|
|
2026-02-04 09:08:18 +08:00
|
|
|
forward_context.prefetch_mlp_gate_up_proj = False
|
|
|
|
|
forward_context.prefetch_mlp_down_proj = False
|
2025-10-14 20:16:33 +08:00
|
|
|
forward_context.model_instance = model_instance
|
2025-12-29 09:54:51 +08:00
|
|
|
forward_context.is_draft_model = is_draft_model
|
2025-09-11 21:20:09 +08:00
|
|
|
|
2025-07-28 14:06:20 +08:00
|
|
|
if num_tokens is None and attn_metadata is not None:
|
2025-08-05 08:39:02 +08:00
|
|
|
num_tokens = attn_metadata.num_actual_tokens
|
2025-07-28 14:06:20 +08:00
|
|
|
|
|
|
|
|
dp_world_size = get_dp_group().world_size
|
|
|
|
|
if dp_world_size > 1 and forward_context.dp_metadata is not None:
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
max_tokens_across_dp = forward_context.dp_metadata.max_tokens_across_dp_cpu.item()
|
2026-02-27 08:27:41 +08:00
|
|
|
if forward_context.flash_comm_v1_enabled or forward_context.flashcomm_v2_enabled:
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
padded_length = (max_tokens_across_dp + tp_world_size - 1) // tp_world_size * tp_world_size
|
2025-10-15 19:36:32 +08:00
|
|
|
pad_size = padded_length - num_tokens
|
|
|
|
|
forward_context.padded_length = padded_length
|
|
|
|
|
forward_context.pad_size = pad_size
|
2025-07-28 14:06:20 +08:00
|
|
|
else:
|
|
|
|
|
max_tokens_across_dp = num_tokens
|
|
|
|
|
|
|
|
|
|
forward_context.max_tokens_across_dp = max_tokens_across_dp
|
2026-02-28 21:44:08 +08:00
|
|
|
forward_context.max_tokens_across_pcp = max_tokens_across_pcp
|
2025-07-28 14:06:20 +08:00
|
|
|
|
|
|
|
|
if num_tokens is not None:
|
2025-08-05 08:39:02 +08:00
|
|
|
if num_actual_tokens is None:
|
|
|
|
|
num_actual_tokens = num_tokens
|
2025-07-28 14:06:20 +08:00
|
|
|
# NOTE: token num which need to pad to when mc2
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
forward_context.padded_num_tokens = math.ceil(max_tokens_across_dp / tp_world_size) * tp_world_size
|
2025-12-12 17:27:09 +08:00
|
|
|
reserved_mc2_mask = get_mc2_mask()
|
2025-08-12 21:10:20 +08:00
|
|
|
if reserved_mc2_mask is not None:
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
mc2_mask = reserved_mc2_mask[: forward_context.padded_num_tokens]
|
2025-08-12 21:10:20 +08:00
|
|
|
mc2_mask[:num_actual_tokens] = True
|
|
|
|
|
mc2_mask[num_actual_tokens:] = False
|
|
|
|
|
forward_context.mc2_mask = mc2_mask
|
2025-07-28 14:06:20 +08:00
|
|
|
try:
|
|
|
|
|
yield
|
|
|
|
|
finally:
|
|
|
|
|
pass
|
2025-12-12 17:27:09 +08:00
|
|
|
|
|
|
|
|
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
# Cached MC2 token capacity: upper bound on tokens per batch, rounded up to a
# multiple of the TP size. Populated lazily by set_mc2_tokens_capacity().
_mc2_tokens_capacity: int | None = None
|
|
|
|
|
# Cached reserved MC2 mask: zero-initialized bool tensor sized to the MC2 token
# capacity. Allocated lazily by set_mc2_mask() for MoE models only.
_reserved_mc2_mask: torch.Tensor | None = None
|
2025-12-12 17:27:09 +08:00
|
|
|
|
|
|
|
|
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
def set_mc2_tokens_capacity(vllm_config, max_num_reqs, uniform_decode_query_len):
    """Compute and cache the global MC2 token capacity (idempotent).

    The capacity is an upper bound on tokens per batch, rounded up to a
    multiple of the tensor-parallel size so each TP rank handles an equal
    share. The first computed value is kept; later calls are no-ops.
    """
    global _mc2_tokens_capacity
    # Already initialized — keep the first computed value.
    if _mc2_tokens_capacity is not None:
        return
    compilation_cfg = vllm_config.compilation_config
    if compilation_cfg.cudagraph_capture_sizes:
        token_budget = compilation_cfg.max_cudagraph_capture_size
    else:
        # NOTE: To save memory, we cap the max number of tokens to 512.
        token_budget = min(max_num_reqs * uniform_decode_query_len, 512)
    tp_degree = vllm_config.parallel_config.tensor_parallel_size
    # Round the budget up to the next multiple of the TP size
    # (ceiling division done with integer arithmetic).
    per_rank_tokens = -(-token_budget // tp_degree)
    _mc2_tokens_capacity = per_rank_tokens * tp_degree
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_mc2_tokens_capacity() -> int | None:
    """Return the cached MC2 token capacity, or None if not yet computed."""
    return _mc2_tokens_capacity
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def set_mc2_mask(vllm_config, device):
    """Allocate and cache the reserved MC2 mask tensor (idempotent).

    For MoE models a zero-initialized boolean mask sized to the MC2 token
    capacity is created on *device*; for non-MoE models the mask stays unset.
    """
    global _reserved_mc2_mask
    # A mask already exists — nothing to do.
    if _reserved_mc2_mask is not None:
        return
    if not is_moe_model(vllm_config):
        # Non-MoE models never dispatch through MC2, so no mask is needed.
        _reserved_mc2_mask = None
        return
    capacity = get_mc2_tokens_capacity()
    _reserved_mc2_mask = torch.zeros(capacity, dtype=torch.bool, device=device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_mc2_mask() -> torch.Tensor | None:
    """Return the cached reserved MC2 mask tensor, or None if not allocated."""
    return _reserved_mc2_mask
|
|
|
|
|
|
|
|
|
|
|
[Lint]Style: Convert `vllm-ascend/compilation` to ruff format (#5912)
### What this PR does / why we need it?
Convert `vllm-ascend/compilation` to ruff format.
### Does this PR introduce _any_ user-facing change?
During this migration, we encountered some **errors** in our CI and
testing environments, such as:
```
vllm_ascend/utils.py:653: in <module>
def register_ascend_customop(vllm_config: VllmConfig | None = None):
^^^^^^^^^^^^^^^^^
E TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'
```
**1. Root Cause Analysis:**
The project uses a common pattern to break circular dependencies:
```python
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None # Placeholder assigned at runtime
```
When Python parses the function definition `def
register_ascend_customop(vllm_config: VllmConfig | None)`, it attempts
to evaluate the expression `VllmConfig | None`.
Since `VllmConfig` is assigned `None` at runtime, the expression
effectively becomes `None | None`. In Python, `None` is an instance of
`NoneType`. While the `|` operator is implemented for Type objects
(classes), it is not supported for `NoneType` instances, leading to the
`TypeError` shown above.
**2. Solution:**
To maintain the modern `|` syntax required by our new linting standards
while preserving our dependency management strategy, I have introduced:
```python
from __future__ import annotations
```
at the top of the affected files. This enables **Postponed Evaluation of
Annotations (PEP 563)**.
**3. Impact and Benefits:**
- By enabling `annotations`, Python no longer executes the `VllmConfig |
None` operation during module load. Instead, it stores the annotation as
a string literal, completely avoiding the `None | None` calculation.
- We can keep the `VllmConfig = None` placeholders. This ensures that
other modules can still import these symbols without triggering an
`ImportError`, maintaining a stable dependency graph.
- IDEs and static type checkers (MyPy/Pyright) continue to resolve the
types correctly. This allows us to use modern syntax without sacrificing
type safety or runtime stability.
- The only side effect is that `__annotations__` will now return strings
instead of type objects. Since this module does not use runtime type
enforcement or reflection, this change has zero negative impact on
existing functionality.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-16 20:57:46 +08:00
|
|
|
def select_moe_comm_method(num_tokens: int, vllm_config: VllmConfig, is_draft_model=False) -> MoECommType | None:
    """Select the MoE communication method according to parallel settings,
    device generation, token count, and quantization.

    1. Non-MoE models return `None`.
    2. Without expert parallel, fall back to all-gather.
    3. On A2 with expert parallel, pick MC2 when tokens fit the MC2 capacity
       and the DP size is large enough; otherwise use all-gather.
    4. On A3 with expert parallel, prefer fused MC2 when using w8a8_dynamic
       quantization with small EP size, no dynamic_eplb, and not in MTP
       mode; otherwise use MC2 within capacity or all-to-all.
    5. On 310P, always use all-gather.
    6. On A5 with expert parallel, pick MC2 when tokens fit the MC2 capacity
       and there is more than one rank across DP; otherwise use all-to-all.

    Args:
        num_tokens (int): The number of tokens in the current batch.
        vllm_config (VllmConfig): Runtime configuration for the model.
        is_draft_model (bool): Whether the model runs in MTP mode (disables fused MC2).

    Raises:
        ValueError: If the soc version is unsupported.

    Returns:
        MoECommType | None: The selected MoE communication method.
    """
    if not is_moe_model(vllm_config):
        return None
    mc2_tokens_capacity = get_mc2_tokens_capacity()
    soc_version = get_ascend_device_type()
    # Prefer the MoE-specific quantization setting; fall back to the generic
    # `quantize` field when `moe_quantize` is absent.
    quant_type = getattr(
        vllm_config.model_config.hf_text_config,
        "moe_quantize",
        getattr(vllm_config.model_config.hf_text_config, "quantize", None),
    )
    # Without expert parallel (or with a single-rank EP group) every device
    # holds all experts, so a plain all-gather suffices.
    if not vllm_config.parallel_config.enable_expert_parallel or get_ep_group().world_size == 1:
        moe_comm_type = MoECommType.ALLGATHER
    elif soc_version in {AscendDeviceType.A2}:
        num_experts = vllm_config.model_config.get_num_experts()
        # EP spans all DP ranks within one pipeline stage.
        ep_world_size = (
            vllm_config.parallel_config.world_size_across_dp // vllm_config.parallel_config.pipeline_parallel_size
        )
        num_experts_per_device = num_experts // ep_world_size
        # MC2 pays off on A2 only with few experts per device, a large enough
        # EP group, and a token count within the reserved MC2 capacity.
        if num_experts_per_device <= 24 and ep_world_size >= 16 and num_tokens <= mc2_tokens_capacity:
            moe_comm_type = MoECommType.MC2
        else:
            moe_comm_type = MoECommType.ALLGATHER

    elif soc_version in {AscendDeviceType.A3}:
        # TODO: drop the EP-size guard when dispatch_ffn_combine supports larger EP sizes
        # TODO: drop speculative method guard when dispatch_gmm_combine_decode supports w16a16
        fused_mc2_enable = envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2
        dispatch_ffn_combine_enable = get_ep_group().world_size <= 32 and (not is_draft_model)
        if num_tokens <= mc2_tokens_capacity:
            # Batch fits the MC2 capacity: choose between fused MC2 and plain MC2.
            fused_decode_enable = fused_mc2_enable
            if envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 1:
                fused_decode_enable = fused_mc2_enable and dispatch_ffn_combine_enable
            elif envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 2:
                # Mode 2 additionally requires the speculative decode fused path
                # and w8a8_dynamic quantization.
                fused_decode_enable = (
                    fused_mc2_enable
                    and speculative_enable_dispatch_gmm_combine_decode(vllm_config)
                    and quant_type == "w8a8_dynamic"
                )
            moe_comm_type = MoECommType.FUSED_MC2 if fused_decode_enable else MoECommType.MC2
        else:
            # Batch exceeds the MC2 capacity: choose between fused MC2 and all-to-all.
            fused_prefill_enable = fused_mc2_enable
            if envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 1:
                fused_prefill_enable = fused_mc2_enable and dispatch_ffn_combine_enable
            elif envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 2:
                # Mode 2's fused path is decode-only; never fuse above capacity.
                fused_prefill_enable = False
            moe_comm_type = MoECommType.FUSED_MC2 if fused_prefill_enable else MoECommType.ALLTOALL
    elif soc_version in {AscendDeviceType._310P}:
        moe_comm_type = MoECommType.ALLGATHER
    elif soc_version in {AscendDeviceType.A5}:
        if num_tokens <= mc2_tokens_capacity and vllm_config.parallel_config.world_size_across_dp > 1:
            moe_comm_type = MoECommType.MC2
        else:
            moe_comm_type = MoECommType.ALLTOALL
    else:
        raise ValueError(f"Unsupported soc_version: {soc_version}")
    return moe_comm_type
|
2026-03-13 09:11:46 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class _ExtraForwardContextProxy:
|
|
|
|
|
"""Unified forward-context access for v1/v2 model runners."""
|
|
|
|
|
|
|
|
|
|
extra_attrs = (
|
|
|
|
|
"capturing",
|
|
|
|
|
"moe_comm_type",
|
|
|
|
|
"moe_comm_method",
|
|
|
|
|
"mmrs_fusion",
|
|
|
|
|
"num_tokens",
|
|
|
|
|
"flash_comm_v1_enabled",
|
|
|
|
|
"flashcomm_v2_enabled",
|
|
|
|
|
"pad_size",
|
|
|
|
|
"padded_length",
|
|
|
|
|
"num_tokens_across_dp",
|
|
|
|
|
"mc2_mask",
|
|
|
|
|
"is_draft_model",
|
|
|
|
|
"prefetch_mlp_gate_up_proj",
|
|
|
|
|
"prefetch_mlp_down_proj",
|
|
|
|
|
"model_instance",
|
|
|
|
|
"layer_idx",
|
|
|
|
|
"max_tokens_across_dp",
|
|
|
|
|
"max_tokens_across_pcp",
|
|
|
|
|
"num_accept_tokens",
|
|
|
|
|
"in_profile_run",
|
|
|
|
|
"padded_num_tokens",
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def check_extra_attr(self, name: str):
|
|
|
|
|
if name not in self.extra_attrs:
|
|
|
|
|
raise AttributeError(
|
|
|
|
|
f"{name} is not extra forward context attribute, "
|
|
|
|
|
"please get/set it from vllm's _forward_context directly."
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _ctx():
|
|
|
|
|
return get_forward_context()
|
|
|
|
|
|
|
|
|
|
def __getattr__(self, name: str) -> Any:
|
|
|
|
|
self.check_extra_attr(name)
|
|
|
|
|
ctx = self._ctx()
|
|
|
|
|
if envs_vllm.VLLM_USE_V2_MODEL_RUNNER:
|
|
|
|
|
return ctx.additional_kwargs[name]
|
|
|
|
|
return getattr(ctx, name)
|
|
|
|
|
|
|
|
|
|
def __setattr__(self, name: str, value: Any) -> None:
|
|
|
|
|
self.check_extra_attr(name)
|
|
|
|
|
ctx = self._ctx()
|
|
|
|
|
if envs_vllm.VLLM_USE_V2_MODEL_RUNNER:
|
|
|
|
|
ctx.additional_kwargs[name] = value
|
|
|
|
|
else:
|
|
|
|
|
setattr(ctx, name, value)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# usage: from vllm_ascend.ascend_forward_context import _EXTRA_CTX
# Module-level singleton: all callers share one proxy over the active forward
# context, so no per-call construction is needed.
_EXTRA_CTX = _ExtraForwardContextProxy()
|