xc-llm-ascend/vllm_ascend/distributed/parallel_state.py

from typing import Optional

import torch
from vllm.config import ParallelConfig
from vllm.distributed.parallel_state import (GroupCoordinator, get_world_group,
                                             init_model_parallel_group)

import vllm_ascend.envs as envs_ascend

# Currently, the mc2 op needs its own group coordinator. It is created by
# init_ascend_model_parallel() and released by
# destroy_ascend_model_parallel(); it stays None until then.
_MC2: Optional[GroupCoordinator] = None
# Group coordinator for the MLP tensor-parallel group. It is only created by
# init_ascend_model_parallel() when
# envs_ascend.VLLM_ASCEND_ENABLE_MLP_OPTIMIZE is set; otherwise it stays None.
_MLP_TP: Optional[GroupCoordinator] = None


def get_mc2_group() -> GroupCoordinator:
    """Return the group coordinator used by the mc2 op.

    Raises:
        AssertionError: If the mc2 group has not been created yet, i.e. the
            module-level ``_MC2`` is still ``None``.
    """
    group = _MC2
    assert group is not None, "mc2 group is not initialized"
    return group


def get_mlp_tp_group() -> GroupCoordinator:
    """Return the group coordinator of the MLP tensor-parallel group.

    Raises:
        AssertionError: If the MLP group has not been created yet, i.e. the
            module-level ``_MLP_TP`` is still ``None``.
    """
    group = _MLP_TP
    assert group is not None, "mlp group is not initialized"
    return group


def model_parallel_initialized():
    """Report whether the Ascend model-parallel state has been set up.

    Only the mc2 group is inspected; the MLP tensor-parallel group is not
    consulted here.

    Returns:
        ``True`` when ``_MC2`` holds a group, ``False`` while it is ``None``.
    """
    mc2_ready = _MC2 is not None
    return mc2_ready


def init_ascend_model_parallel(parallel_config: ParallelConfig, ):
    """Create the Ascend-specific process groups for the current process.

    Returns immediately if the mc2 group already exists. Otherwise the mc2
    group is built, followed by the MLP tensor-parallel group when
    ``envs_ascend.VLLM_ASCEND_ENABLE_MLP_OPTIMIZE`` is set.

    Args:
        parallel_config: Supplies ``data_parallel_size``,
            ``tensor_parallel_size`` and ``pipeline_parallel_size``, which
            determine how the global ranks are partitioned into groups.

    Raises:
        AssertionError: If ``torch.distributed`` is not initialized, or if
            the MLP tensor-parallel group already exists when it is about
            to be created.
    """
    global _MC2, _MLP_TP

    if model_parallel_initialized():
        return
    assert torch.distributed.is_initialized()

    world_size = torch.distributed.get_world_size()
    backend = torch.distributed.get_backend(get_world_group().device_group)

    # The layout of all ranks: ExternalDP * EP
    # ExternalDP is the data parallel group that is not part of the model,
    # every dp rank can generate independently (in verl integration).
    mc2_group_size = (parallel_config.data_parallel_size *
                      parallel_config.tensor_parallel_size)
    # Every row of the (-1, DP * TP) rank grid becomes one mc2 group.
    mc2_rank_groups = torch.arange(world_size).reshape(
        -1, mc2_group_size).tolist()
    _MC2 = init_model_parallel_group(mc2_rank_groups,
                                     get_world_group().local_rank,
                                     backend,
                                     group_name="mc2")

    if not envs_ascend.VLLM_ASCEND_ENABLE_MLP_OPTIMIZE:
        return

    assert _MLP_TP is None, (
        "mlp tensor model parallel group is already initialized")

    # The MLP tensor-parallel group size is taken from the data-parallel size.
    mlp_tp_size = parallel_config.data_parallel_size
    rank_grid = torch.arange(world_size).reshape(
        -1, mlp_tp_size, parallel_config.pipeline_parallel_size, 1)  # noqa
    # NOTE(review): view(-1, mlp_tp_size) groups consecutive ranks, which
    # lines up with the 4-D layout above only when pipeline_parallel_size
    # is 1 -- confirm the intended grouping for pipeline_parallel_size > 1.
    mlp_rank_groups = rank_grid.view(-1, mlp_tp_size).tolist()
    _MLP_TP = init_model_parallel_group(mlp_rank_groups,
                                        get_world_group().local_rank,
                                        backend,
                                        group_name="mlp_tp")


def get_mlp_tensor_model_parallel_world_size():
    """Return the world size of the MLP tensor-parallel group.

    Raises:
        AssertionError: If the MLP group has not been initialized (raised by
            ``get_mlp_tp_group``).
    """
    mlp_group = get_mlp_tp_group()
    return mlp_group.world_size


def get_mlp_tensor_model_parallel_rank():
    """Return the caller's rank within the MLP tensor-parallel group.

    Raises:
        AssertionError: If the MLP group has not been initialized (raised by
            ``get_mlp_tp_group``).
    """
    mlp_group = get_mlp_tp_group()
    return mlp_group.rank_in_group


def destroy_ascend_model_parallel():
    """Destroy the Ascend-specific process groups and reset module state.

    Calls ``destroy()`` on the mc2 group and on the MLP tensor-parallel group
    when they exist, then resets both module-level references to ``None``.
    Calling this when no group was ever created is a no-op.
    """
    global _MC2, _MLP_TP

    # Compare against None explicitly (as the getters in this module do)
    # rather than relying on truthiness: a group object that happens to
    # evaluate as falsy must still be destroyed, not silently dropped.
    if _MC2 is not None:
        _MC2.destroy()
    _MC2 = None

    if _MLP_TP is not None:
        _MLP_TP.destroy()
    _MLP_TP = None
[main][refactor] Refactoring forward_context and model_runner_v1 (#1979) ### What this PR does / why we need it? A refactoring of forward_context and model_runner_v1, add some context which is necessary in model inference into forward_context, and refactor dummy_run logic, make it more reasonable. Some details for this PR: Add `ascend_forward_context`; Update mc2_v2 op, and support `active_mask` param; Update scripts in examples dir; refactor `dummy_run` logic; Add soc_version for A2 and A3; ### Does this PR introduce _any_ user-facing change? No change at user-facing. ### How was this patch tested? - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/57c22e57f989b466a46a990243bb7f072a668b7f Signed-off-by: zzzzwwjj <1183291235@qq.com> 2025-07-28 14:06:20 +08:00			`from typing import Optional`

			`import torch`
			`from vllm.config import ParallelConfig`
			`from vllm.distributed.parallel_state import (GroupCoordinator, get_world_group,`
			`init_model_parallel_group)`

add mlp tp optimize (#2120) ### What this PR does / why we need it? For dense models, by not applying tensor parallelism (TP) to the attention module and applying TP to the MLP module, the allreduce operations in the attention module can be eliminated, thereby reducing computational overhead. However, this approach increases memory usage, so the environment variable VLLM_ASCEND_ENABLE_MLP_OPTIMIZE is used to control this optimization. - vLLM main: https://github.com/vllm-project/vllm/commit/b17109beeafbf9577c319ab61530810943a7fc4b Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> 2025-08-21 09:22:07 +08:00			`import vllm_ascend.envs as envs_ascend`

[main][refactor] Refactoring forward_context and model_runner_v1 (#1979) ### What this PR does / why we need it? A refactoring of forward_context and model_runner_v1, add some context which is necessary in model inference into forward_context, and refactor dummy_run logic, make it more reasonable. Some details for this PR: Add `ascend_forward_context`; Update mc2_v2 op, and support `active_mask` param; Update scripts in examples dir; refactor `dummy_run` logic; Add soc_version for A2 and A3; ### Does this PR introduce _any_ user-facing change? No change at user-facing. ### How was this patch tested? - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/57c22e57f989b466a46a990243bb7f072a668b7f Signed-off-by: zzzzwwjj <1183291235@qq.com> 2025-07-28 14:06:20 +08:00			`# Currently, mc2 op need their own group coordinator.`
			`_MC2: Optional[GroupCoordinator] = None`
add mlp tp optimize (#2120) ### What this PR does / why we need it? For dense models, by not applying tensor parallelism (TP) to the attention module and applying TP to the MLP module, the allreduce operations in the attention module can be eliminated, thereby reducing computational overhead. However, this approach increases memory usage, so the environment variable VLLM_ASCEND_ENABLE_MLP_OPTIMIZE is used to control this optimization. - vLLM main: https://github.com/vllm-project/vllm/commit/b17109beeafbf9577c319ab61530810943a7fc4b Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> 2025-08-21 09:22:07 +08:00			`_MLP_TP: Optional[GroupCoordinator] = None`
[main][refactor] Refactoring forward_context and model_runner_v1 (#1979) ### What this PR does / why we need it? A refactoring of forward_context and model_runner_v1, add some context which is necessary in model inference into forward_context, and refactor dummy_run logic, make it more reasonable. Some details for this PR: Add `ascend_forward_context`; Update mc2_v2 op, and support `active_mask` param; Update scripts in examples dir; refactor `dummy_run` logic; Add soc_version for A2 and A3; ### Does this PR introduce _any_ user-facing change? No change at user-facing. ### How was this patch tested? - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/57c22e57f989b466a46a990243bb7f072a668b7f Signed-off-by: zzzzwwjj <1183291235@qq.com> 2025-07-28 14:06:20 +08:00

			`def get_mc2_group() -> GroupCoordinator:`
			`assert _MC2 is not None, ("mc2 group is not initialized")`
			`return _MC2`


add mlp tp optimize (#2120) ### What this PR does / why we need it? For dense models, by not applying tensor parallelism (TP) to the attention module and applying TP to the MLP module, the allreduce operations in the attention module can be eliminated, thereby reducing computational overhead. However, this approach increases memory usage, so the environment variable VLLM_ASCEND_ENABLE_MLP_OPTIMIZE is used to control this optimization. - vLLM main: https://github.com/vllm-project/vllm/commit/b17109beeafbf9577c319ab61530810943a7fc4b Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> 2025-08-21 09:22:07 +08:00			`def get_mlp_tp_group() -> GroupCoordinator:`
			`assert _MLP_TP is not None, ("mlp group is not initialized")`
			`return _MLP_TP`


[main][refactor] Refactoring forward_context and model_runner_v1 (#1979) ### What this PR does / why we need it? A refactoring of forward_context and model_runner_v1, add some context which is necessary in model inference into forward_context, and refactor dummy_run logic, make it more reasonable. Some details for this PR: Add `ascend_forward_context`; Update mc2_v2 op, and support `active_mask` param; Update scripts in examples dir; refactor `dummy_run` logic; Add soc_version for A2 and A3; ### Does this PR introduce _any_ user-facing change? No change at user-facing. ### How was this patch tested? - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/57c22e57f989b466a46a990243bb7f072a668b7f Signed-off-by: zzzzwwjj <1183291235@qq.com> 2025-07-28 14:06:20 +08:00			`def model_parallel_initialized():`
			`return (_MC2 is not None)`


			`def init_ascend_model_parallel(parallel_config: ParallelConfig, ):`
			`if model_parallel_initialized():`
			`return`
			`assert torch.distributed.is_initialized()`
			`world_size = torch.distributed.get_world_size()`
			`backend = torch.distributed.get_backend(get_world_group().device_group)`

			`# The layout of all ranks: ExternalDP * EP`
			`# ExternalDP is the data parallel group that is not part of the model,`
			`# every dp rank can generate independently (in verl integration).`
			`all_ranks = torch.arange(world_size).reshape(`
			`-1, parallel_config.data_parallel_size *`
			`parallel_config.tensor_parallel_size)`
			`global _MC2`
			`group_ranks = all_ranks.unbind(0)`
			`group_ranks = [x.tolist() for x in group_ranks]`

			`_MC2 = init_model_parallel_group(group_ranks,`
			`get_world_group().local_rank,`
			`backend,`
			`group_name="mc2")`
add mlp tp optimize (#2120) ### What this PR does / why we need it? For dense models, by not applying tensor parallelism (TP) to the attention module and applying TP to the MLP module, the allreduce operations in the attention module can be eliminated, thereby reducing computational overhead. However, this approach increases memory usage, so the environment variable VLLM_ASCEND_ENABLE_MLP_OPTIMIZE is used to control this optimization. - vLLM main: https://github.com/vllm-project/vllm/commit/b17109beeafbf9577c319ab61530810943a7fc4b Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> 2025-08-21 09:22:07 +08:00			`if envs_ascend.VLLM_ASCEND_ENABLE_MLP_OPTIMIZE:`
			`global _MLP_TP`
			`assert _MLP_TP is None, (`
			`"mlp tensor model parallel group is already initialized")`

			`mlp_tp = parallel_config.data_parallel_size`

			`all_ranks_mlp_head = torch.arange(world_size).reshape(`
			`-1, mlp_tp, parallel_config.pipeline_parallel_size, 1) # noqa`
			`group_ranks = all_ranks_mlp_head.view(-1, mlp_tp).unbind(0)`
			`group_ranks = [x.tolist() for x in group_ranks]`

			`# message queue broadcaster is only used in tensor model parallel group`
			`_MLP_TP = init_model_parallel_group(group_ranks,`
			`get_world_group().local_rank,`
			`backend,`
			`group_name="mlp_tp")`


			`def get_mlp_tensor_model_parallel_world_size():`
			`"""Return world size for the tensor model parallel group."""`
			`return get_mlp_tp_group().world_size`


			`def get_mlp_tensor_model_parallel_rank():`
			`"""Return world size for the tensor model parallel group."""`
			`return get_mlp_tp_group().rank_in_group`
[main][refactor] Refactoring forward_context and model_runner_v1 (#1979) ### What this PR does / why we need it? A refactoring of forward_context and model_runner_v1, add some context which is necessary in model inference into forward_context, and refactor dummy_run logic, make it more reasonable. Some details for this PR: Add `ascend_forward_context`; Update mc2_v2 op, and support `active_mask` param; Update scripts in examples dir; refactor `dummy_run` logic; Add soc_version for A2 and A3; ### Does this PR introduce _any_ user-facing change? No change at user-facing. ### How was this patch tested? - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/57c22e57f989b466a46a990243bb7f072a668b7f Signed-off-by: zzzzwwjj <1183291235@qq.com> 2025-07-28 14:06:20 +08:00

			`def destroy_ascend_model_parallel():`
			`global _MC2`
			`if _MC2:`
			`_MC2.destroy()`
			`_MC2 = None`
add mlp tp optimize (#2120) ### What this PR does / why we need it? For dense models, by not applying tensor parallelism (TP) to the attention module and applying TP to the MLP module, the allreduce operations in the attention module can be eliminated, thereby reducing computational overhead. However, this approach increases memory usage, so the environment variable VLLM_ASCEND_ENABLE_MLP_OPTIMIZE is used to control this optimization. - vLLM main: https://github.com/vllm-project/vllm/commit/b17109beeafbf9577c319ab61530810943a7fc4b Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> 2025-08-21 09:22:07 +08:00
			`global _MLP_TP`
			`if _MLP_TP:`
			`_MLP_TP.destroy()`
			`_MLP_TP = None`