[refactor] Refactor the interface for shard weight and remove the flashcomm2 o_shared interface. (#5181)
### What this PR does / why we need it?
- Remove the environment variable
`VLLM_ASCEND_ENABLE_FLASHCOMM2_OSHARED`.
- Introduce `layer_sharding` as a configurable feature in
`additional_config`.
- Rename the term "shared weight" to "shard weight".
Configuration: the feature is opt-in via the `--additional-config`
argument:
```
--additional-config '{
"layer_sharding": ["o_proj", "q_b_proj"]
}'
```
This is orthogonal to standard tensor parallelism and weight-replication
strategies and is treated as a separate, explicit feature. It can be used
in any scenario, combined with the flashcomm2 feature
(https://github.com/vllm-project/vllm-ascend/pull/3232) or the ShardedCP
feature (#4702), to achieve significant performance gains.
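For illustration, the sketch below shows one way a `layer_sharding` list from
`additional_config` could gate per-layer shard-weight handling. The
`AscendAdditionalConfig` dataclass and `should_shard` helper are hypothetical
names used only for this example; they are not the actual vllm-ascend
implementation.
```python
# Minimal sketch only: hypothetical names, not the vllm-ascend implementation.
from dataclasses import dataclass, field


@dataclass
class AscendAdditionalConfig:
    # Layer-name substrings whose weights are sharded instead of replicated.
    layer_sharding: list[str] = field(default_factory=list)


def should_shard(layer_name: str, config: AscendAdditionalConfig) -> bool:
    """Return True if this layer opts into shard-weight mode."""
    return any(key in layer_name for key in config.layer_sharding)


# Mirrors the --additional-config example above.
cfg = AscendAdditionalConfig(layer_sharding=["o_proj", "q_b_proj"])
assert should_shard("model.layers.0.self_attn.o_proj", cfg)
assert not should_shard("model.layers.0.mlp.gate_proj", cfg)
```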
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: zzhx1 <zzh_201018@outlook.com>
Signed-off-by: zzhxx <zhangzihang23@mails.ucas.ac.cn>
Signed-off-by: chenxiao <Jaychou1620@Gmail.com>
Co-authored-by: clrs97 <524936896@qq.com>
Co-authored-by: Levi-JQ <yujinqi2@huawei.com>
Co-authored-by: chenxiao <Jaychou1620@Gmail.com>
Excerpt of the test changes in this commit (`mock_distributed` fixture and
`test_init_ascend_model_parallel`):
```diff
@@ -25,14 +25,10 @@ def mock_distributed():
             patch('torch.distributed.get_world_size', return_value=16), \
             patch('torch.distributed.get_backend', return_value='nccl'), \
             patch('vllm_ascend.distributed.parallel_state.get_world_group') as mock_group, \
-            patch('vllm_ascend.distributed.parallel_state.get_tp_group') as mock_tp_group, \
-            patch('vllm_ascend.distributed.parallel_state.get_dp_group') as mock_dp_group, \
-            patch('vllm_ascend.distributed.parallel_state.get_pp_group') as mock_pp_group:
+            patch('vllm_ascend.distributed.parallel_state.get_tp_group') as mock_tp_group:
         mock_group.return_value.local_rank = 0
         mock_group.return_value.device_group = MagicMock()
         mock_tp_group.return_value.world_size = 4
-        mock_dp_group.return_value.world_size = 2
-        mock_pp_group.return_value.world_size = 2
         yield
 
 
@@ -50,7 +46,6 @@ def test_init_ascend_model_parallel(mock_distributed, parallel_config):
     mock_vllm_config.kv_transfer_config.is_kv_producer = True
     mock_envs_ascend = MagicMock()
     mock_envs_ascend.VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE = 2
-    mock_envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM2_OSHARED = 0
     mock_envs_ascend.VLLM_ASCEND_ENABLE_CONTEXT_PARALLEL = 0
     with patch('vllm_ascend.distributed.parallel_state.model_parallel_initialized', return_value=False), \
         patch('vllm_ascend.distributed.parallel_state.init_model_parallel_group'), \
```