[main2main] upgrade vllm to 0308 (#7213)
### What this PR does / why we need it?
Update main2main to vllm 0308.
Breaking changes addressed:
* https://github.com/vllm-project/vllm/pull/30681
* https://github.com/vllm-project/vllm/pull/35552 removes `self.cudagraph_batch_sizes`
* https://github.com/vllm-project/vllm/pull/35158 renames `clear_metadata` to `defer_finalize`
* https://github.com/vllm-project/vllm/pull/36006 removes `CacheConfig.cpu_offload_gb`
* https://github.com/vllm-project/vllm/pull/35472
* https://github.com/vllm-project/vllm/pull/34552 attn_metadata_builder
* https://github.com/vllm-project/vllm/pull/30515 profile_seq_lens
* https://github.com/vllm-project/vllm/pull/28053
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: menogrey <1299267905@qq.com>
Co-authored-by: MrZ20 <2609716663@qq.com>
This commit is contained in:
@@ -9,7 +9,6 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig, FusedMoE
|
||||
|
||||
from vllm_ascend.ascend_config import init_ascend_config
|
||||
from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
# isort: on
|
||||
|
||||
|
||||
@@ -22,38 +21,22 @@ class TestAscendConfig(unittest.TestCase):
|
||||
"eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2},
|
||||
}
|
||||
from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
|
||||
if vllm_version_is("0.16.0"):
|
||||
moe_parallel_config = FusedMoEParallelConfig(
|
||||
2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", is_sequence_parallel=True, enable_eplb=True)
|
||||
moe_config = FusedMoEConfig(
|
||||
num_experts=8,
|
||||
experts_per_token=8,
|
||||
hidden_dim=8192,
|
||||
intermediate_size_per_partition=5,
|
||||
num_local_experts=8,
|
||||
activation="silu",
|
||||
device="npu",
|
||||
routing_method=RoutingMethodType.Simulated,
|
||||
moe_parallel_config=moe_parallel_config,
|
||||
in_dtype=torch.float16,
|
||||
)
|
||||
else:
|
||||
moe_parallel_config = FusedMoEParallelConfig(
|
||||
2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl",
|
||||
enable_eplb=True)
|
||||
moe_config = FusedMoEConfig(
|
||||
num_experts=8,
|
||||
experts_per_token=8,
|
||||
hidden_dim=8192,
|
||||
intermediate_size_per_partition=5,
|
||||
num_local_experts=8,
|
||||
num_logical_experts=8,
|
||||
activation="silu",
|
||||
device="npu",
|
||||
routing_method=RoutingMethodType.Simulated,
|
||||
moe_parallel_config=moe_parallel_config,
|
||||
in_dtype=torch.float16,
|
||||
)
|
||||
moe_parallel_config = FusedMoEParallelConfig(
|
||||
2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl",
|
||||
enable_eplb=True)
|
||||
moe_config = FusedMoEConfig(
|
||||
num_experts=8,
|
||||
experts_per_token=8,
|
||||
hidden_dim=8192,
|
||||
intermediate_size_per_partition=5,
|
||||
num_local_experts=8,
|
||||
num_logical_experts=8,
|
||||
activation="silu",
|
||||
device="npu",
|
||||
routing_method=RoutingMethodType.Simulated,
|
||||
moe_parallel_config=moe_parallel_config,
|
||||
in_dtype=torch.float16,
|
||||
)
|
||||
moe_config.supports_eplb = True
|
||||
self.vllm_config = vllm_config
|
||||
self.moe_config = moe_config
|
||||
|
||||
Reference in New Issue
Block a user