[Misc] Upgrade vllm hash to 12_14 (#5000)
### What this PR does / why we need it?
Bump the pinned vLLM commit hash to the 12_14 snapshot (ad32e3e19c) and adapt vllm-ascend to the following upstream changes:
1. Adapt to https://github.com/vllm-project/vllm/pull/27938.
2. Adapt to https://github.com/vllm-project/vllm/pull/27145: pooling models now support chunked prefill and prefix caching (see the usage sketch after this list).
3. Adapt to https://github.com/vllm-project/vllm/pull/30181: the CPU fields are now defined in the field config where they really belong.
4. Adapt to https://github.com/vllm-project/vllm/pull/28168: the CPU fields are now defined in the field config where they really belong.
5. Adapt to https://github.com/vllm-project/vllm/pull/30201: some modules were renamed.
6. Adapt to https://github.com/vllm-project/vllm/pull/29067: the FusedMoE module was refactored (see the expert_map sketch after this list).
7. Adapt to https://github.com/vllm-project/vllm/pull/29066: the FusedMoE module was refactored.
8. Adapt to https://github.com/vllm-project/vllm/pull/29624.
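For item 2, a hedged usage sketch of what the upstream pooling change enables on the vLLM side. The model name is only an illustrative embedding model, the two `enable_*` flags are standard vLLM engine arguments, and whether they must be passed explicitly may vary by version; this snippet is not part of this PR:

```python
from vllm import LLM

# Illustrative sketch, not code from this PR: an example embedding (pooling)
# model run with chunked prefill and prefix caching enabled, which the
# upstream change no longer rejects for pooling models.
llm = LLM(
    model="BAAI/bge-m3",
    enable_chunked_prefill=True,
    enable_prefix_caching=True,
)

# Requests sharing a prefix can now reuse cached KV blocks during pooling runs.
outputs = llm.embed([
    "vLLM on Ascend: pooling with chunked prefill",
    "vLLM on Ascend: pooling with prefix caching",
])
print(len(outputs[0].outputs.embedding))
```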
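For items 6 and 7, the refactor renames the stored map to `_expert_map`, while the diff below keeps the old `expert_map` attribute working through a property/setter pair (see the TODO in the setter). A minimal standalone sketch of that pattern; the class name here is hypothetical:

```python
import torch


class MoELayer:
    """Hypothetical stand-in for AscendFusedMoE, showing only the expert_map shim."""

    def __init__(self) -> None:
        # The map now lives in a private field...
        self._expert_map: torch.Tensor | None = None

    @property
    def expert_map(self) -> torch.Tensor | None:
        # ...but reads through the old public name keep working.
        return self._expert_map

    @expert_map.setter
    def expert_map(self, new_expert_map: torch.Tensor | None) -> None:
        # Writes through the old name (as vLLM v0.12.0 code paths do) are
        # redirected into the private field, so both spellings stay in sync.
        self._expert_map = new_expert_map


layer = MoELayer()
layer.expert_map = torch.tensor([0, -1, 1, -1])  # old-style assignment still works
assert layer.expert_map is layer._expert_map
```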
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
@@ -153,7 +153,7 @@ class AscendFusedMoE(FusedMoE):
         AscendFusedMoE.moe_counter += 1
         self.moe_instance_id = AscendFusedMoE.moe_counter
 
-        self.expert_map = None
+        self._expert_map = None
         self.log2phy = None
 
         if self.quant_config is None:
@@ -184,7 +184,7 @@ class AscendFusedMoE(FusedMoE):
                 dtype=vllm_config.model_config.dtype)
 
         # init moe.
-        self.local_num_experts, self.expert_map, _ = determine_expert_map(
+        self.local_num_experts, self._expert_map, _ = determine_expert_map(
             self.ep_size, self.ep_rank, self.global_num_experts)
         # TODO: Temporary flag to indicate if static EPLB is enabled. This is a
         # workaround to bypass a quantization check that fails with float weights.
@@ -200,7 +200,7 @@ class AscendFusedMoE(FusedMoE):
                 self.expert_load_balancer.get_global_redundant_expert_num())
             self.global_num_experts = num_experts + self.global_redundant_expert_num
             try:
-                self.local_num_experts, self.expert_map = (
+                self.local_num_experts, self._expert_map = (
                     self.expert_load_balancer.get_rank_placement_map(
                         self.moe_instance_id, self.ep_rank))
                 self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
@@ -216,16 +216,16 @@ class AscendFusedMoE(FusedMoE):
             if self.dynamic_eplb:
                 self.log2phy = determine_default_log2phy_map(
                     self.global_num_experts, self.ep_size, self.ep_rank).npu()
-        if self.expert_map is not None and isinstance(self.expert_map,
-                                                       torch.Tensor):
+        if self._expert_map is not None and isinstance(self._expert_map,
+                                                        torch.Tensor):
             logger.info_once(
                 "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
                 " number of experts: %s/%s. Experts local to global index map:"
                 " %s.", self.ep_rank, self.ep_size, self.local_num_experts,
                 self.global_num_experts,
-                get_compressed_expert_map(self.expert_map))
+                get_compressed_expert_map(self._expert_map))
         local_num_experts = (torch.sum(
-            self.expert_map != -1) if self.expert_map is not None else
+            self._expert_map != -1) if self._expert_map is not None else
                              self.global_num_experts)
         if self.dynamic_eplb:
             self.moe_load = torch.zeros(local_num_experts,
@@ -276,10 +276,16 @@ class AscendFusedMoE(FusedMoE):
         return QuantType.NONE
 
     def update_expert_map(self, new_expert_map):
-        self.expert_map = new_expert_map
+        self._expert_map = new_expert_map
 
-    def get_map(self):
-        return self.expert_map
+    @property
+    def expert_map(self) -> torch.Tensor | None:
+        return self._expert_map
+
+    @expert_map.setter
+    def expert_map(self, new_expert_map):
+        # TODO(Potabk): Remove this once we drop vllm v0.12.0(This makes backward compatibility with vllm v0.12.0)
+        self._expert_map = new_expert_map
 
     def get_log2phy_map(self):
         return self.log2phy