[EPLB][Bugfix] Reduce unnecessary video memory usage (#6020)
### What this PR does / why we need it?
1. Incorporate the warm-up of the EPLB into the profile run.
2. Reuse the same gather buffer.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
qwen3-235b aime baseline
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| aime2024 | 604a78 | accuracy | gen | 86.67 |
With EPLB enabled, the OOM issue does not occur.
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| aime2024 | 604a78 | accuracy | gen | 86.67 |
- vLLM version: v0.13.0
- vLLM main:
2c24bc6996
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
This commit is contained in:
@@ -2266,7 +2266,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
is_profile=is_profile)
|
||||
if is_profile and self.dynamic_eplb:
|
||||
self.model.clear_all_moe_loads()
|
||||
if not is_profile and self.dynamic_eplb:
|
||||
if self.dynamic_eplb:
|
||||
self.eplb_updator.take_update_info_from_eplb_process()
|
||||
self.eplb_updator.forward_end()
|
||||
return hidden_states, hidden_states
|
||||
@@ -2293,6 +2293,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
return output
|
||||
|
||||
def profile_run(self) -> None:
|
||||
self.eplb_warmup()
|
||||
mc2_tokens_capacity = get_mc2_tokens_capacity()
|
||||
if self.max_num_tokens > mc2_tokens_capacity and \
|
||||
select_moe_comm_method(mc2_tokens_capacity, self.vllm_config) in {MoECommType.MC2, MoECommType.FUSED_MC2}:
|
||||
|
||||
@@ -366,7 +366,6 @@ class NPUWorker(WorkerBase):
|
||||
|
||||
def compile_or_warm_up_model(self) -> None:
|
||||
# Note: need to adapt for graph mode.
|
||||
self.model_runner.eplb_warmup()
|
||||
warmup_sizes = (self.vllm_config.compilation_config.compile_sizes
|
||||
or []).copy()
|
||||
if not self.model_config.enforce_eager:
|
||||
|
||||
Reference in New Issue
Block a user