[EPLB][Bugfix] Bugfix for ineffective dynamic eplb (#6653)
### What this PR does / why we need it?
#6043 removed the forward_before phase of dynamic EPLB. Because the unit tests only monitor end-to-end precision and no log is printed at the key points, the regression was not intercepted: dynamic EPLB silently stopped taking effect. This PR:
1. Adds the forward_before call back (see the sketch after this list).
2. Deletes unnecessary logs and adds logs at the key points.
3. Adds a warm-up step for policy type 3 (FlashLB).
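
For clarity, a minimal runnable sketch of the hook order this PR restores. `FakeEplbUpdator` and `execute_model_step` are illustrative stand-ins, not the real vllm-ascend classes; only the two method names are taken from the diff below. If `forward_before()` never runs, the planner is never fed load statistics and dynamic EPLB silently no-ops, which is exactly the bug being fixed.

```python
# Illustrative sketch only: FakeEplbUpdator stands in for the real
# EplbUpdator; the two method names come from this PR's diff.

class FakeEplbUpdator:
    def forward_before(self):
        # Real updator: collect expert-load stats and wake the EPLB
        # planner process before the forward pass.
        print("[EPLB] forward_before: collect load stats / wake planner")

    def take_update_info_from_eplb_process(self):
        # Real updator: fetch the planned expert-map update so the
        # D2D weight loader can apply it.
        print("[EPLB] take update info from EPLB process")


def execute_model_step(dynamic_eplb: bool, updator: FakeEplbUpdator) -> None:
    if dynamic_eplb:
        updator.forward_before()  # the call that #6043 dropped
    # ... build inputs and run _preprocess ...
    if dynamic_eplb:
        updator.take_update_info_from_eplb_process()
    # ... run the forward pass ...


execute_model_step(True, FakeEplbUpdator())
```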
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?

#### The conversation is normal.
```text
Okay, the user is asking, "What is deep learning?" I need to explain this in a clear and concise way. Let me start by recalling what I know about deep learning. It's a subset of machine learning, right? So first, I should mention that it's part of machine learning, which itself is a branch of AI. Then, the key aspect of deep learning is the use of neural networks with multiple layers. These are called deep neural networks.

Wait, I should define neural networks first. Maybe start with the basics. A neural network is inspired by the human brain, with layers of nodes (neurons) that process data. But deep learning specifically refers to networks with many layers—hence "deep." So the term "deep" comes from the number of layers.

I should explain how deep learning works. It involves training these networks on large datasets, allowing them to automatically learn features from the data. Unlike traditional machine learning, where you might have to manually extract features, deep learning models can do this automatically. That's a key point. For example, in image recognition, a deep learning model can learn to detect edges, shapes, and then more complex patterns without human intervention.

Applications are important too. The user might want to know where deep learning is used. Common examples include image and speech recognition, natural language processing, autonomous vehicles, and recommendation systems. Maybe mention specific technologies like self-driving cars using computer vision or virtual assistants like Siri or Alexa
```
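
A minimal sketch of this kind of smoke test, assuming a vLLM OpenAI-compatible server is already serving the patched build on localhost:8000; the model name is a placeholder, not something defined by this PR.

```python
# Smoke-test sketch: send one chat request and print the reply.
import json
import urllib.request

payload = {
    "model": "your-model",  # placeholder model name
    "messages": [{"role": "user", "content": "What is deep learning?"}],
}
req = urllib.request.Request(
    "http://localhost:8000/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)

# If the fix works, generation stays normal and the server log shows
# "[EPLB] finished update expert weight." once all layers are updated.
print(body["choices"][0]["message"]["content"])
```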
- vLLM version: v0.15.0
- vLLM main: 13397841ab
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
```diff
@@ -107,10 +107,3 @@ def test_invalid_state_asyn_update(mock_adaptor):
     loader_obj.update_expert_map_and_weight([])
 
     assert not mock_adaptor.do_update_expert_map.called
-
-
-def test_load_impl_not_implemented(mock_adaptor):
-    loader_obj = loader.D2DExpertWeightLoader()
-    loader_obj.set_adator(mock_adaptor)
-    with pytest.raises(NotImplementedError):
-        loader_obj.load_impl({}, {})
@@ -34,7 +34,7 @@ class D2DExpertWeightLoader:
         self.layer_id = -1  # layer id to be updated
         self.state = ExpertWeightUpdateState.WAITING
         self.recv_expert_list = []
         self.mock_flag = True
+        self.num_layers = 0
 
     def set_adator(self, eplb_adaptor):
         self.eplb_adaptor = eplb_adaptor
@@ -103,12 +103,10 @@ class D2DExpertWeightLoader:
             local_expert_to_replace, buffer_tensor_id = recv_expert_info
             self.eplb_adaptor.do_update_expert_weight(self.layer_id, local_expert_to_replace, buffer_tensor_id)
 
+        logger.debug(f"[EPLB] finished update expert weight for layer: {self.layer_id}")
+        if self.layer_id == self.num_layers - 1:
+            logger.info("[EPLB] finished update expert weight.")
 
         self.recv_expert_list = []
         self.updated_expert_map = None
         self.layer_id = -1
         self.state = ExpertWeightUpdateState.WAITING
-
-    def load_impl(self, old_expert_table, new_expert_table):
-        raise NotImplementedError
@@ -68,7 +68,7 @@ class EplbWorker:
 
         update_info = self.compose_expert_update_info_greedy(new_expert_maps, self.old_expert_maps)
         self.old_expert_maps = new_expert_maps
-        logger.info("EPLB Process compute complete")
+        logger.debug("EPLB Process compute complete")
 
         packed_update_info = self.pack_update_info(update_info)
@@ -274,6 +274,10 @@ class EplbProcess:
         Subprocess entry: bind to specified NPU, loop waiting for planner_q to wake up,
         call do_update, then notify main process update is complete.
         """
+        if self.policy_type == 3:
+            from vllm_ascend.eplb.core.policy.policy_flashlb import warm_up
+
+            warm_up()
         while True:
             try:
                 planner_q.get()
@@ -22,11 +22,12 @@ import vllm.envs as envs
 from vllm.logger import logger
 
 from vllm_ascend.eplb.adaptor.vllm_adaptor import VllmEplbAdaptor
+from vllm_ascend.eplb.core.eplb_device_transfer_loader import D2DExpertWeightLoader
 from vllm_ascend.eplb.core.eplb_worker import EplbProcess
 
 
 class EplbUpdator:
-    def __init__(self, eplb_config, loader, eplb_process: EplbProcess, process):
+    def __init__(self, eplb_config, loader: D2DExpertWeightLoader, eplb_process: EplbProcess, process):
         self.eplb_config = eplb_config
         self.init_eplb(self.eplb_config.expert_map_path, process)
         self.eplb_loader = loader
@@ -42,6 +43,7 @@ class EplbUpdator:
         self.device = local_load.device
         shape = (self.world_size, *local_load.shape)
         self._gather_buffer = torch.empty(shape, dtype=local_load.dtype, device=self.device)
+        self.eplb_loader.num_layers = self.adaptor.num_dense_layers + self.adaptor.num_moe_layers
 
     def init_eplb(self, expert_map_path, process):
         self.rank_id = dist.get_rank()
@@ -75,7 +77,6 @@ class EplbUpdator:
         if self.cur_iterations == (
                 self.expert_heat_collection_interval + self.algorithm_execution_interval + self.num_moe_layers
         ):
             logger.info("Finish expert parallel load balancing.")
             if self.expert_map_record_path is not None:
                 self.adaptor._export_tensor_to_file(self.shared_dict["expert_maps"], self.expert_map_record_path)
-
@@ -1099,6 +1099,10 @@ class NPUModelRunner(GPUModelRunner):
                 "logprobs for prompt tokens, tokens, please disable "
                 "it when the requests need prompt logprobs"
             )
 
+        if self.dynamic_eplb:
+            self.eplb_updator.forward_before()
+
         num_reqs = self.input_batch.num_reqs
         req_ids = self.input_batch.req_ids
         tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
@@ -1198,6 +1202,9 @@ class NPUModelRunner(GPUModelRunner):
             ec_connector_output,
         ) = self._preprocess(scheduler_output, num_tokens_padded, intermediate_tensors)
 
+        if self.dynamic_eplb:
+            self.eplb_updator.take_update_info_from_eplb_process()
+
         # update global cos, sin
         update_cos_sin(positions)
@@ -2072,6 +2079,9 @@ class NPUModelRunner(GPUModelRunner):
         assert sum(num_scheduled_tokens_list) == num_tokens
         assert len(num_scheduled_tokens_list) == num_reqs
 
+        if not is_profile and self.dynamic_eplb:
+            self.eplb_updator.forward_before()
+
         num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32)
         self.query_lens = torch.from_numpy(num_scheduled_tokens)
         num_tokens_unpadded = int(num_scheduled_tokens.sum())
```