[Bugfix] Fix kvpool precision synchronization (#4574)
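### What this PR does / why we need it?

Fix kvpool precision synchronization.

Issue: https://github.com/vllm-project/vllm-ascend/issues/4412

Summary of the change:

- Remove the `torch.npu.current_stream().synchronize()` call that preceded each `self.m_store.put(...)` in `KVCacheStoreSendingThread` and `KVCacheStoreLayerSendingThread`.
- In `NPUModelRunner`, move `self.maybe_wait_for_kv_save()` from just before `get_finished_kv_transfer(scheduler_output)` to after `propose_draft_token_ids(valid_sampled_token_ids)`, just before the connector metadata is cleared.

- vLLM version: v0.11.2

---------

Signed-off-by: LCAIZJ <leichao139636@163.com>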
This commit is contained in:
@@ -117,7 +117,6 @@ class KVCacheStoreSendingThread(KVTransferThread):
|
|||||||
addr_list.append(addr)
|
addr_list.append(addr)
|
||||||
size_list.append(size)
|
size_list.append(size)
|
||||||
if self.dcp_size > 1:
|
if self.dcp_size > 1:
|
||||||
torch.npu.current_stream().synchronize()
|
|
||||||
self.m_store.put(key_list, addr_list, size_list)
|
self.m_store.put(key_list, addr_list, size_list)
|
||||||
else:
|
else:
|
||||||
key_list_tp = key_list[self.tp_rank % self.put_step::self.put_step]
|
key_list_tp = key_list[self.tp_rank % self.put_step::self.put_step]
|
||||||
@@ -126,7 +125,6 @@ class KVCacheStoreSendingThread(KVTransferThread):
|
|||||||
size_list_tp = size_list[self.tp_rank %
|
size_list_tp = size_list[self.tp_rank %
|
||||||
self.put_step::self.put_step]
|
self.put_step::self.put_step]
|
||||||
if key_list_tp:
|
if key_list_tp:
|
||||||
torch.npu.current_stream().synchronize()
|
|
||||||
self.m_store.put(key_list_tp, addr_list_tp, size_list_tp)
|
self.m_store.put(key_list_tp, addr_list_tp, size_list_tp)
|
||||||
if is_last_chunk:
|
if is_last_chunk:
|
||||||
self.set_finished_request(req_id)
|
self.set_finished_request(req_id)
|
||||||
@@ -205,7 +203,6 @@ class KVCacheStoreLayerSendingThread(KVTransferThread):
|
|||||||
addr_list.append(addr)
|
addr_list.append(addr)
|
||||||
size_list.append(size)
|
size_list.append(size)
|
||||||
if self.dcp_size > 1:
|
if self.dcp_size > 1:
|
||||||
torch.npu.current_stream().synchronize()
|
|
||||||
self.m_store.put(key_list, addr_list, size_list)
|
self.m_store.put(key_list, addr_list, size_list)
|
||||||
else:
|
else:
|
||||||
key_list_tp = key_list[self.tp_rank % self.put_step::self.put_step]
|
key_list_tp = key_list[self.tp_rank % self.put_step::self.put_step]
|
||||||
@@ -214,7 +211,6 @@ class KVCacheStoreLayerSendingThread(KVTransferThread):
|
|||||||
size_list_tp = size_list[self.tp_rank %
|
size_list_tp = size_list[self.tp_rank %
|
||||||
self.put_step::self.put_step]
|
self.put_step::self.put_step]
|
||||||
if key_list_tp:
|
if key_list_tp:
|
||||||
torch.npu.current_stream().synchronize()
|
|
||||||
self.m_store.put(key_list_tp, addr_list_tp, size_list_tp)
|
self.m_store.put(key_list_tp, addr_list_tp, size_list_tp)
|
||||||
if req_meta.layer_id == self.final_layer_id and req_meta.is_last_chunk:
|
if req_meta.layer_id == self.final_layer_id and req_meta.is_last_chunk:
|
||||||
self.set_finished_request(req_meta.req_id)
|
self.set_finished_request(req_meta.req_id)
|
||||||
|
|||||||
@@ -2339,7 +2339,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
attn_metadata, self.with_prefill, maybe_padded_num_tokens,
|
attn_metadata, self.with_prefill, maybe_padded_num_tokens,
|
||||||
input_ids, positions, intermediate_tensors, inputs_embeds)
|
input_ids, positions, intermediate_tensors, inputs_embeds)
|
||||||
|
|
||||||
self.maybe_wait_for_kv_save()
|
|
||||||
finished_sending, finished_recving = self.get_finished_kv_transfer(
|
finished_sending, finished_recving = self.get_finished_kv_transfer(
|
||||||
scheduler_output)
|
scheduler_output)
|
||||||
|
|
||||||
@@ -2603,7 +2602,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
# ngram and other speculative decoding methods use the sampled
|
# ngram and other speculative decoding methods use the sampled
|
||||||
# tokens on the CPU, so they are run after bookkeeping.
|
# tokens on the CPU, so they are run after bookkeeping.
|
||||||
propose_draft_token_ids(valid_sampled_token_ids)
|
propose_draft_token_ids(valid_sampled_token_ids)
|
||||||
|
self.maybe_wait_for_kv_save()
|
||||||
if has_kv_transfer_group():
|
if has_kv_transfer_group():
|
||||||
get_kv_transfer_group().clear_connector_metadata()
|
get_kv_transfer_group().clear_connector_metadata()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user