remove useless patch (#4699)
patch_config is useless now. Let's remove it.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -317,7 +317,7 @@ class AscendMLATorchairMetadataBuilder:
|
||||
dtype=self.model_config.dtype,
|
||||
device=device)
|
||||
if self.vllm_config.speculative_config is not None and\
|
||||
- self.vllm_config.speculative_config.method == 'deepseek_mtp':
|
||||
+ self.vllm_config.speculative_config.method == 'mtp':
|
||||
attn_state = AscendAttentionState.SpecDecoding
|
||||
num_decode_tokens = 2
|
||||
else:
|
||||
|
||||
@@ -501,7 +501,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
|
||||
def update_torchair_graph_batch_sizes(self):
|
||||
# return graph_batch_sizes according to the max number of tokens
|
||||
# first pad according to the number of requests
|
||||
- if self.is_kv_consumer and self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
|
||||
+ if self.is_kv_consumer and self.speculative_config and self.speculative_config.method == 'mtp':
|
||||
# pd disaggregation scenario may incorrectly calculate the batch in mtp scenario, so we force set it to max_num_reqs
|
||||
self.torchair_graph_batch_sizes = [self.max_num_reqs]
|
||||
logger.warning(
|
||||
|
||||
@@ -319,7 +319,7 @@ class AscendSFATorchairMetadataBuilder:
|
||||
device=device)
|
||||
|
||||
if self.vllm_config.speculative_config is not None and\
|
||||
- self.vllm_config.speculative_config.method == 'deepseek_mtp':
|
||||
+ self.vllm_config.speculative_config.method == 'mtp':
|
||||
attn_state = AscendAttentionState.SpecDecoding
|
||||
num_decode_tokens = 2
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user