Upgrade to vLLM 0.17.0 (corex v4.1 overlay)
@@ -274,7 +274,6 @@ class Qwen3_5DecoderLayer(Qwen3NextDecoderLayer):
                1,
                1,
                config.hidden_size,
                dtype=config.dtype,
            ),
        )
        self.ffn_layer_scale = torch.nn.Parameter(
@@ -282,7 +281,6 @@ class Qwen3_5DecoderLayer(Qwen3NextDecoderLayer):
                1,
                1,
                config.hidden_size,
                dtype=config.dtype,
            ),
        )

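The two hunks above touch the zero-initialized layer-scale parameters of the decoder layer. Below is a minimal, self-contained sketch of that pattern, assuming a (1, 1, hidden_size) shape as shown in the hunks; the class and argument names (ScaledResidualBlock, attn_out, ffn_out) are illustrative, not taken from the patch.

# Sketch of the layer-scale pattern: zero-initialized per-channel parameters
# that scale the attention and FFN residual branches. Not the actual Qwen3.5
# implementation, just an illustration of the technique.
import torch

class ScaledResidualBlock(torch.nn.Module):
    def __init__(self, hidden_size: int, dtype: torch.dtype = torch.float32):
        super().__init__()
        # Broadcasts over (batch, seq_len, hidden_size); starts at zero so the
        # scaled branch contributes nothing until trained or loaded weights set it.
        self.attn_layer_scale = torch.nn.Parameter(
            torch.zeros(1, 1, hidden_size, dtype=dtype)
        )
        self.ffn_layer_scale = torch.nn.Parameter(
            torch.zeros(1, 1, hidden_size, dtype=dtype)
        )

    def forward(self, hidden_states, attn_out, ffn_out):
        hidden_states = hidden_states + self.attn_layer_scale * attn_out
        hidden_states = hidden_states + self.ffn_layer_scale * ffn_out
        return hidden_states

if __name__ == "__main__":
    block = ScaledResidualBlock(hidden_size=8)
    x = torch.randn(2, 4, 8)
    out = block(x, torch.randn(2, 4, 8), torch.randn(2, 4, 8))
    print(out.shape)  # torch.Size([2, 4, 8])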
@@ -444,11 +442,7 @@ class Qwen3_5Model(Qwen3NextModel):
                # qwen3.5 no need to transpose
                # loaded_weight = loaded_weight.transpose(-1, -2)
                if "experts.gate_up_proj" in name:
                    if loaded_weight.shape[-2] != 1:
                        chunk_dim = -2
                    else:
                        chunk_dim = -1
                    loaded_weight = loaded_weight.chunk(2, dim=chunk_dim)
                    loaded_weight = loaded_weight.chunk(2, dim=-2)
                    success_w1 = self.load_fused_expert_weights(
                        name_mapped,
                        params_dict,
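This hunk simplifies how a fused "gate_up_proj" expert tensor is split before being handed to the fused expert loader: the conditional chunk dimension is replaced with a fixed chunk along dim=-2. A standalone sketch of that split, with illustrative shapes and variable names not taken from the patch:

# Sketch: a checkpoint stores gate_proj and up_proj stacked in one
# gate_up_proj tensor; the loader chunks it in two along dim=-2.
import torch

num_experts, intermediate_size, hidden_size = 4, 16, 8

# Fused checkpoint tensor: gate and up projections stacked on dim -2.
gate_up_proj = torch.randn(num_experts, 2 * intermediate_size, hidden_size)

# Equivalent of loaded_weight.chunk(2, dim=-2) in the patched loader.
w1, w3 = gate_up_proj.chunk(2, dim=-2)

assert w1.shape == (num_experts, intermediate_size, hidden_size)
assert w3.shape == (num_experts, intermediate_size, hidden_size)
print(w1.shape, w3.shape)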
@@ -544,6 +538,7 @@ class Qwen3_5ForCausalLMBase(
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config
        cache_config = vllm_config.cache_config

        scheduler_config = vllm_config.scheduler_config
        if cache_config.mamba_cache_mode == "all":
            raise NotImplementedError(
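The hunk above adds an init-time guard that rejects an unsupported mamba cache mode up front rather than failing later during execution. A small sketch of that guard pattern, using a stand-in config class rather than vLLM's actual dataclasses:

# Sketch of an init-time config guard; CacheConfig here is a stand-in.
from dataclasses import dataclass

@dataclass
class CacheConfig:
    mamba_cache_mode: str = "default"

def check_cache_config(cache_config: CacheConfig) -> None:
    # Reject the unsupported mode before any model state is built.
    if cache_config.mamba_cache_mode == "all":
        raise NotImplementedError(
            "mamba_cache_mode='all' is not supported for this hybrid model."
        )

check_cache_config(CacheConfig())  # fine
try:
    check_cache_config(CacheConfig(mamba_cache_mode="all"))
except NotImplementedError as exc:
    print(exc)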
@@ -633,6 +628,9 @@ class Qwen3_5MoeForCausalLM(Qwen3_5ForCausalLMBase, QwenNextMixtureOfExperts):
    dummy_inputs=Qwen3VLDummyInputsBuilder,
)
class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid):
    # Qwen3.5 does not support multimodal pruning (EVS).
    supports_multimodal_pruning = False

    packed_modules_mapping = Qwen3VLForConditionalGeneration.packed_modules_mapping | {
        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
        "in_proj_ba": ["in_proj_b", "in_proj_a"],
    }
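The hunk above extends the parent class's packed_modules_mapping with the dict-union operator, mapping the fused in_proj_qkvz / in_proj_ba parameters to the sub-projections they pack. A sketch of that merge, where the base mapping is a stand-in for the one inherited from Qwen3VLForConditionalGeneration:

# Sketch: extend a packed-module mapping with | (requires Python 3.9+).
base_packed_modules_mapping = {
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],      # illustrative base entries
    "gate_up_proj": ["gate_proj", "up_proj"],
}

packed_modules_mapping = base_packed_modules_mapping | {
    "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
    "in_proj_ba": ["in_proj_b", "in_proj_a"],
}

for fused_name, parts in packed_modules_mapping.items():
    print(f"{fused_name} packs {parts}")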
@@ -648,10 +646,8 @@ class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid)
        self.config = config
        self.multimodal_config = multimodal_config
        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
        self.video_pruning_rate = multimodal_config.video_pruning_rate
        self.is_multimodal_pruning_enabled = (
            multimodal_config.is_multimodal_pruning_enabled()
        )
        # Qwen3.5 does not support multimodal pruning (EVS).
        self.is_multimodal_pruning_enabled = False

        with self._mark_tower_model(vllm_config, {"image", "video"}):
            self.visual = Qwen3_VisionTransformer(
@@ -698,6 +694,12 @@ class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid)

        return inputs_embeds

    def recompute_mrope_positions(self, *args, **kwargs):
        raise NotImplementedError(
            "Qwen3.5 does not support multimodal pruning (EVS). "
            "recompute_mrope_positions should never be called."
        )

    def forward(
        self,
        input_ids: torch.Tensor,
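The hunk above stubs out the method that only the disabled pruning path would call, so any accidental use fails loudly instead of producing silently wrong positions. A sketch of that defensive-override pattern, with illustrative class names not taken from the patch:

# Sketch: override an unsupported code path so it raises instead of running.
class BaseMultimodalModel:
    supports_multimodal_pruning = True

    def recompute_mrope_positions(self, *args, **kwargs):
        return "recomputed positions"

class NoPruningModel(BaseMultimodalModel):
    supports_multimodal_pruning = False

    def recompute_mrope_positions(self, *args, **kwargs):
        raise NotImplementedError(
            "Multimodal pruning (EVS) is not supported by this model; "
            "recompute_mrope_positions should never be called."
        )

try:
    NoPruningModel().recompute_mrope_positions()
except NotImplementedError as exc:
    print(exc)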
@@ -856,10 +858,8 @@ class Qwen3_5MoeForConditionalGeneration(
        self.config = config
        self.multimodal_config = multimodal_config
        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
        self.video_pruning_rate = multimodal_config.video_pruning_rate
        self.is_multimodal_pruning_enabled = (
            multimodal_config.is_multimodal_pruning_enabled()
        )
        # Qwen3.5 does not support multimodal pruning (EVS).
        self.is_multimodal_pruning_enabled = False

        with self._mark_tower_model(vllm_config, {"image", "video"}):
            self.visual = Qwen3_VisionTransformer(