diff --git a/vllm_ascend/ops/triton/fused_gdn_gating.py b/vllm_ascend/ops/triton/fused_gdn_gating.py
index b3b05706..9c3ea9d1 100644
--- a/vllm_ascend/ops/triton/fused_gdn_gating.py
+++ b/vllm_ascend/ops/triton/fused_gdn_gating.py
@@ -71,6 +71,8 @@ def fused_gdn_gating_patch(
     BLK_HEADS = 8
     COL_ITER = triton.cdiv(num_heads, BLK_HEADS)
+    elem_size = a.element_size()
+    max_ub_batches = int((UNIFIED_BUFFER_SIZE * 0.95) / (BLK_HEADS * elem_size))

     if batch <= num_cores:
         progs = batch
         BLK_BATCHES = 1
@@ -78,10 +80,11 @@ def fused_gdn_gating_patch(
     else:
         progs = num_cores
         FACTOR = 8 * num_heads
-        row_per_core = triton.cdiv(batch, num_cores)
-        BLK_BATCHES = (
-            triton.next_power_of_2(triton.cdiv(UNIFIED_BUFFER_SIZE, FACTOR * BLK_HEADS) // a.element_size()) // 2
+        calc_blk_batches = (
+            triton.next_power_of_2(triton.cdiv(int(UNIFIED_BUFFER_SIZE * 0.95), FACTOR * BLK_HEADS * elem_size)) // 2
         )
+        BLK_BATCHES = max(1, min(calc_blk_batches, max_ub_batches, 64))
+        row_per_core = triton.cdiv(batch, progs)

     ROW_ITER = triton.cdiv(row_per_core, BLK_BATCHES)
     g = torch.empty(1, batch, num_heads, dtype=torch.float32, device=a.device)
diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py
index 3e3b308a..c769bfa0 100644
--- a/vllm_ascend/quantization/modelslim_config.py
+++ b/vllm_ascend/quantization/modelslim_config.py
@@ -87,6 +87,16 @@ QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
         "language_model.lm_head.": "lm_head.",
         "language_model.model.": "model.language_model.",
     },
+    "qwen3_5": {
+        "visual.": "model.visual.",
+        "language_model.lm_head.": "lm_head.",
+        "language_model.model.": "model.language_model.",
+    },
+    "qwen3_5_moe": {
+        "visual.": "model.visual.",
+        "language_model.lm_head.": "lm_head.",
+        "language_model.model.": "model.language_model.",
+    },
 }
 
 # key: model_type
@@ -104,6 +114,19 @@ packed_modules_model_mapping: dict[str, dict[str, list[str]]] = {
         ],
         "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
     },
+    "qwen3_5": {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
+        "in_proj_ba": ["in_proj_b", "in_proj_a"],
+    },
+    "qwen3_5_moe": {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
+        "in_proj_ba": ["in_proj_b", "in_proj_a"],
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+    },
     "deepseek_v2": {
         "gate_up_proj": ["gate_proj", "up_proj"],
         "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],