[BugFix] Fix Qwen3-Next because of vllm #24982 (#3221)

- Fixes the Qwen3-Next breakage introduced by vllm-project/vllm#24982

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
```
def main():
    """Smoke-test Qwen3-Next generation over a handful of mixed-language prompts."""
    test_prompts = [
        "窗前明月光,",
        "The president of the United States is Mr.",
        "The capital of France is",
        "The future of AI is",
        "感时花溅泪,",
        "家书抵万金啥意思?",
        "plz tell me a story: ",
    ]

    # One shared decoding configuration for every prompt.
    params = SamplingParams(max_tokens=100, temperature=0.6, top_k=40, top_p=0.95)

    # Build the engine: eager mode, 4-way tensor parallelism, small context.
    engine = LLM(
        model="Qwen/Qwen3-Next-80B-A3B-Instruct",
        tensor_parallel_size=4,
        enforce_eager=True,
        trust_remote_code=True,
        max_model_len=256,
        gpu_memory_utilization=0.7,
        block_size=64,
    )

    # Generate and echo each prompt/completion pair.
    for result in engine.generate(test_prompts, params):
        prompt = result.prompt
        generated_text = result.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```


- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: Icey <1790571317@qq.com>
This commit is contained in:
Icey
2025-09-29 15:27:30 +08:00
committed by GitHub
parent c73dd8fecb
commit 83092d9b8b

View File

@@ -51,13 +51,10 @@ from vllm.model_executor.utils import set_weight_attrs
from vllm.transformers_utils.configs import Qwen3NextConfig
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
from vllm.model_executor.models.qwen3_next import Qwen3NextAttention # isort: skip
from vllm.model_executor.models.qwen3_next import Qwen3NextDecoderLayer # isort: skip
from vllm.model_executor.models.qwen3_next import Qwen3NextForCausalLM # isort: skip
from vllm.model_executor.models.qwen3_next import Qwen3NextGatedDeltaNet # isort: skip
from vllm.model_executor.models.qwen3_next import Qwen3NextModel # isort: skip
from vllm.model_executor.models.qwen3_next import Qwen3NextSparseMoeBlock # isort: skip
from vllm.model_executor.models.qwen3_next import fused_gdn_gating # isort: skip
from vllm.model_executor.models.qwen3_next import ( # isort: skip
Qwen3NextAttention, Qwen3NextDecoderLayer, Qwen3NextForCausalLM,
Qwen3NextGatedDeltaNet, Qwen3NextModel, Qwen3NextSparseMoeBlock,
fused_gdn_gating)
class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase):
@@ -429,17 +426,16 @@ class CustomQwen3NextDecoderLayer(Qwen3NextDecoderLayer):
def __init__(
self,
config: Qwen3NextConfig,
vllm_config: VllmConfig,
layer_type: str,
model_config: Optional[ModelConfig] = None,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
speculative_config: Optional[SpeculativeConfig] = None,
prefix: str = "",
enable_eplb: bool = False,
) -> None:
nn.Module.__init__(self)
self.config = config
config = vllm_config.model_config.hf_config
model_config = vllm_config.model_config
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
speculative_config = vllm_config.speculative_config
self.layer_type = layer_type
self.layer_idx = extract_layer_index(prefix)
@@ -468,12 +464,8 @@ class CustomQwen3NextDecoderLayer(Qwen3NextDecoderLayer):
if (self.layer_idx not in mlp_only_layers) and (
config.num_experts > 0 and
(self.layer_idx + 1) % config.decoder_sparse_step == 0):
self.mlp = Qwen3NextSparseMoeBlock(
config=config,
quant_config=quant_config,
prefix=f"{prefix}.mlp",
enable_eplb=enable_eplb,
)
self.mlp = Qwen3NextSparseMoeBlock(vllm_config=vllm_config,
prefix=f"{prefix}.mlp")
else:
self.mlp = Qwen3NextMLP(
hidden_size=config.hidden_size,
@@ -493,14 +485,14 @@ class CustomQwen3NextDecoderLayer(Qwen3NextDecoderLayer):
torch.zeros(
1,
1,
self.config.hidden_size,
config.hidden_size,
dtype=config.torch_dtype,
), )
self.ffn_layer_scale = torch.nn.Parameter(
torch.zeros(
1,
1,
self.config.hidden_size,
config.hidden_size,
dtype=config.torch_dtype,
), )
@@ -511,13 +503,8 @@ class CustomQwen3NextModel(Qwen3NextModel):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
nn.Module.__init__(self)
config: Qwen3NextConfig = vllm_config.model_config.hf_config
model_config = vllm_config.model_config
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
parallel_config = vllm_config.parallel_config
lora_config = vllm_config.lora_config
speculative_config = vllm_config.speculative_config
enable_eplb = parallel_config.enable_eplb
eplb_config = parallel_config.eplb_config
self.num_redundant_experts = eplb_config.num_redundant_experts
@@ -534,14 +521,9 @@ class CustomQwen3NextModel(Qwen3NextModel):
def get_layer(prefix: str):
return CustomQwen3NextDecoderLayer(
config,
vllm_config,
layer_type=config.layer_types[extract_layer_index(prefix)],
model_config=model_config,
cache_config=cache_config,
quant_config=quant_config,
speculative_config=speculative_config,
prefix=prefix,
enable_eplb=enable_eplb,
)
self.start_layer, self.end_layer, self.layers = make_layers(