Some modifications to ensure 50K-context input works

2026-06-04 17:56:29 +08:00
parent 1c33ef1355
commit 8c047a70ea
3 changed files with 150 additions and 0 deletions
--- a/qwen3_6_scripts/qwen3_5.py
+++ b/qwen3_6_scripts/qwen3_5.py
@@ -412,6 +412,9 @@ class GatedDeltaNet(nn.Module):

        else:
            # Decode: one token per sequence
+            with open("/tmp/vllm_decode_debug.log", "a") as _f:
+                _f.write(f"[deltanet decode] layer={self.layer_idx} num_seqs={hidden_states.shape[0]}\n")
+                _f.flush()
            num_seqs = hidden_states.shape[0]
            weight_2d = self.conv1d_weight.squeeze(1)

@@ -847,6 +850,12 @@ class Qwen3_5ForCausalLM(nn.Module, HasInnerState, SupportsLoRA):
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
+        # Non-driver TP ranks have seq_groups=None in sampling_metadata (normal
+        # TP behavior); they must still call logits_processor to participate in
+        # the NCCL gather inside lm_head. logits_processor returns None for
+        # non-driver ranks after the gather, safely skipping _apply_logits_processors.
+        # Rank 0 (driver) always has seq_groups != None, provided that
+        # --max-num-batched-tokens >= --max-model-len (i.e. no chunked-prefill splits).
        return self.logits_processor(self.lm_head, hidden_states,
                                     sampling_metadata)