testing dynamic register

2026-02-06 14:17:06 +08:00
parent fba02652c8
commit b702adf015
2 changed files with 93 additions and 186 deletions
--- a/vllm-v0.6.2/vllm/model_executor/models/transformers/base.py
+++ b/vllm-v0.6.2/vllm/model_executor/models/transformers/base.py
@@ -22,6 +22,7 @@ from vllm.config import VllmConfig
 from vllm.distributed import get_pp_group, get_tp_group
 from vllm.distributed.utils import get_pp_indices
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.models.utils import (
    AutoWeightsLoader,
@@ -251,17 +252,6 @@ class Base(nn.Module):
        logger.info("Creating model structure on meta device...")
        # DEBUG: Print config info before any modifications
        logger.info("DEBUG: Config type: %s", type(self.config).__name__)
        logger.info("DEBUG: text_config type: %s", type(self.text_config).__name__)
        logger.info("DEBUG: hidden_size=%s, num_attention_heads=%s",
                   getattr(self.text_config, 'hidden_size', 'N/A'),
                   getattr(self.text_config, 'num_attention_heads', 'N/A'))
        logger.info("DEBUG: config.head_dim=%s (before fix)",
                   getattr(self.config, 'head_dim', 'NOT SET'))
        logger.info("DEBUG: text_config.head_dim=%s (before fix)",
                   getattr(self.text_config, 'head_dim', 'NOT SET'))
        # Set attention implementation to vLLM's
        self.text_config._attn_implementation = "vllm"
@@ -270,7 +260,6 @@ class Base(nn.Module):
        # Some models may have incorrect head_dim, so we compute and set it
        if hasattr(self.text_config, "num_attention_heads") and hasattr(self.text_config, "hidden_size"):
            correct_head_dim = self.text_config.hidden_size // self.text_config.num_attention_heads
            logger.info("DEBUG: Computed correct_head_dim = %d", correct_head_dim)
            # Check and fix head_dim in text_config
            if hasattr(self.text_config, "head_dim"):
@@ -297,11 +286,6 @@ class Base(nn.Module):
            # Some models also need _attn_implementation in config
            self.config._attn_implementation = "vllm"
        logger.info("DEBUG: config.head_dim=%s (after fix)",
                   getattr(self.config, 'head_dim', 'NOT SET'))
        logger.info("DEBUG: text_config.head_dim=%s (after fix)",
                   getattr(self.text_config, 'head_dim', 'NOT SET'))
        with init_on_device_without_buffers("meta"):
            self.model: "PreTrainedModel" = AutoModel.from_config(
                self.config,
@@ -461,7 +445,8 @@ class Base(nn.Module):
                    )
                    replaced_count += 1
-                elif child.__class__.__name__.endswith("RMSNorm"):
+                elif child.__class__.__name__.endswith("RMSNorm") and \
                        not isinstance(child, RMSNorm):
                    new_module = replace_rms_norm_class(child, self.hidden_size)
                    replaced_count += 1
@@ -475,64 +460,8 @@ class Base(nn.Module):
        logger.info("Replaced %d modules", replaced_count)
    def _add_attention_debug_hook(self) -> None:
-        """Add debug hooks to capture actual tensor shapes during forward."""
+        """No-op. Debug hooks removed after root cause identified."""
-        # Monkey-patch apply_rotary_pos_emb in the transformers module
+        pass
        try:
            import transformers.models.qwen2.modeling_qwen2 as qwen2_module
            original_apply_rotary = qwen2_module.apply_rotary_pos_emb
            def _debug_apply_rotary(q, k, cos, sin, unsqueeze_dim=1):
                logger.info("DEBUG ROTARY: q.shape=%s, k.shape=%s, cos.shape=%s, sin.shape=%s",
                           q.shape, k.shape, cos.shape, sin.shape)
                # After unsqueeze
                cos_unsqueezed = cos.unsqueeze(unsqueeze_dim)
                sin_unsqueezed = sin.unsqueeze(unsqueeze_dim)
                logger.info("DEBUG ROTARY: after unsqueeze(%d): cos.shape=%s, sin.shape=%s",
                           unsqueeze_dim, cos_unsqueezed.shape, sin_unsqueezed.shape)
                logger.info("DEBUG ROTARY: q dim 3 = %d, cos dim 3 = %d",
                           q.shape[3] if q.dim() >= 4 else -1,
                           cos_unsqueezed.shape[3] if cos_unsqueezed.dim() >= 4 else -1)
                return original_apply_rotary(q, k, cos, sin, unsqueeze_dim)
            qwen2_module.apply_rotary_pos_emb = _debug_apply_rotary
            logger.info("DEBUG: Patched apply_rotary_pos_emb for debugging")
        except Exception as e:
            logger.warning("DEBUG: Failed to patch apply_rotary_pos_emb: %s", e)
        # Also add a forward pre-hook with kwargs support
        for name, module in self.model.named_modules():
            if "Attention" in module.__class__.__name__:
                def _debug_hook(mod, args, kwargs):
                    hidden = kwargs.get('hidden_states', args[0] if args else None)
                    if hidden is not None:
                        logger.info("DEBUG HOOK: Attention input hidden_states.shape=%s", hidden.shape)
                        logger.info("DEBUG HOOK: mod.head_dim=%s (at forward time)", getattr(mod, 'head_dim', 'NOT SET'))
                        # Check mod.config.head_dim 
                        mod_config = getattr(mod, 'config', None)
                        if mod_config:
                            logger.info("DEBUG HOOK: mod.config.head_dim=%s", getattr(mod_config, 'head_dim', 'NOT SET'))
                            logger.info("DEBUG HOOK: mod.config id=%d, same as self.config=%s",
                                       id(mod_config), id(mod_config) == id(mod_config))
                        # Try q_proj
                        q_proj = getattr(mod, 'q_proj', None)
                        if q_proj is not None:
                            try:
                                q_out = q_proj(hidden)
                                logger.info("DEBUG HOOK: q_proj output shape=%s", q_out.shape)
                                head_dim = getattr(mod, 'head_dim', 128)
                                input_shape = hidden.shape[:-1]
                                hidden_shape = (*input_shape, -1, head_dim)
                                logger.info("DEBUG HOOK: view target shape=%s", hidden_shape)
                                q_viewed = q_out.view(hidden_shape)
                                logger.info("DEBUG HOOK: q_proj viewed shape=%s", q_viewed.shape)
                                q_transposed = q_viewed.transpose(1, 2)
                                logger.info("DEBUG HOOK: q_proj transposed shape=%s", q_transposed.shape)
                            except Exception as e:
                                logger.info("DEBUG HOOK: Error: %s", e)
                module.register_forward_pre_hook(_debug_hook, with_kwargs=True)
                logger.info("DEBUG: Added debug hook (with_kwargs) to %s", name)
                break
    def _fix_attention_head_dim(self) -> None:
        """
@@ -546,50 +475,36 @@ class Base(nn.Module):
        correct_head_dim = self.hidden_size // getattr(
            self.text_config, "num_attention_heads", 32
        )
        logger.info("DEBUG: _fix_attention_head_dim called, correct_head_dim=%d", correct_head_dim)
        fixed_count = 0
        attention_modules_found = []
        rotary_modules_fixed = []
        for name, module in self.model.named_modules():
            module_name = module.__class__.__name__
            # Fix head_dim in Attention modules
            if "Attention" in module_name:
                current_head_dim = getattr(module, 'head_dim', 'NOT SET')
                num_heads = getattr(module, 'num_heads', 'NOT SET')
                num_kv_heads = getattr(module, 'num_key_value_heads', 'NOT SET')
                attention_modules_found.append(
                    f"{name}: head_dim={current_head_dim}, num_heads={num_heads}, num_kv_heads={num_kv_heads}"
                )
                # Fix head_dim if it exists and is incorrect
                if hasattr(module, "head_dim"):
                    if module.head_dim != correct_head_dim:
                        logger.warning(
-                            "DEBUG: Fixing head_dim in %s: %d -> %d",
+                            "Fixing head_dim in %s: %d -> %d",
                            name, module.head_dim, correct_head_dim
                        )
                        module.head_dim = correct_head_dim
                        fixed_count += 1
-            # Fix rotary embeddings - need to recreate inv_freq buffer
+            # Fix rotary embeddings - recreate inv_freq buffer if needed
            if "RotaryEmbedding" in module_name:
                # Check if rotary embedding has wrong dimension
                if hasattr(module, "inv_freq"):
-                    current_dim = module.inv_freq.shape[0] * 2  # inv_freq is half the dim
+                    current_dim = module.inv_freq.shape[0] * 2
                    if current_dim != correct_head_dim:
                        logger.warning(
-                            "DEBUG: Recreating rotary embedding %s: dim %d -> %d",
+                            "Recreating rotary embedding %s: dim %d -> %d",
                            name, current_dim, correct_head_dim
                        )
                        # Recreate inv_freq with correct dimension
                        base = getattr(module.config, 'rope_theta', 10000.0)
                        if hasattr(module.config, 'rope_parameters'):
                            base = module.config.rope_parameters.get('rope_theta', base)
                        device = module.inv_freq.device
                        # Create new inv_freq
                        inv_freq = 1.0 / (
                            base ** (
                                torch.arange(0, correct_head_dim, 2, dtype=torch.int64)
@@ -599,22 +514,9 @@ class Base(nn.Module):
                        module.register_buffer("inv_freq", inv_freq, persistent=False)
                        if hasattr(module, "original_inv_freq"):
                            module.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
                        rotary_modules_fixed.append(name)
        # Print debug info
        if attention_modules_found:
            logger.info("DEBUG: Found %d attention modules", len(attention_modules_found))
            for info in attention_modules_found[:3]:
                logger.info("DEBUG: Attention module: %s", info)
        if rotary_modules_fixed:
            logger.info("DEBUG: Fixed %d rotary embedding modules: %s",
                       len(rotary_modules_fixed), rotary_modules_fixed)
        if fixed_count > 0:
            logger.info("Fixed head_dim in %d attention modules", fixed_count)
        else:
            logger.info("DEBUG: No attention modules needed head_dim fix")
    def _replace_input_embeddings(self) -> None:
        """Replace input embeddings with VocabParallelEmbedding."""
@@ -758,80 +660,6 @@ class Base(nn.Module):
            # Forward through model
            # Note: return_dict=False returns tuple, first element is last hidden state
            # DEBUG: Print detailed model structure info before forward
            if not hasattr(self, '_debug_printed'):
                self._debug_printed = True
                logger.info("DEBUG: === Detailed model structure debug ===")
                # Print transformers version
                try:
                    import transformers
                    logger.info("DEBUG: transformers version: %s", transformers.__version__)
                except Exception:
                    pass
                # Print TP world size
                logger.info("DEBUG: TP world_size=%d", self.tp_group.world_size)
                # Print first attention module details
                for name, module in self.model.named_modules():
                    if "Attention" in module.__class__.__name__:
                        logger.info("DEBUG: First attention: %s (class=%s)", name, module.__class__.__name__)
                        # Print all attributes
                        for attr in ['head_dim', 'num_heads', 'num_key_value_heads', 
                                    'hidden_size', 'num_attention_heads',
                                    'num_key_value_groups']:
                            val = getattr(module, attr, 'NOT SET')
                            logger.info("DEBUG:   %s = %s", attr, val)
                        # Print rotary_emb
                        rotary = getattr(module, 'rotary_emb', None)
                        if rotary:
                            logger.info("DEBUG:   rotary_emb: %s", type(rotary).__name__)
                            if hasattr(rotary, 'inv_freq'):
                                logger.info("DEBUG:   rotary_emb.inv_freq.shape: %s", rotary.inv_freq.shape)
                        else:
                            logger.info("DEBUG:   rotary_emb: None")
                        # Print projection shapes
                        for proj_name in ['q_proj', 'k_proj', 'v_proj', 'o_proj']:
                            proj = getattr(module, proj_name, None)
                            if proj:
                                if hasattr(proj, 'weight'):
                                    logger.info("DEBUG:   %s: type=%s, weight.shape=%s", 
                                              proj_name, type(proj).__name__, 
                                              proj.weight.shape if proj.weight is not None else 'None')
                                elif hasattr(proj, 'output_size'):
                                    logger.info("DEBUG:   %s: type=%s, in=%s, out=%s, out_per_part=%s", 
                                              proj_name, type(proj).__name__,
                                              getattr(proj, 'input_size', 'N/A'),
                                              getattr(proj, 'output_size', 'N/A'),
                                              getattr(proj, 'output_size_per_partition', 'N/A'))
                        break
                # Print model-level rotary_emb
                model_rotary = getattr(self.model, 'rotary_emb', None)
                if model_rotary:
                    logger.info("DEBUG: Model-level rotary_emb: %s", type(model_rotary).__name__)
                    if hasattr(model_rotary, 'inv_freq'):
                        logger.info("DEBUG: Model rotary_emb.inv_freq.shape: %s", model_rotary.inv_freq.shape)
                else:
                    logger.info("DEBUG: No model-level rotary_emb")
                    # Check nested
                    for name, module in self.model.named_modules():
                        if "RotaryEmbedding" in module.__class__.__name__:
                            inv_freq_shape = module.inv_freq.shape if hasattr(module, 'inv_freq') else 'N/A'
                            logger.info("DEBUG: Found rotary at %s: inv_freq.shape=%s", name, inv_freq_shape)
                            break
                # Print config details
                for attr in ['head_dim', 'hidden_size', 'num_attention_heads', 'num_key_value_heads',
                            'intermediate_size', 'num_hidden_layers']:
                    logger.info("DEBUG: config.%s = %s", attr, getattr(self.config, attr, 'NOT SET'))
                logger.info("DEBUG: === End debug ===")
            with torch.no_grad():
                outputs = self.model(
                    **model_inputs,
--- a/vllm-v0.6.2/vllm/model_executor/models/transformers/utils.py
+++ b/vllm-v0.6.2/vllm/model_executor/models/transformers/utils.py
@@ -8,7 +8,7 @@ module replacement functions.
 """
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Literal, Optional, Union
+from typing import TYPE_CHECKING, Literal, Optional, Tuple, Union
 import torch
 import torch.nn as nn
@@ -123,23 +123,102 @@ def replace_linear_class(
    )
 class TransformersRMSNorm(RMSNorm):
    """
    RMSNorm variant whose outputs keep the shape of the incoming tensor.

    Per the original author's note, vLLM's RMSNorm (the MLU backend in
    particular) may collapse the input to 2D, e.g.
    [batch, seq, hidden] -> [batch*seq, hidden], whereas the transformers
    modeling code expects the leading dimensions to survive. Every
    backend-specific forward below therefore records ``x.shape`` first,
    delegates to the parent implementation, and views the result back to
    the recorded shape when it differs.

    The class subclasses RMSNorm rather than wrapping it, so no extra
    module level is introduced and parameter names (e.g. the weight path
    seen by named_parameters()) are the ones the parent defines.
    """
    def forward_native(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ):
        # Record the shape before delegating to the parent implementation.
        input_shape = x.shape
        return self._restore_shape(
            super().forward_native(x, residual), input_shape
        )
    def forward_cuda(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ):
        input_shape = x.shape
        return self._restore_shape(
            super().forward_cuda(x, residual), input_shape
        )
    def forward_mlu(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ):
        input_shape = x.shape
        return self._restore_shape(
            super().forward_mlu(x, residual), input_shape
        )
    def forward_xpu(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ):
        input_shape = x.shape
        return self._restore_shape(
            super().forward_xpu(x, residual), input_shape
        )
    def forward_hpu(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ):
        input_shape = x.shape
        return self._restore_shape(
            super().forward_hpu(x, residual), input_shape
        )
    @staticmethod
    def _restore_shape(result, orig_shape: Tuple):
        """
        View ``result`` back to ``orig_shape`` wherever the shape differs.

        Args:
            result: Either a single tensor or a tuple of tensors; tuple
                entries may be None and are passed through untouched.
            orig_shape: The shape recorded before the parent forward ran.

        Returns:
            A tensor for tensor input, or a plain tuple for tuple input,
            with every mismatching tensor viewed to ``orig_shape``.
        """
        if not isinstance(result, tuple):
            # Single tensor: only call view() when the shape actually changed.
            if result.shape == orig_shape:
                return result
            return result.view(orig_shape)
        # Tuple result: fix up each entry independently, skipping None.
        return tuple(
            item
            if item is None or item.shape == orig_shape
            else item.view(orig_shape)
            for item in result
        )
 def replace_rms_norm_class(
    rms_norm: nn.Module,
    hidden_size: int,
-) -> RMSNorm:
+) -> nn.Module:
    """
-    Replace a Transformers RMSNorm with vLLM's optimized RMSNorm.
+    Replace a Transformers RMSNorm with vLLM's optimized RMSNorm,
    wrapped to preserve tensor dimensions.
    vLLM's RMSNorm provides:
    - Fused CUDA kernels for better performance
    - Support for fused add + norm operations
    The wrapper ensures that the original tensor shape (including batch
    dimension) is preserved, which is required by transformers' model
    forward methods.
    Args:
        rms_norm: The RMSNorm module to replace.
        hidden_size: The hidden size of the model.
    Returns:
-        The new vLLM RMSNorm layer.
+        The new vLLM RMSNorm layer wrapped for shape preservation.
    """
    # Try to get epsilon from various attribute names
    eps = getattr(rms_norm, "eps", None)
@@ -153,7 +232,7 @@ def replace_rms_norm_class(
    if weight is not None:
        hidden_size = weight.size(0)
-    return RMSNorm(hidden_size=hidden_size, eps=eps)
+    return TransformersRMSNorm(hidden_size=hidden_size, eps=eps)
 def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module):