fix: fix video input for qwen3-vl (#11361)
This commit is contained in:
@@ -1126,6 +1126,11 @@ class MRotaryEmbedding(RotaryEmbedding):
|
||||
second_per_grid_ts: Optional[torch.Tensor] = None,
|
||||
**kwargs,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
if model_type.startswith("qwen3_vl") and video_grid_thw is not None:
|
||||
video_grid_thw = torch.repeat_interleave(
|
||||
video_grid_thw, video_grid_thw[:, 0], dim=0
|
||||
)
|
||||
video_grid_thw[:, 0] = 1
|
||||
mrope_position_deltas = []
|
||||
if input_ids is not None and (
|
||||
image_grid_thw is not None or video_grid_thw is not None
|
||||
|
||||
@@ -186,7 +186,6 @@ UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
if _is_npu:
|
||||
import torch_npu
|
||||
|
||||
@@ -625,6 +624,22 @@ class ModelRunner:
|
||||
"Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes."
|
||||
)
|
||||
|
||||
# Sanity check for quantized qwen3_vl_moe checkpoints: the MoE intermediate
# size divided by (tp_size // moe_ep_size) must be a multiple of the first
# entry of the quantization config's "weight_block_size".
if self.model_config.hf_config.model_type == "qwen3_vl_moe":
    # getattr needs an explicit default here: without one, a config that
    # has no `quantization_config` attribute raises AttributeError and the
    # `is not None` guard below is never reached.
    if (
        quantization_config := getattr(
            self.model_config.hf_config, "quantization_config", None
        )
    ) is not None:
        text_config = self.model_config.hf_text_config
        # NOTE(review): assumes quantization_config is subscriptable and
        # carries a "weight_block_size" entry — confirm for non-block-wise
        # quantization schemes.
        weight_block_size_n = quantization_config["weight_block_size"][0]
        if (
            text_config.moe_intermediate_size
            // (self.tp_size // self.moe_ep_size)
        ) % weight_block_size_n != 0:
            raise ValueError(
                f"For qwen3-vl-fp8 models, please make sure ({text_config.moe_intermediate_size=} // ({self.tp_size=} // {self.moe_ep_size=})) % {weight_block_size_n=} == 0"
            )
|
||||
|
||||
def init_torch_distributed(self):
|
||||
logger.info("Init torch distributed begin.")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user