[smol] [perf] Qwen3-VL in-place op. (#11481)
Signed-off-by: vincentzed <207368749+vincentzed@users.noreply.github.com>
This commit is contained in:
@@ -189,10 +189,10 @@ class Qwen3_VisionBlock(nn.Module):
|
|||||||
position_embeddings=position_embeddings,
|
position_embeddings=position_embeddings,
|
||||||
)
|
)
|
||||||
attn = rearrange(attn, "b s ... -> s b ...")
|
attn = rearrange(attn, "b s ... -> s b ...")
|
||||||
x = x + attn
|
x += attn
|
||||||
norm2 = self.norm2(x)
|
norm2 = self.norm2(x)
|
||||||
mlp = self.mlp(norm2)
|
mlp = self.mlp(norm2)
|
||||||
x = x + mlp
|
x += mlp
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
@@ -441,7 +441,7 @@ class Qwen3_VisionTransformer(nn.Module):
|
|||||||
x = self.patch_embed(x)
|
x = self.patch_embed(x)
|
||||||
|
|
||||||
pos_embeds = self.fast_pos_embed_interpolate(grid_thw)
|
pos_embeds = self.fast_pos_embed_interpolate(grid_thw)
|
||||||
x = x + pos_embeds
|
x += pos_embeds
|
||||||
rotary_pos_emb = self.rot_pos_emb(grid_thw)
|
rotary_pos_emb = self.rot_pos_emb(grid_thw)
|
||||||
|
|
||||||
seq_len, _ = x.size()
|
seq_len, _ = x.size()
|
||||||
@@ -574,10 +574,7 @@ class Qwen3LLMModel(Qwen3Model):
|
|||||||
and layer_idx in self.deepstack_embed_to_decoder_layer
|
and layer_idx in self.deepstack_embed_to_decoder_layer
|
||||||
):
|
):
|
||||||
sep = self.hidden_size * layer_idx
|
sep = self.hidden_size * layer_idx
|
||||||
hidden_states = (
|
hidden_states += input_deepstack_embeds[:, sep : sep + self.hidden_size]
|
||||||
hidden_states
|
|
||||||
+ input_deepstack_embeds[:, sep : sep + self.hidden_size]
|
|
||||||
)
|
|
||||||
|
|
||||||
if not self.pp_group.is_last_rank:
|
if not self.pp_group.is_last_rank:
|
||||||
return PPProxyTensors(
|
return PPProxyTensors(
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
|
|||||||
for layer_idx, layer in enumerate(
|
for layer_idx, layer in enumerate(
|
||||||
self.layers[self.start_layer : self.end_layer]
|
self.layers[self.start_layer : self.end_layer]
|
||||||
):
|
):
|
||||||
layer_idx = layer_idx + self.start_layer
|
layer_idx += self.start_layer
|
||||||
if layer_idx in self.layers_to_capture:
|
if layer_idx in self.layers_to_capture:
|
||||||
aux_hidden_states.append(
|
aux_hidden_states.append(
|
||||||
hidden_states + residual if residual is not None else hidden_states
|
hidden_states + residual if residual is not None else hidden_states
|
||||||
@@ -130,9 +130,8 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
|
|||||||
# process deepstack
|
# process deepstack
|
||||||
if input_deepstack_embeds is not None and layer_idx in range(3):
|
if input_deepstack_embeds is not None and layer_idx in range(3):
|
||||||
sep = self.hidden_size * layer_idx
|
sep = self.hidden_size * layer_idx
|
||||||
hidden_states = (
|
hidden_states.add_(
|
||||||
hidden_states
|
input_deepstack_embeds[:, sep : sep + self.hidden_size]
|
||||||
+ input_deepstack_embeds[:, sep : sep + self.hidden_size]
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if not self.pp_group.is_last_rank:
|
if not self.pp_group.is_last_rank:
|
||||||
|
|||||||
Reference in New Issue
Block a user