Address performance regression: disable multiple streams on ROCm (#6412)

This commit is contained in:
HAI
2025-05-19 21:16:20 -07:00
committed by GitHub
parent cba1cdbc46
commit 6317c5c61f

View File

@@ -1315,7 +1315,8 @@ class DeepseekV2Model(nn.Module):
config.hidden_size,
enable_tp=not global_server_args_dict["enable_dp_attention"],
)
-self.alt_stream = torch.cuda.Stream()
+# TODO(haishaw): multi-stream performance on ROCm
+self.alt_stream = None if _is_hip else torch.cuda.Stream()
self.layers = nn.ModuleList(
[
DeepseekV2DecoderLayer(