Address performance regression: disable multiple streams on ROCm (#6412)
This commit is contained in:
@@ -1315,7 +1315,8 @@ class DeepseekV2Model(nn.Module):
|
|||||||
config.hidden_size,
|
config.hidden_size,
|
||||||
enable_tp=not global_server_args_dict["enable_dp_attention"],
|
enable_tp=not global_server_args_dict["enable_dp_attention"],
|
||||||
)
|
)
|
||||||
self.alt_stream = torch.cuda.Stream()
|
# TODO(haishaw): investigate multi-stream performance on ROCm; the alternate stream is disabled there for now
|
||||||
|
self.alt_stream = None if _is_hip else torch.cuda.Stream()
|
||||||
self.layers = nn.ModuleList(
|
self.layers = nn.ModuleList(
|
||||||
[
|
[
|
||||||
DeepseekV2DecoderLayer(
|
DeepseekV2DecoderLayer(
|
||||||
|
|||||||
Reference in New Issue
Block a user