diff --git a/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/gemm/kernel/grouped_matmul_slice_m_per_token_dequant_swiglu_quant_multistage_workspace.h b/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/gemm/kernel/grouped_matmul_slice_m_per_token_dequant_swiglu_quant_multistage_workspace.h index 2420ba3e..cf4956e0 100644 --- a/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/gemm/kernel/grouped_matmul_slice_m_per_token_dequant_swiglu_quant_multistage_workspace.h +++ b/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/gemm/kernel/grouped_matmul_slice_m_per_token_dequant_swiglu_quant_multistage_workspace.h @@ -230,7 +230,6 @@ public: AscendC::WholeReduceMax(ubReduceMax, ubMax, mask, tileRow, 1, 1, halfTileColumn / elementPerBlk, AscendC::ReduceOrder::ORDER_ONLY_VALUE); AscendC::SetFlag(0); - AscendC::SetFlag(0); AscendC::PipeBarrier(); AscendC::WaitFlag(0); @@ -266,6 +265,7 @@ public: AscendC::WaitFlag(1); AscendC::Cast(ubOutput, ubQuantF16, AscendC::RoundMode::CAST_RINT, tileCount); AscendC::SetFlag(1); + AscendC::SetFlag(0); auto gmTileOutput = gmOutput[params.layoutOutput.GetOffset(tileOffset)]; auto layoutGmTileOutput = params.layoutOutput.GetTileLayout(actualTileShape);