diff --git a/test/srt/test_deepseek_v3_fp4_4gpu.py b/test/srt/test_deepseek_v3_fp4_4gpu.py
index 81f6bdfb3..bb9e13554 100644
--- a/test/srt/test_deepseek_v3_fp4_4gpu.py
+++ b/test/srt/test_deepseek_v3_fp4_4gpu.py
@@ -160,5 +160,66 @@ class TestDeepseekV3FP4MTP(CustomTestCase):
         self.assertGreater(speed, 130)
 
 
+class TestDeepseekV3FP4CutlassMoE(CustomTestCase):
+    """GSM8K accuracy test for DeepSeek-V3 FP4 with the CUTLASS MoE runner.
+
+    ``setUpClass`` launches one server with ``--tp 4 --ep 4``,
+    ``--attention-backend trtllm_mla``, ``--moe-runner-backend
+    flashinfer_cutlass`` and ``--quantization modelopt_fp4``.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Launch the server and keep its process handle on the class."""
+        cls.model = FULL_DEEPSEEK_V3_FP4_MODEL_PATH
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = [
+            "--tp",
+            "4",
+            "--ep",
+            "4",
+            "--attention-backend",
+            "trtllm_mla",
+            "--moe-runner-backend",
+            "flashinfer_cutlass",
+            "--quantization",
+            "modelopt_fp4",
+        ]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        """Kill the process tree of the server started in ``setUpClass``."""
+        kill_process_tree(cls.process.pid)
+
+    def test_a_gsm8k(self):
+        """Run 8-shot GSM8K against the server; accuracy must exceed 0.935."""
+        # The "a" in the name makes this test run first (alphabetically) to
+        # warm up the server.
+        args = SimpleNamespace(
+            num_shots=8,
+            data_path=None,
+            num_questions=1319,
+            parallel=1319,
+            max_new_tokens=512,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"{metrics=}")
+
+        if is_in_ci():
+            write_github_step_summary(
+                "### test_gsm8k (deepseek-v3-fp4-cutlass-moe)\n"
+                f'{metrics["accuracy"]=:.3f}\n'
+            )
+        self.assertGreater(metrics["accuracy"], 0.935)
+
+
 if __name__ == "__main__":
     unittest.main()