diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py index b76d84d9a..126d03ab8 100644 --- a/python/sglang/srt/mem_cache/memory_pool.py +++ b/python/sglang/srt/mem_cache/memory_pool.py @@ -340,6 +340,7 @@ class MHATokenToKVPool(KVCache): cache_v = cache_v.view(self.store_dtype) if self.capture_mode and cache_k.shape[0] < 4: + # Overlap the copy of K and V cache for small batch size current_stream = self.device_module.current_stream() self.alt_stream.wait_stream(current_stream) with self.device_module.stream(self.alt_stream): diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 1ba9f38f7..f70c7e9ec 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -481,6 +481,7 @@ def suppress_other_loggers(): logging.getLogger("vllm.distributed.device_communicators.shm_broadcast").setLevel( logging.WARN ) + logging.getLogger("vllm.config").setLevel(logging.ERROR) warnings.filterwarnings( "ignore", category=UserWarning, message="The given NumPy array is not writable" @@ -527,10 +528,11 @@ def kill_process_tree(parent_pid, include_parent: bool = True, skip_pid: int = N pass if include_parent: - if parent_pid == os.getpid(): - sys.exit(0) - try: + if parent_pid == os.getpid(): + itself.kill() + sys.exit(0) + itself.kill() # Sometime processes cannot be killed with SIGKILL (e.g, PID=1 launched by kubernetes), diff --git a/test/srt/test_eval_fp8_accuracy.py b/test/srt/test_eval_fp8_accuracy.py index 8d3c5c00c..07eb4dc04 100644 --- a/test/srt/test_eval_fp8_accuracy.py +++ b/test/srt/test_eval_fp8_accuracy.py @@ -36,7 +36,7 @@ class TestEvalFP8Accuracy(unittest.TestCase): ) metrics = run_eval(args) - self.assertGreaterEqual(metrics["score"], 0.62) + self.assertGreaterEqual(metrics["score"], 0.61) class TestEvalFP8DynamicQuantAccuracy(unittest.TestCase):