Add graph runner support with torch compile on CPU (#7843)

2025-09-08 12:33:58 +08:00
parent 8cda5a622c
commit 7577f0e40f
16 changed files with 820 additions and 48 deletions
--- a/python/sglang/srt/distributed/parallel_state.py
+++ b/python/sglang/srt/distributed/parallel_state.py
@@ -64,6 +64,9 @@ class GraphCaptureContext:

 TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])

+# Use the int value instead of ReduceOp.SUM itself to support torch.compile.
+REDUCE_OP_SUM = int(torch.distributed.ReduceOp.SUM)
+

 def _split_tensor_dict(
    tensor_dict: Dict[str, Union[torch.Tensor, Any]]
@@ -489,9 +492,7 @@ class GroupCoordinator:

        if input_.is_cpu:
            if is_shm_available(input_.dtype, self.world_size, self.local_size):
-                torch.ops.sgl_kernel.shm_allreduce(
-                    input_, torch.distributed.ReduceOp.SUM
-                )
+                torch.ops.sgl_kernel.shm_allreduce(input_, REDUCE_OP_SUM)
            else:
                torch.distributed.all_reduce(input_, group=self.device_group)
            return input_