diff --git a/python/sglang/srt/distributed/parallel_state.py b/python/sglang/srt/distributed/parallel_state.py index 25dd4e511..ec980c030 100644 --- a/python/sglang/srt/distributed/parallel_state.py +++ b/python/sglang/srt/distributed/parallel_state.py @@ -1085,7 +1085,8 @@ class GroupCoordinator: comm_group = metadata_group if tensor.is_cpu else group work = send_func(tensor, self.ranks[dst], group=comm_group) - p2p_works.append(P2PWork(work, tensor)) + if async_send: + p2p_works.append(P2PWork(work, tensor)) return p2p_works def recv_tensor_dict( diff --git a/python/sglang/srt/metrics/collector.py b/python/sglang/srt/metrics/collector.py index 01ebc8063..e793eb988 100644 --- a/python/sglang/srt/metrics/collector.py +++ b/python/sglang/srt/metrics/collector.py @@ -809,13 +809,13 @@ class TokenizerMetricsCollector: def check_time_to_first_token_straggler(self, value: float) -> bool: his = self.histogram_time_to_first_token.labels(**self.labels) total_observations = sum(bucket._value for bucket in his._buckets) - if total_observations < 100: + if total_observations < 1000: return False - p99_threshold = total_observations * 0.99 + p999_threshold = total_observations * 0.999 cumulative_count = 0 for i, bucket in enumerate(his._buckets): cumulative_count += bucket._value - if cumulative_count > p99_threshold: + if cumulative_count > p999_threshold: return value >= his._upper_bounds[i] return False diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index baa08d4af..8d4ebe74f 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -116,6 +116,8 @@ GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"] DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"] +RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"] + # Allow external code to add more choices def add_load_format_choices(choices): @@ -138,6 +140,14 @@ def add_grammar_backend_choices(choices): GRAMMAR_BACKEND_CHOICES.extend(choices) +def add_deterministic_attention_backend_choices(choices): + DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices) + + +def add_radix_eviction_policy_choices(choices): + RADIX_EVICTION_POLICY_CHOICES.extend(choices) + + @dataclasses.dataclass class ServerArgs: # Model and tokenizer @@ -2243,7 +2253,7 @@ class ServerArgs: parser.add_argument( "--radix-eviction-policy", type=str, - choices=["lru", "lfu"], + choices=RADIX_EVICTION_POLICY_CHOICES, default=ServerArgs.radix_eviction_policy, help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.", ) diff --git a/python/sglang/test/test_deterministic.py b/python/sglang/test/test_deterministic.py index 286902677..aa6116043 100644 --- a/python/sglang/test/test_deterministic.py +++ b/python/sglang/test/test_deterministic.py @@ -29,7 +29,7 @@ class BenchArgs: port: int = 30000 batch_size: int = 1 temperature: float = 0.0 - sampling_seed: int = None + sampling_seed: int = 42 max_new_tokens: int = 100 frequency_penalty: float = 0.0 presence_penalty: float = 0.0