Minor fixes for server_args, parallel_state, and test_deterministic.py (#11159)
This commit is contained in:
@@ -1085,7 +1085,8 @@ class GroupCoordinator:
|
|||||||
|
|
||||||
comm_group = metadata_group if tensor.is_cpu else group
|
comm_group = metadata_group if tensor.is_cpu else group
|
||||||
work = send_func(tensor, self.ranks[dst], group=comm_group)
|
work = send_func(tensor, self.ranks[dst], group=comm_group)
|
||||||
p2p_works.append(P2PWork(work, tensor))
|
if async_send:
|
||||||
|
p2p_works.append(P2PWork(work, tensor))
|
||||||
return p2p_works
|
return p2p_works
|
||||||
|
|
||||||
def recv_tensor_dict(
|
def recv_tensor_dict(
|
||||||
|
|||||||
@@ -809,13 +809,13 @@ class TokenizerMetricsCollector:
|
|||||||
def check_time_to_first_token_straggler(self, value: float) -> bool:
|
def check_time_to_first_token_straggler(self, value: float) -> bool:
|
||||||
his = self.histogram_time_to_first_token.labels(**self.labels)
|
his = self.histogram_time_to_first_token.labels(**self.labels)
|
||||||
total_observations = sum(bucket._value for bucket in his._buckets)
|
total_observations = sum(bucket._value for bucket in his._buckets)
|
||||||
if total_observations < 100:
|
if total_observations < 1000:
|
||||||
return False
|
return False
|
||||||
p99_threshold = total_observations * 0.99
|
p999_threshold = total_observations * 0.999
|
||||||
cumulative_count = 0
|
cumulative_count = 0
|
||||||
for i, bucket in enumerate(his._buckets):
|
for i, bucket in enumerate(his._buckets):
|
||||||
cumulative_count += bucket._value
|
cumulative_count += bucket._value
|
||||||
if cumulative_count > p99_threshold:
|
if cumulative_count > p999_threshold:
|
||||||
return value >= his._upper_bounds[i]
|
return value >= his._upper_bounds[i]
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|||||||
@@ -116,6 +116,8 @@ GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
|
|||||||
|
|
||||||
DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
|
DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
|
||||||
|
|
||||||
|
RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
|
||||||
|
|
||||||
|
|
||||||
# Allow external code to add more choices
|
# Allow external code to add more choices
|
||||||
def add_load_format_choices(choices):
|
def add_load_format_choices(choices):
|
||||||
@@ -138,6 +140,14 @@ def add_grammar_backend_choices(choices):
|
|||||||
GRAMMAR_BACKEND_CHOICES.extend(choices)
|
GRAMMAR_BACKEND_CHOICES.extend(choices)
|
||||||
|
|
||||||
|
|
||||||
|
def add_deterministic_attention_backend_choices(choices):
|
||||||
|
DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
|
||||||
|
|
||||||
|
|
||||||
|
def add_radix_eviction_policy_choices(choices):
|
||||||
|
RADIX_EVICTION_POLICY_CHOICES.extend(choices)
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass
|
@dataclasses.dataclass
|
||||||
class ServerArgs:
|
class ServerArgs:
|
||||||
# Model and tokenizer
|
# Model and tokenizer
|
||||||
@@ -2243,7 +2253,7 @@ class ServerArgs:
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--radix-eviction-policy",
|
"--radix-eviction-policy",
|
||||||
type=str,
|
type=str,
|
||||||
choices=["lru", "lfu"],
|
choices=RADIX_EVICTION_POLICY_CHOICES,
|
||||||
default=ServerArgs.radix_eviction_policy,
|
default=ServerArgs.radix_eviction_policy,
|
||||||
help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
|
help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ class BenchArgs:
|
|||||||
port: int = 30000
|
port: int = 30000
|
||||||
batch_size: int = 1
|
batch_size: int = 1
|
||||||
temperature: float = 0.0
|
temperature: float = 0.0
|
||||||
sampling_seed: int = None
|
sampling_seed: int = 42
|
||||||
max_new_tokens: int = 100
|
max_new_tokens: int = 100
|
||||||
frequency_penalty: float = 0.0
|
frequency_penalty: float = 0.0
|
||||||
presence_penalty: float = 0.0
|
presence_penalty: float = 0.0
|
||||||
|
|||||||
Reference in New Issue
Block a user