[CI] fix port conflicts (#5789)
This commit is contained in:
@@ -14,7 +14,7 @@ class TestFile:
|
||||
suites = {
|
||||
"per-commit": [
|
||||
TestFile("models/lora/test_lora.py", 76),
|
||||
TestFile("models/lora/test_lora_backend.py", 420),
|
||||
TestFile("models/lora/test_lora_backend.py", 99),
|
||||
TestFile("models/lora/test_multi_lora_backend.py", 60),
|
||||
TestFile("models/test_embedding_models.py", 35),
|
||||
TestFile("models/test_generation_models.py", 103),
|
||||
@@ -23,30 +23,30 @@ suites = {
|
||||
TestFile("models/test_compressed_tensors_models.py", 100),
|
||||
TestFile("models/test_reward_models.py", 83),
|
||||
TestFile("models/test_gme_qwen_models.py", 45),
|
||||
TestFile("models/test_clip_models.py", 100),
|
||||
TestFile("models/test_vlm_models.py", 100),
|
||||
TestFile("models/test_clip_models.py", 52),
|
||||
TestFile("models/test_vlm_models.py", 581),
|
||||
TestFile("test_abort.py", 51),
|
||||
TestFile("test_block_int8.py", 22),
|
||||
TestFile("test_chunked_prefill.py", 336),
|
||||
TestFile("test_eagle_infer.py", 500),
|
||||
TestFile("test_chunked_prefill.py", 285),
|
||||
TestFile("test_eagle_infer.py", 584),
|
||||
TestFile("test_ebnf_constrained.py"),
|
||||
TestFile("test_fa3.py", 400),
|
||||
TestFile("test_fa3.py", 376),
|
||||
TestFile("test_fp8_kernel.py", 8),
|
||||
TestFile("test_embedding_openai_server.py", 36),
|
||||
TestFile("test_embedding_openai_server.py", 141),
|
||||
TestFile("test_hidden_states.py", 55),
|
||||
TestFile("test_int8_kernel.py", 8),
|
||||
TestFile("test_input_embeddings.py", 38),
|
||||
TestFile("test_json_constrained.py", 98),
|
||||
TestFile("test_large_max_new_tokens.py", 41),
|
||||
TestFile("test_metrics.py", 32),
|
||||
TestFile("test_mla.py", 162),
|
||||
TestFile("test_mla.py", 242),
|
||||
TestFile("test_mla_deepseek_v3.py", 221),
|
||||
TestFile("test_mla_int8_deepseek_v3.py", 522),
|
||||
TestFile("test_mla_int8_deepseek_v3.py", 674),
|
||||
TestFile("test_mla_flashinfer.py", 395),
|
||||
TestFile("test_mla_fp8.py", 93),
|
||||
TestFile("test_mla_fp8.py", 153),
|
||||
TestFile("test_no_chunked_prefill.py", 126),
|
||||
TestFile("test_no_overlap_scheduler.py", 262),
|
||||
TestFile("test_openai_server.py", 186),
|
||||
TestFile("test_openai_server.py", 149),
|
||||
TestFile("test_penalty.py", 41),
|
||||
TestFile("test_page_size.py", 60),
|
||||
TestFile("test_pytorch_sampling_backend.py", 66),
|
||||
@@ -57,11 +57,11 @@ suites = {
|
||||
TestFile("test_request_length_validation.py", 31),
|
||||
TestFile("test_retract_decode.py", 54),
|
||||
TestFile("test_server_args.py", 1),
|
||||
TestFile("test_skip_tokenizer_init.py", 72),
|
||||
TestFile("test_skip_tokenizer_init.py", 117),
|
||||
TestFile("test_srt_engine.py", 237),
|
||||
TestFile("test_srt_endpoint.py", 94),
|
||||
TestFile("test_torch_compile.py", 76),
|
||||
TestFile("test_torch_compile_moe.py", 85),
|
||||
TestFile("test_torch_compile_moe.py", 235),
|
||||
TestFile("test_torch_native_attention_backend.py", 123),
|
||||
TestFile("test_torchao.py", 70),
|
||||
TestFile("test_triton_attention_kernels.py", 4),
|
||||
@@ -69,27 +69,27 @@ suites = {
|
||||
TestFile("test_update_weights_from_disk.py", 114),
|
||||
TestFile("test_update_weights_from_tensor.py", 48),
|
||||
TestFile("test_vertex_endpoint.py", 31),
|
||||
TestFile("test_vision_chunked_prefill.py", 99),
|
||||
TestFile("test_vision_chunked_prefill.py", 119),
|
||||
TestFile("test_vlm_accuracy.py", 60),
|
||||
TestFile("test_vision_openai_server.py", 537),
|
||||
TestFile("test_vision_openai_server.py", 637),
|
||||
TestFile("test_fim_completion.py", 40),
|
||||
TestFile("test_w8a8_quantization.py", 46),
|
||||
TestFile("test_eval_fp8_accuracy.py", 303),
|
||||
TestFile("test_create_kvindices.py", 2),
|
||||
TestFile("test_hicache.py", 60),
|
||||
TestFile("test_hicache_mla.py", 90),
|
||||
TestFile("test_hicache.py", 116),
|
||||
TestFile("test_hicache_mla.py", 254),
|
||||
TestFile("test_fused_moe.py", 30),
|
||||
TestFile("test_triton_moe_channel_fp8_kernel.py", 25),
|
||||
],
|
||||
"per-commit-2-gpu": [
|
||||
TestFile("models/lora/test_lora_tp.py", 150),
|
||||
TestFile("test_data_parallelism.py", 90),
|
||||
TestFile("test_dp_attention.py", 150),
|
||||
TestFile("test_mla_tp.py", 174),
|
||||
TestFile("test_moe_ep.py", 220),
|
||||
TestFile("test_patch_torch.py", 30),
|
||||
TestFile("test_update_weights_from_distributed.py", 100),
|
||||
TestFile("test_verl_engine.py", 100),
|
||||
TestFile("models/lora/test_lora_tp.py", 116),
|
||||
TestFile("test_data_parallelism.py", 73),
|
||||
TestFile("test_dp_attention.py", 137),
|
||||
TestFile("test_mla_tp.py", 170),
|
||||
TestFile("test_moe_ep.py", 181),
|
||||
TestFile("test_patch_torch.py", 19),
|
||||
TestFile("test_update_weights_from_distributed.py", 103),
|
||||
TestFile("test_verl_engine.py", 64),
|
||||
],
|
||||
"per-commit-8-gpu": [
|
||||
TestFile("test_local_attn.py", 250),
|
||||
|
||||
@@ -24,7 +24,7 @@ class TestTorchCompileMoe(CustomTestCase):
|
||||
cls.model,
|
||||
cls.base_url,
|
||||
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
other_args=["--enable-torch-compile", "--torch-compile-max-bs", "8"],
|
||||
other_args=["--enable-torch-compile", "--torch-compile-max-bs", "4"],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
|
||||
@@ -129,7 +129,7 @@ def init_process_hf(
|
||||
hf_instruct_params = []
|
||||
hf_base_params = []
|
||||
|
||||
print("get parameter in hf instruct model and base model")
|
||||
print("[hf] get parameter in hf instruct model and base model")
|
||||
for parameter_name in checking_parameters:
|
||||
hf_instruct_params.append(
|
||||
hf_instruct_model.get_parameter(parameter_name)[:truncate_size]
|
||||
@@ -152,10 +152,12 @@ def init_process_hf(
|
||||
param_queue.put(("hf_base_params", hf_base_params))
|
||||
|
||||
# Init weight update group for rank 0 (the training engine in RLHF).
|
||||
print(f"rank {rank} world_size: {world_size} init custom process group")
|
||||
port = 60000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
|
||||
init_method = f"tcp://localhost:{port}"
|
||||
print(f"[hf] {rank=} {world_size=} init custom process group. {init_method=}")
|
||||
group = init_custom_process_group(
|
||||
backend="nccl",
|
||||
init_method="tcp://localhost:65500",
|
||||
init_method=init_method,
|
||||
world_size=world_size,
|
||||
rank=rank,
|
||||
group_name="test_parameter_update_group",
|
||||
@@ -184,7 +186,7 @@ def init_process_hf(
|
||||
|
||||
# Measure the latency of broadcasting/weights update.
|
||||
broadcast_time = time_end_broadcast - time_begin_broadcast
|
||||
print(f"rank {rank} broadcast parameter time: {broadcast_time:.3f}s")
|
||||
print(f"[hf] {rank=} {broadcast_time=:.3f}s")
|
||||
param_queue.put(("broadcast_time", broadcast_time))
|
||||
|
||||
# Delete the huggingface models to free up memory.
|
||||
@@ -210,17 +212,21 @@ def init_process_sgl(
|
||||
torch.cuda.synchronize()
|
||||
base_gpu_id = 1 if rank == 1 else 1 + tp_size
|
||||
if backend == "Engine":
|
||||
print(f"[sgl] rank {rank} init engine")
|
||||
engine = sgl.Engine(
|
||||
model_path=model_name,
|
||||
random_seed=42,
|
||||
base_gpu_id=base_gpu_id,
|
||||
tp_size=tp_size,
|
||||
cuda_graph_max_bs=2,
|
||||
)
|
||||
else:
|
||||
if rank == 1:
|
||||
url = DEFAULT_URL_FOR_TEST
|
||||
else:
|
||||
url = DEFAULT_URL_FOR_TEST.replace("2157", "2159")
|
||||
host, _, port = DEFAULT_URL_FOR_TEST.rpartition(":")
|
||||
url = ":".join([host, str(int(port) + 10000)])
|
||||
|
||||
print(f"[sgl] rank {rank} init server on url: {url}")
|
||||
process = popen_launch_server(
|
||||
model_name,
|
||||
url,
|
||||
@@ -230,13 +236,11 @@ def init_process_sgl(
|
||||
str(base_gpu_id),
|
||||
"--tp-size",
|
||||
str(tp_size),
|
||||
"--cuda-graph-max-bs",
|
||||
2,
|
||||
),
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
if backend == "Engine":
|
||||
print(f"rank {rank} init engine")
|
||||
else:
|
||||
print(f"rank {rank} init server on url: {url}")
|
||||
|
||||
# Get weights of instruct model, i.e. pre-training weights.
|
||||
instruct_params = []
|
||||
@@ -252,11 +256,13 @@ def init_process_sgl(
|
||||
|
||||
param_queue.put((f"sgl_dp_{rank}_instruct_params", instruct_params))
|
||||
|
||||
port = 60000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
|
||||
|
||||
# Init weight update group with the training engine.
|
||||
if backend == "Engine":
|
||||
engine.init_weights_update_group(
|
||||
master_address="localhost",
|
||||
master_port="65500",
|
||||
master_port=str(port),
|
||||
rank_offset=base_gpu_id,
|
||||
world_size=world_size,
|
||||
group_name="test_parameter_update_group",
|
||||
@@ -267,7 +273,7 @@ def init_process_sgl(
|
||||
f"{url}/init_weights_update_group",
|
||||
json={
|
||||
"master_address": "localhost",
|
||||
"master_port": "65500",
|
||||
"master_port": str(port),
|
||||
"rank_offset": base_gpu_id,
|
||||
"world_size": world_size,
|
||||
"group_name": "test_parameter_update_group",
|
||||
@@ -311,7 +317,7 @@ def init_process_sgl(
|
||||
# Measure the latency of broadcast/weights update.
|
||||
update_time = time_end_update - time_begin_update
|
||||
print(
|
||||
f"fully update model_name {model_name} rank {rank} parameter from distributed time: {update_time:.3f}s"
|
||||
f"[sgl] fully update model_name {model_name} rank {rank} parameter from distributed time: {update_time:.3f}s"
|
||||
)
|
||||
param_queue.put((f"update_sgl_dp_{rank}_time", update_time))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user