# sglang/test/srt/test_disaggregation.py

import os
import subprocess
import time
import unittest
from types import SimpleNamespace
from urllib.parse import urlparse

import requests

from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.test_utils import (
    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    popen_launch_pd_server,
    run_with_timeout,
)


# skip the test because we have different_tp test
@unittest.skip("skip the test because we have different_tp test")
class TestDisaggregationAccuracy(CustomTestCase):
    """End-to-end checks for a prefill/decode (PD) disaggregated deployment.

    ``setUpClass`` launches one prefill server, one decode server and the
    ``mini_lb`` load balancer in front of them; every test sends its requests
    to the load balancer. The class is currently skipped (see the decorator).
    """

    @classmethod
    def setUpClass(cls):
        """Start the prefill server, the decode server and the load balancer.

        Ports are derived from ``DEFAULT_URL_FOR_TEST``: the load balancer
        uses the base port, prefill uses base + 100 and decode uses base + 200.

        Raises:
            RuntimeError: if any component does not answer ``/health`` in time.
        """
        # Pre-declare the handles so tearDownClass can run after a partial
        # start-up (unittest does not call tearDownClass if setUpClass raises).
        cls.process_lb = None
        cls.process_decode = None
        cls.process_prefill = None

        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        parsed_url = urlparse(DEFAULT_URL_FOR_TEST)
        cls.base_host = parsed_url.hostname
        base_port = str(parsed_url.port)
        cls.lb_port = base_port
        cls.prefill_port = f"{int(base_port) + 100}"
        cls.decode_port = f"{int(base_port) + 200}"
        cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"
        cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"
        cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"
        print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")

        try:
            # Non blocking start servers
            cls.start_prefill()
            cls.start_decode()

            # Block until both
            cls.wait_server_ready(cls.prefill_url + "/health")
            cls.wait_server_ready(cls.decode_url + "/health")

            lb_command = [
                "python3",
                "-m",
                "sglang.srt.disaggregation.mini_lb",
                "--prefill",
                cls.prefill_url,
                "--decode",
                cls.decode_url,
                "--host",
                cls.base_host,
                "--port",
                cls.lb_port,
            ]

            print("Starting load balancer:", " ".join(lb_command))
            # Inherit stdout/stderr instead of piping them: nothing ever reads
            # the pipes, so a chatty load balancer would block once the OS
            # pipe buffer fills up.
            cls.process_lb = subprocess.Popen(lb_command)
            cls.wait_server_ready(cls.lb_url + "/health")
        except Exception:
            # Do not leave server processes behind when start-up fails.
            cls.tearDownClass()
            raise

    @classmethod
    def start_prefill(cls):
        """Launch the prefill server and store its handle in ``process_prefill``.

        Readiness is checked separately by ``wait_server_ready``.
        """
        prefill_args = [
            "--trust-remote-code",
            "--disaggregation-mode",
            "prefill",
            "--tp",
            "1",
            "--disaggregation-ib-device",
            "mlx5_roce0",
        ]
        cls.process_prefill = popen_launch_pd_server(
            cls.model,
            cls.prefill_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=prefill_args,
        )

    @classmethod
    def start_decode(cls):
        """Launch the decode server and store its handle in ``process_decode``.

        Readiness is checked separately by ``wait_server_ready``.
        """
        decode_args = [
            "--trust-remote-code",
            "--disaggregation-mode",
            "decode",
            "--tp",
            "1",
            # NOTE(review): presumably keeps decode off the GPU used by
            # prefill -- confirm against the server's argument handling.
            "--base-gpu-id",
            "1",
            "--disaggregation-ib-device",
            "mlx5_roce1",
        ]
        cls.process_decode = popen_launch_pd_server(
            cls.model,
            cls.decode_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=decode_args,
        )

    @classmethod
    def wait_server_ready(cls, url, timeout=60, request_timeout=5):
        """Poll ``url`` once per second until it answers with HTTP 200.

        Args:
            url: Endpoint to probe (the callers pass a ``/health`` URL).
            timeout: Overall budget in seconds before giving up.
            request_timeout: Per-request timeout in seconds, so that a single
                hung connection cannot outlive the overall budget indefinitely.

        Raises:
            RuntimeError: if no HTTP 200 is received within ``timeout`` seconds.
        """
        start_time = time.perf_counter()
        while True:
            try:
                response = requests.get(url, timeout=request_timeout)
                if response.status_code == 200:
                    print(f"Server {url} is ready")
                    return
            except requests.RequestException:
                # Server not accepting connections yet; retry below.
                pass

            if time.perf_counter() - start_time > timeout:
                raise RuntimeError(f"Server {url} failed to start in {timeout}s")
            time.sleep(1)

    @classmethod
    def tearDownClass(cls):
        """Kill every process started by ``setUpClass`` (best effort)."""
        for attr in ("process_lb", "process_decode", "process_prefill"):
            process = getattr(cls, attr, None)
            if process is None:
                continue
            try:
                kill_process_tree(process.pid)
            except Exception as e:
                print(f"Error killing process {process.pid}: {e}")
            # Forget the handle so a second call is a no-op for this process.
            setattr(cls, attr, None)

        # wait for 5 seconds
        time.sleep(5)

    def test_gsm8k(self):
        """Run few-shot GSM8K through the load balancer and check accuracy."""
        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host=f"http://{self.base_host}",
            port=int(self.lb_port),
        )
        metrics = run_eval_few_shot_gsm8k(args)
        print(f"Evaluation metrics: {metrics}")

        self.assertGreater(metrics["accuracy"], 0.62)

    def test_logprob(self):
        """Check that input and output logprobs are returned via ``/generate``."""
        prompt = "The capital of taiwan is "
        response = requests.post(
            self.lb_url + "/generate",
            json={
                "text": prompt,
                "sampling_params": {"temperature": 0},
                "return_logprob": True,
                "return_input_logprob": True,
                "logprob_start_len": 0,
            },
        )
        # Fail with the response body rather than a KeyError further down.
        self.assertEqual(response.status_code, 200, response.text)

        meta_info = response.json()["meta_info"]
        completion_tokens = meta_info["completion_tokens"]
        input_logprobs = meta_info["input_token_logprobs"]
        output_logprobs = meta_info["output_token_logprobs"]

        # unittest assertions are used instead of bare ``assert`` so the
        # checks still run under ``python -O``.
        self.assertEqual(
            len(output_logprobs),
            completion_tokens,
            f"output_logprobs and completion_tokens should have the same length, but got {len(output_logprobs)} and {completion_tokens}",
        )
        self.assertGreater(
            len(input_logprobs),
            0,
            f"input_logprobs should have at least one token, but got {len(input_logprobs)}",
        )


class TestDisaggregationMooncakeFailure(CustomTestCase):
    """Run a PD deployment with simulated failures enabled.

    ``DISAGGREGATION_TEST_FAILURE_PROB`` is set in this process's environment
    before the servers are launched and removed again in ``tearDownClass``.
    The test then drives a GSM8K workload through the load balancer; request
    failures are expected, but the servers must stay up.
    """

    # Name of the environment variable used to simulate failures.
    _FAILURE_PROB_ENV = "DISAGGREGATION_TEST_FAILURE_PROB"

    @classmethod
    def setUpClass(cls):
        """Start the prefill server, the decode server and the load balancer.

        Ports are derived from ``DEFAULT_URL_FOR_TEST``: the load balancer
        uses the base port, prefill uses base + 100 and decode uses base + 200.

        Raises:
            RuntimeError: if any component does not answer ``/health`` in time.
        """
        # Pre-declare the handles so tearDownClass can run after a partial
        # start-up (unittest does not call tearDownClass if setUpClass raises).
        cls.process_lb = None
        cls.process_decode = None
        cls.process_prefill = None

        # set DISAGGREGATION_TEST_FAILURE_PROB to simulate failure
        os.environ[cls._FAILURE_PROB_ENV] = "0.05"

        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        parsed_url = urlparse(DEFAULT_URL_FOR_TEST)
        cls.base_host = parsed_url.hostname
        base_port = str(parsed_url.port)
        cls.lb_port = base_port
        cls.prefill_port = f"{int(base_port) + 100}"
        cls.decode_port = f"{int(base_port) + 200}"
        cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"
        cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"
        cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"
        print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")

        try:
            # Non blocking start servers
            cls.start_prefill()
            cls.start_decode()

            # Block until both
            cls.wait_server_ready(cls.prefill_url + "/health")
            cls.wait_server_ready(cls.decode_url + "/health")

            lb_command = [
                "python3",
                "-m",
                "sglang.srt.disaggregation.mini_lb",
                "--prefill",
                cls.prefill_url,
                "--decode",
                cls.decode_url,
                "--host",
                cls.base_host,
                "--port",
                cls.lb_port,
            ]

            print("Starting load balancer:", " ".join(lb_command))
            # Inherit stdout/stderr instead of piping them: nothing ever reads
            # the pipes, so a chatty load balancer would block once the OS
            # pipe buffer fills up.
            cls.process_lb = subprocess.Popen(lb_command)
            cls.wait_server_ready(cls.lb_url + "/health")
        except Exception:
            # Do not leave server processes (or the failure-injection
            # environment variable) behind when start-up fails.
            cls.tearDownClass()
            raise

    @classmethod
    def start_prefill(cls):
        """Launch the prefill server and store its handle in ``process_prefill``.

        Readiness is checked separately by ``wait_server_ready``.
        """
        prefill_args = [
            "--trust-remote-code",
            "--disaggregation-mode",
            "prefill",
            "--tp",
            "1",
            "--disaggregation-ib-device",
            "mlx5_roce0",
        ]
        cls.process_prefill = popen_launch_pd_server(
            cls.model,
            cls.prefill_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=prefill_args,
        )

    @classmethod
    def start_decode(cls):
        """Launch the decode server and store its handle in ``process_decode``.

        Readiness is checked separately by ``wait_server_ready``.
        """
        decode_args = [
            "--trust-remote-code",
            "--disaggregation-mode",
            "decode",
            "--tp",
            "1",
            # NOTE(review): presumably keeps decode off the GPU used by
            # prefill -- confirm against the server's argument handling.
            "--base-gpu-id",
            "1",
            "--disaggregation-ib-device",
            "mlx5_roce1",
        ]
        cls.process_decode = popen_launch_pd_server(
            cls.model,
            cls.decode_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=decode_args,
        )

    @classmethod
    def wait_server_ready(cls, url, timeout=60, request_timeout=5):
        """Poll ``url`` once per second until it answers with HTTP 200.

        Args:
            url: Endpoint to probe (the callers pass a ``/health`` URL).
            timeout: Overall budget in seconds before giving up.
            request_timeout: Per-request timeout in seconds, so that a single
                hung connection cannot outlive the overall budget indefinitely.

        Raises:
            RuntimeError: if no HTTP 200 is received within ``timeout`` seconds.
        """
        start_time = time.perf_counter()
        while True:
            try:
                response = requests.get(url, timeout=request_timeout)
                if response.status_code == 200:
                    print(f"Server {url} is ready")
                    return
            except requests.RequestException:
                # Server not accepting connections yet; retry below.
                pass

            if time.perf_counter() - start_time > timeout:
                raise RuntimeError(f"Server {url} failed to start in {timeout}s")
            time.sleep(1)

    @classmethod
    def tearDownClass(cls):
        """Remove the failure-injection variable and kill all started processes."""
        # unset DISAGGREGATION_TEST_FAILURE_PROB; the default keeps this from
        # raising KeyError if the variable is already gone.
        os.environ.pop(cls._FAILURE_PROB_ENV, None)
        for attr in ("process_lb", "process_decode", "process_prefill"):
            process = getattr(cls, attr, None)
            if process is None:
                continue
            try:
                kill_process_tree(process.pid)
            except Exception as e:
                print(f"Error killing process {process.pid}: {e}")
            # Forget the handle so a second call is a no-op for this process.
            setattr(cls, attr, None)

        # wait for 5 seconds
        time.sleep(5)

    def test_gsm8k(self):
        """Drive GSM8K traffic under injected failures; servers must survive."""
        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host=f"http://{self.base_host}",
            port=int(self.lb_port),
        )
        metrics = run_eval_few_shot_gsm8k(args)
        print(f"Evaluation metrics: {metrics}")
        # Expect lots of failure but the server cannot crash, so accuracy is
        # not asserted; instead verify every component still answers /health.
        for url in (self.prefill_url, self.decode_url, self.lb_url):
            self.wait_server_ready(url + "/health", timeout=10)


class TestDisaggregationMooncakeSpec(CustomTestCase):
    """GSM8K accuracy check for a PD deployment with EAGLE speculative decoding.

    Both the prefill and the decode server are launched with ``--tp 2`` and
    the shared speculative-decoding arguments in ``spec_args``; requests go
    through the ``mini_lb`` load balancer.
    """

    @classmethod
    def setUpClass(cls):
        """Start the prefill server, the decode server and the load balancer.

        Ports are derived from ``DEFAULT_URL_FOR_TEST``: the load balancer
        uses the base port, prefill uses base + 100 and decode uses base + 200.

        Raises:
            RuntimeError: if any component does not answer ``/health`` in time.
        """
        # Pre-declare the handles so tearDownClass can run after a partial
        # start-up (unittest does not call tearDownClass if setUpClass raises).
        cls.process_lb = None
        cls.process_decode = None
        cls.process_prefill = None

        cls.model = DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST
        cls.draft_model = DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST
        parsed_url = urlparse(DEFAULT_URL_FOR_TEST)
        cls.base_host = parsed_url.hostname
        base_port = str(parsed_url.port)
        cls.lb_port = base_port
        cls.prefill_port = f"{int(base_port) + 100}"
        cls.decode_port = f"{int(base_port) + 200}"
        cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"
        cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"
        cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"
        # Speculative-decoding flags appended to both server command lines.
        cls.spec_args = [
            "--speculative-algorithm",
            "EAGLE",
            "--speculative-draft-model-path",
            cls.draft_model,
            "--speculative-num-steps",
            "3",
            "--speculative-eagle-topk",
            "4",
            "--speculative-num-draft-tokens",
            "16",
            "--cuda-graph-max-bs",
            "8",
        ]
        print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")

        try:
            # Non blocking start servers
            cls.start_prefill()
            cls.start_decode()

            # Block until both
            cls.wait_server_ready(cls.prefill_url + "/health")
            cls.wait_server_ready(cls.decode_url + "/health")

            lb_command = [
                "python3",
                "-m",
                "sglang.srt.disaggregation.mini_lb",
                "--prefill",
                cls.prefill_url,
                "--decode",
                cls.decode_url,
                "--host",
                cls.base_host,
                "--port",
                cls.lb_port,
            ]

            print("Starting load balancer:", " ".join(lb_command))
            # Inherit stdout/stderr instead of piping them: nothing ever reads
            # the pipes, so a chatty load balancer would block once the OS
            # pipe buffer fills up.
            cls.process_lb = subprocess.Popen(lb_command)
            cls.wait_server_ready(cls.lb_url + "/health")
        except Exception:
            # Do not leave server processes behind when start-up fails.
            cls.tearDownClass()
            raise

    @classmethod
    def start_prefill(cls):
        """Launch the prefill server and store its handle in ``process_prefill``.

        Readiness is checked separately by ``wait_server_ready``.
        """
        prefill_args = [
            "--trust-remote-code",
            "--disaggregation-mode",
            "prefill",
            "--tp",
            "2",
            "--disaggregation-ib-device",
            "mlx5_roce0,mlx5_roce1",
        ] + cls.spec_args
        cls.process_prefill = popen_launch_pd_server(
            cls.model,
            cls.prefill_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=prefill_args,
        )

    @classmethod
    def start_decode(cls):
        """Launch the decode server and store its handle in ``process_decode``.

        Readiness is checked separately by ``wait_server_ready``.
        """
        decode_args = [
            "--trust-remote-code",
            "--disaggregation-mode",
            "decode",
            "--tp",
            "2",
            # NOTE(review): presumably keeps decode off the GPUs used by the
            # tp=2 prefill server -- confirm against the server's arguments.
            "--base-gpu-id",
            "2",
            "--disaggregation-ib-device",
            "mlx5_roce2,mlx5_roce3",
        ] + cls.spec_args
        cls.process_decode = popen_launch_pd_server(
            cls.model,
            cls.decode_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=decode_args,
        )

    # NOTE: this method used to be defined twice in this class with identical
    # bodies; the redundant copy was dropped.
    @classmethod
    def wait_server_ready(cls, url, timeout=60, request_timeout=5):
        """Poll ``url`` once per second until it answers with HTTP 200.

        Args:
            url: Endpoint to probe (the callers pass a ``/health`` URL).
            timeout: Overall budget in seconds before giving up.
            request_timeout: Per-request timeout in seconds, so that a single
                hung connection cannot outlive the overall budget indefinitely.

        Raises:
            RuntimeError: if no HTTP 200 is received within ``timeout`` seconds.
        """
        start_time = time.perf_counter()
        while True:
            try:
                response = requests.get(url, timeout=request_timeout)
                if response.status_code == 200:
                    print(f"Server {url} is ready")
                    return
            except requests.RequestException:
                # Server not accepting connections yet; retry below.
                pass

            if time.perf_counter() - start_time > timeout:
                raise RuntimeError(f"Server {url} failed to start in {timeout}s")
            time.sleep(1)

    @classmethod
    def tearDownClass(cls):
        """Kill every process started by ``setUpClass`` (best effort)."""
        for attr in ("process_lb", "process_decode", "process_prefill"):
            process = getattr(cls, attr, None)
            if process is None:
                continue
            try:
                kill_process_tree(process.pid)
            except Exception as e:
                print(f"Error killing process {process.pid}: {e}")
            # Forget the handle so a second call is a no-op for this process.
            setattr(cls, attr, None)

        # wait for 5 seconds
        time.sleep(5)

    def test_gsm8k(self):
        """Run few-shot GSM8K through the load balancer and check accuracy."""
        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=2,
            host=f"http://{self.base_host}",
            port=int(self.lb_port),
        )
        metrics = run_eval_few_shot_gsm8k(args)
        print(f"Evaluation metrics: {metrics}")

        self.assertGreater(metrics["accuracy"], 0.20)


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`import os`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`import subprocess`
			`import time`
			`import unittest`
			`from types import SimpleNamespace`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`from urllib.parse import urlparse`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00
			`import requests`

			`from sglang.srt.utils import kill_process_tree`
			`from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k`
			`from sglang.test.test_utils import (`
[PD] support spec decode (#6507) Co-authored-by: SangBin Cho <rkooo567@gmail.com> 2025-05-23 12:03:05 -07:00			`DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,`
			`DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`DEFAULT_MODEL_NAME_FOR_TEST,`
			`DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,`
			`DEFAULT_URL_FOR_TEST,`
			`CustomTestCase,`
			`popen_launch_pd_server,`
			`run_with_timeout,`
			`)`


[PD] support spec decode (#6507) Co-authored-by: SangBin Cho <rkooo567@gmail.com> 2025-05-23 12:03:05 -07:00			`# skip the test because we have different_tp test`
			`@unittest.skip("skip the test because we have different_tp test")`
			`class TestDisaggregationAccuracy(CustomTestCase):`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`@classmethod`
			`def setUpClass(cls):`
			`cls.model = DEFAULT_MODEL_NAME_FOR_TEST`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`parsed_url = urlparse(DEFAULT_URL_FOR_TEST)`
			`cls.base_host = parsed_url.hostname`
			`base_port = str(parsed_url.port)`
			`cls.lb_port = base_port`
			`cls.prefill_port = f"{int(base_port) + 100}"`
			`cls.decode_port = f"{int(base_port) + 200}"`
			`cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"`
			`cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"`
			`cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"`
			`print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")`

			`# Non blocking start servers`
			`cls.start_prefill()`
			`cls.start_decode()`

			`# Block until both`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`cls.wait_server_ready(cls.prefill_url + "/health")`
			`cls.wait_server_ready(cls.decode_url + "/health")`

			`lb_command = [`
			`"python3",`
			`"-m",`
			`"sglang.srt.disaggregation.mini_lb",`
			`"--prefill",`
			`cls.prefill_url,`
			`"--decode",`
			`cls.decode_url,`
			`"--host",`
			`cls.base_host,`
			`"--port",`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`cls.lb_port,`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`]`

			`print("Starting load balancer:", " ".join(lb_command))`
			`cls.process_lb = subprocess.Popen(`
			`lb_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE`
			`)`
			`cls.wait_server_ready(cls.lb_url + "/health")`

			`@classmethod`
			`def start_prefill(cls):`
			`prefill_args = [`
			`"--trust-remote-code",`
			`"--disaggregation-mode",`
			`"prefill",`
			`"--tp",`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`"1",`
			`"--disaggregation-ib-device",`
			`"mlx5_roce0",`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`]`
			`cls.process_prefill = popen_launch_pd_server(`
			`cls.model,`
			`cls.prefill_url,`
			`timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,`
			`other_args=prefill_args,`
			`)`

			`@classmethod`
			`def start_decode(cls):`
			`decode_args = [`
			`"--trust-remote-code",`
			`"--disaggregation-mode",`
			`"decode",`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`"--tp",`
			`"1",`
			`"--base-gpu-id",`
			`"1",`
			`"--disaggregation-ib-device",`
			`"mlx5_roce1",`
			`]`
			`cls.process_decode = popen_launch_pd_server(`
			`cls.model,`
			`cls.decode_url,`
			`timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,`
			`other_args=decode_args,`
			`)`

			`@classmethod`
			`def wait_server_ready(cls, url, timeout=60):`
			`start_time = time.perf_counter()`
			`while True:`
			`try:`
			`response = requests.get(url)`
			`if response.status_code == 200:`
			`print(f"Server {url} is ready")`
			`return`
			`except Exception:`
			`pass`

			`if time.perf_counter() - start_time > timeout:`
			`raise RuntimeError(f"Server {url} failed to start in {timeout}s")`
			`time.sleep(1)`

			`@classmethod`
			`def tearDownClass(cls):`
			`for process in [cls.process_lb, cls.process_decode, cls.process_prefill]:`
			`if process:`
			`try:`
			`kill_process_tree(process.pid)`
			`except Exception as e:`
			`print(f"Error killing process {process.pid}: {e}")`

			`# wait for 5 seconds`
			`time.sleep(5)`

			`def test_gsm8k(self):`
			`args = SimpleNamespace(`
			`num_shots=5,`
			`data_path=None,`
			`num_questions=200,`
			`max_new_tokens=512,`
			`parallel=128,`
			`host=f"http://{self.base_host}",`
			`port=int(self.lb_port),`
			`)`
			`metrics = run_eval_few_shot_gsm8k(args)`
			`print(f"Evaluation metrics: {metrics}")`

			`self.assertGreater(metrics["accuracy"], 0.62)`

			`def test_logprob(self):`
			`prompt = "The capital of taiwan is "`
			`response = requests.post(`
			`self.lb_url + "/generate",`
			`json={`
			`"text": prompt,`
			`"sampling_params": {"temperature": 0},`
			`"return_logprob": True,`
			`"return_input_logprob": True,`
			`"logprob_start_len": 0,`
			`},`
			`)`

			`j = response.json()`
			`completion_tokens = j["meta_info"]["completion_tokens"]`
			`input_logprobs = j["meta_info"]["input_token_logprobs"]`
			`output_logprobs = j["meta_info"]["output_token_logprobs"]`

			`assert (`
			`len(output_logprobs) == completion_tokens`
			`), f"output_logprobs and completion_tokens should have the same length, but got {len(output_logprobs)} and {completion_tokens}"`
			`assert (`
			`len(input_logprobs) > 0`
			`), f"input_logprobs should have at least one token, but got {len(input_logprobs)}"`


			`class TestDisaggregationMooncakeFailure(CustomTestCase):`
			`@classmethod`
			`def setUpClass(cls):`
			`# set DISAGGREGATION_TEST_FAILURE_PROB to simulate failure`
			`os.environ["DISAGGREGATION_TEST_FAILURE_PROB"] = "0.05"`

			`cls.model = DEFAULT_MODEL_NAME_FOR_TEST`
			`parsed_url = urlparse(DEFAULT_URL_FOR_TEST)`
			`cls.base_host = parsed_url.hostname`
			`base_port = str(parsed_url.port)`
			`cls.lb_port = base_port`
			`cls.prefill_port = f"{int(base_port) + 100}"`
			`cls.decode_port = f"{int(base_port) + 200}"`
			`cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"`
			`cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"`
			`cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"`
			`print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")`

			`# Non blocking start servers`
			`cls.start_prefill()`
			`cls.start_decode()`

			`# Block until both`
			`cls.wait_server_ready(cls.prefill_url + "/health")`
			`cls.wait_server_ready(cls.decode_url + "/health")`

			`lb_command = [`
			`"python3",`
			`"-m",`
			`"sglang.srt.disaggregation.mini_lb",`
			`"--prefill",`
			`cls.prefill_url,`
			`"--decode",`
			`cls.decode_url,`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`"--host",`
			`cls.base_host,`
			`"--port",`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`cls.lb_port,`
			`]`

			`print("Starting load balancer:", " ".join(lb_command))`
			`cls.process_lb = subprocess.Popen(`
			`lb_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE`
			`)`
			`cls.wait_server_ready(cls.lb_url + "/health")`

			`@classmethod`
			`def start_prefill(cls):`
			`prefill_args = [`
			`"--trust-remote-code",`
			`"--disaggregation-mode",`
			`"prefill",`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`"--tp",`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`"1",`
			`"--disaggregation-ib-device",`
			`"mlx5_roce0",`
			`]`
			`cls.process_prefill = popen_launch_pd_server(`
			`cls.model,`
			`cls.prefill_url,`
			`timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,`
			`other_args=prefill_args,`
			`)`

			`@classmethod`
			`def start_decode(cls):`
			`decode_args = [`
			`"--trust-remote-code",`
			`"--disaggregation-mode",`
			`"decode",`
			`"--tp",`
			`"1",`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`"--base-gpu-id",`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`"1",`
			`"--disaggregation-ib-device",`
			`"mlx5_roce1",`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`]`
			`cls.process_decode = popen_launch_pd_server(`
			`cls.model,`
			`cls.decode_url,`
			`timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,`
			`other_args=decode_args,`
			`)`

			`@classmethod`
			`def wait_server_ready(cls, url, timeout=60):`
Use monotonic clock for interval measurement (#6211) Signed-off-by: Lifu Huang <lifu.hlf@gmail.com> 2025-05-17 16:49:18 -07:00			`start_time = time.perf_counter()`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`while True:`
			`try:`
			`response = requests.get(url)`
			`if response.status_code == 200:`
			`print(f"Server {url} is ready")`
			`return`
			`except Exception:`
			`pass`

Use monotonic clock for interval measurement (#6211) Signed-off-by: Lifu Huang <lifu.hlf@gmail.com> 2025-05-17 16:49:18 -07:00			`if time.perf_counter() - start_time > timeout:`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`raise RuntimeError(f"Server {url} failed to start in {timeout}s")`
			`time.sleep(1)`

			`@classmethod`
			`def tearDownClass(cls):`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`# unset DISAGGREGATION_TEST_FAILURE_PROB`
			`os.environ.pop("DISAGGREGATION_TEST_FAILURE_PROB")`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`for process in [cls.process_lb, cls.process_decode, cls.process_prefill]:`
			`if process:`
			`try:`
			`kill_process_tree(process.pid)`
			`except Exception as e:`
			`print(f"Error killing process {process.pid}: {e}")`

[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`# wait for 5 seconds`
			`time.sleep(5)`

[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`def test_gsm8k(self):`
			`args = SimpleNamespace(`
			`num_shots=5,`
			`data_path=None,`
			`num_questions=200,`
			`max_new_tokens=512,`
			`parallel=128,`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`host=f"http://{self.base_host}",`
			`port=int(self.lb_port),`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`)`
			`metrics = run_eval_few_shot_gsm8k(args)`
			`print(f"Evaluation metrics: {metrics}")`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`# Expect lots of failure but the server cannot crash`
[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00

[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`class TestDisaggregationMooncakeSpec(CustomTestCase):`
[PD] support spec decode (#6507) Co-authored-by: SangBin Cho <rkooo567@gmail.com> 2025-05-23 12:03:05 -07:00
			`@classmethod`
			`def setUpClass(cls):`
			`cls.model = DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST`
			`cls.draft_model = DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`parsed_url = urlparse(DEFAULT_URL_FOR_TEST)`
			`cls.base_host = parsed_url.hostname`
			`base_port = str(parsed_url.port)`
			`cls.lb_port = base_port`
			`cls.prefill_port = f"{int(base_port) + 100}"`
			`cls.decode_port = f"{int(base_port) + 200}"`
			`cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"`
			`cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"`
			`cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"`
[PD] support spec decode (#6507) Co-authored-by: SangBin Cho <rkooo567@gmail.com> 2025-05-23 12:03:05 -07:00			`cls.spec_args = [`
			`"--speculative-algorithm",`
			`"EAGLE",`
			`"--speculative-draft-model-path",`
			`cls.draft_model,`
			`"--speculative-num-steps",`
			`"3",`
			`"--speculative-eagle-topk",`
			`"4",`
			`"--speculative-num-draft-tokens",`
			`"16",`
			`"--cuda-graph-max-bs",`
			`"8",`
			`]`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")`
[PD] support spec decode (#6507) Co-authored-by: SangBin Cho <rkooo567@gmail.com> 2025-05-23 12:03:05 -07:00
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`# Non blocking start servers`
			`cls.start_prefill()`
			`cls.start_decode()`
[PD] support spec decode (#6507) Co-authored-by: SangBin Cho <rkooo567@gmail.com> 2025-05-23 12:03:05 -07:00
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`# Block until both`
[PD] support spec decode (#6507) Co-authored-by: SangBin Cho <rkooo567@gmail.com> 2025-05-23 12:03:05 -07:00			`cls.wait_server_ready(cls.prefill_url + "/health")`
			`cls.wait_server_ready(cls.decode_url + "/health")`

			`lb_command = [`
			`"python3",`
			`"-m",`
			`"sglang.srt.disaggregation.mini_lb",`
			`"--prefill",`
			`cls.prefill_url,`
			`"--decode",`
			`cls.decode_url,`
			`"--host",`
			`cls.base_host,`
			`"--port",`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`cls.lb_port,`
[PD] support spec decode (#6507) Co-authored-by: SangBin Cho <rkooo567@gmail.com> 2025-05-23 12:03:05 -07:00			`]`

			`print("Starting load balancer:", " ".join(lb_command))`
			`cls.process_lb = subprocess.Popen(`
			`lb_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE`
			`)`
			`cls.wait_server_ready(cls.lb_url + "/health")`

			`@classmethod`
			`def wait_server_ready(cls, url, timeout=60):`
			`start_time = time.perf_counter()`
			`while True:`
			`try:`
			`response = requests.get(url)`
			`if response.status_code == 200:`
			`print(f"Server {url} is ready")`
			`return`
			`except Exception:`
			`pass`

			`if time.perf_counter() - start_time > timeout:`
			`raise RuntimeError(f"Server {url} failed to start in {timeout}s")`
			`time.sleep(1)`

			`@classmethod`
			`def start_prefill(cls):`
			`prefill_args = [`
			`"--trust-remote-code",`
			`"--disaggregation-mode",`
			`"prefill",`
			`"--tp",`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`"2",`
			`"--disaggregation-ib-device",`
			`"mlx5_roce0,mlx5_roce1",`
[PD] support spec decode (#6507) Co-authored-by: SangBin Cho <rkooo567@gmail.com> 2025-05-23 12:03:05 -07:00			`] + cls.spec_args`
			`cls.process_prefill = popen_launch_pd_server(`
			`cls.model,`
			`cls.prefill_url,`
			`timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,`
			`other_args=prefill_args,`
			`)`

			`@classmethod`
			`def start_decode(cls):`
			`decode_args = [`
			`"--trust-remote-code",`
			`"--disaggregation-mode",`
			`"decode",`
			`"--tp",`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`"2",`
[PD] support spec decode (#6507) Co-authored-by: SangBin Cho <rkooo567@gmail.com> 2025-05-23 12:03:05 -07:00			`"--base-gpu-id",`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`"2",`
			`"--disaggregation-ib-device",`
			`"mlx5_roce2,mlx5_roce3",`
[PD] support spec decode (#6507) Co-authored-by: SangBin Cho <rkooo567@gmail.com> 2025-05-23 12:03:05 -07:00			`] + cls.spec_args`
			`cls.process_decode = popen_launch_pd_server(`
			`cls.model,`
			`cls.decode_url,`
			`timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,`
			`other_args=decode_args,`
			`)`

[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`@classmethod`
			`def wait_server_ready(cls, url, timeout=60):`
			`start_time = time.perf_counter()`
			`while True:`
			`try:`
			`response = requests.get(url)`
			`if response.status_code == 200:`
			`print(f"Server {url} is ready")`
			`return`
			`except Exception:`
			`pass`

			`if time.perf_counter() - start_time > timeout:`
			`raise RuntimeError(f"Server {url} failed to start in {timeout}s")`
			`time.sleep(1)`

			`@classmethod`
			`def tearDownClass(cls):`
			`for process in [cls.process_lb, cls.process_decode, cls.process_prefill]:`
			`if process:`
			`try:`
			`kill_process_tree(process.pid)`
			`except Exception as e:`
			`print(f"Error killing process {process.pid}: {e}")`

			`# wait for 5 seconds`
			`time.sleep(5)`

[PD] support spec decode (#6507) Co-authored-by: SangBin Cho <rkooo567@gmail.com> 2025-05-23 12:03:05 -07:00			`def test_gsm8k(self):`
			`args = SimpleNamespace(`
			`num_shots=5,`
			`data_path=None,`
			`num_questions=200,`
			`max_new_tokens=512,`
[PD] Support logprob & Add failure test (#6558) 2025-05-23 14:29:20 -07:00			`parallel=2,`
			`host=f"http://{self.base_host}",`
			`port=int(self.lb_port),`
[PD] support spec decode (#6507) Co-authored-by: SangBin Cho <rkooo567@gmail.com> 2025-05-23 12:03:05 -07:00			`)`
			`metrics = run_eval_few_shot_gsm8k(args)`
			`print(f"Evaluation metrics: {metrics}")`

			`self.assertGreater(metrics["accuracy"], 0.20)`


[PD] Add simple unit test for disaggregation feature (#5654) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> 2025-05-11 13:35:27 +08:00			`if __name__ == "__main__":`
			`unittest.main()`