diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml index faae29781..7cf45263c 100644 --- a/.github/workflows/pr-test-npu.yml +++ b/.github/workflows/pr-test-npu.yml @@ -22,7 +22,7 @@ concurrency: cancel-in-progress: true jobs: - unit-test-basic: + per-commit-1-ascend-npu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false runs-on: linux-arm64-npu-1 @@ -44,13 +44,77 @@ jobs: timeout-minutes: 30 env: SGLANG_USE_MODELSCOPE: true + SGLANG_IS_IN_CI: true HF_ENDPOINT: https://hf-mirror.com + TORCH_EXTENSIONS_DIR: /tmp/torch_extensions run: | cd test/srt - python3 run_suite.py --suite per-commit-npu + python3 run_suite.py --suite per-commit-1-ascend-npu + + per-commit-2-ascend-npu: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + runs-on: linux-arm64-npu-2 + container: + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1.alpha003-910b-ubuntu22.04-py3.11 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/npu_ci_install_dependency.sh + # copy required file from our daily cache + cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp + # copy download through proxy + curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl + + - name: Run test + timeout-minutes: 30 + env: + SGLANG_USE_MODELSCOPE: true + SGLANG_IS_IN_CI: true + HF_ENDPOINT: https://hf-mirror.com + TORCH_EXTENSIONS_DIR: /tmp/torch_extensions + run: | + cd test/srt + python3 run_suite.py --suite per-commit-2-ascend-npu + + per-commit-4-ascend-npu: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + runs-on: linux-arm64-npu-4 + container: + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1.alpha003-910b-ubuntu22.04-py3.11 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/npu_ci_install_dependency.sh + # copy required file from our daily cache + cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp + # copy download through proxy + curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl + + - name: Run test + timeout-minutes: 30 + env: + SGLANG_USE_MODELSCOPE: true + SGLANG_IS_IN_CI: true + HF_ENDPOINT: https://hf-mirror.com + TORCH_EXTENSIONS_DIR: /tmp/torch_extensions + run: | + cd test/srt + python3 run_suite.py --suite per-commit-4-ascend-npu --timeout-per-file 3600 + finish: if: always() - needs: [ unit-test-basic ] + needs: + - per-commit-1-ascend-npu + - per-commit-2-ascend-npu + - per-commit-4-ascend-npu runs-on: ubuntu-latest steps: - name: Check all dependent job statuses diff --git a/python/sglang/srt/layers/moe/topk.py b/python/sglang/srt/layers/moe/topk.py index c346e12f7..78bd6f08d 100644 --- a/python/sglang/srt/layers/moe/topk.py +++ b/python/sglang/srt/layers/moe/topk.py @@ -398,8 +398,12 @@ def grouped_topk_gpu( .reshape(num_token, -1) ) # [n, e] tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] + # TODO: NPU can't support directly evaluating a comparison for now topk_weights, topk_ids = torch.topk( - tmp_scores, k=topk, dim=-1, sorted=num_fused_shared_experts > 0 + tmp_scores, + k=topk, + dim=-1, + sorted=(True if num_fused_shared_experts > 0 else False), ) if num_fused_shared_experts: topk_ids[:, -1] = torch.randint( @@ -489,8 +493,12 @@ def biased_grouped_topk_impl( tmp_scores = scores_for_choice.masked_fill( ~score_mask.bool(), float("-inf") ) # [n, e] + # TODO: NPU can't support directly evaluating a comparison for now _, topk_ids = torch.topk( - tmp_scores, k=topk, dim=-1, sorted=num_fused_shared_experts > 0 + tmp_scores, + k=topk, + dim=-1, + sorted=(True if num_fused_shared_experts > 0 else False), ) topk_weights = scores.gather(1, topk_ids) diff --git a/scripts/npu_ci_install_dependency.sh b/scripts/npu_ci_install_dependency.sh index 3fcb36492..29a28eb01 100755 --- a/scripts/npu_ci_install_dependency.sh +++ b/scripts/npu_ci_install_dependency.sh @@ -1,47 +1,59 @@ #!/bin/bash set -euo pipefail -# Install the required dependencies from cache -sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list -apt update -y -apt install -y build-essential cmake python3-pip python3-dev wget net-tools zlib1g-dev lld clang software-properties-common curl +CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local" +PIP_INSTALL="pip install --no-cache-dir" -# Setup pip cache -pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple -pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local -python3 -m pip install --upgrade pip -pip uninstall sgl-kernel -y || true + +# Update apt & pip sources +sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list +pip config set global.index-url http://${CACHING_URL}/pypi/simple +pip config set global.trusted-host ${CACHING_URL} + + +# Install the required dependencies in CI. +apt update -y && apt install -y \ + build-essential \ + cmake \ + wget \ + curl \ + net-tools \ + zlib1g-dev \ + lld \ + clang \ + locales \ + ccache \ + ca-certificates +update-ca-certificates +python3 -m ${PIP_INSTALL} --upgrade pip ### Download MemFabricV2 MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl" -MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com:443/sglang/${MF_WHL_NAME}" -wget "${MEMFABRIC_URL}" && pip install "./${MF_WHL_NAME}" +MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${MF_WHL_NAME}" +wget "${MEMFABRIC_URL}" && ${PIP_INSTALL} "./${MF_WHL_NAME}" ### Install vLLM VLLM_TAG=v0.8.5 git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG -(cd vllm && VLLM_TARGET_DEVICE="empty" pip install -v -e .) +(cd vllm && VLLM_TARGET_DEVICE="empty" ${PIP_INSTALL} -v -e .) ### Install PyTorch and PTA PYTORCH_VERSION=2.6.0 TORCHVISION_VERSION=0.21.0 -PTA_VERSION=2.6.0rc1 -pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu -pip install torch_npu==$PTA_VERSION +PTA_VERSION=2.6.0 +${PIP_INSTALL} torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu +${PIP_INSTALL} torch_npu==$PTA_VERSION ### Install Triton-Ascend -TRITON_ASCEND_VERSION=3.2.0rc2 -pip install attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11 -pip install triton-ascend==$TRITON_ASCEND_VERSION +TRITON_ASCEND_NAME="triton_ascend-3.2.0.dev20250729-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" +TRITON_ASCEND_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${TRITON_ASCEND_NAME}" +${PIP_INSTALL} attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11 +wget "${TRITON_ASCEND_URL}" && ${PIP_INSTALL} "./${TRITON_ASCEND_NAME}" -pip install -e "python[srt_npu]" - - -### Modify PyTorch TODO: to be removed later -TORCH_LOCATION=$(python3 -c 'import torch; print(torch.__path__[0])') -sed -i 's/from triton.runtime.autotuner import OutOfResources/from triton.runtime.errors import OutOfResources/' "${TORCH_LOCATION}/_inductor/runtime/triton_heuristics.py" +### Install SGLang +${PIP_INSTALL} -v -e "python[srt_npu]" diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 7b43d5175..93b818966 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -154,8 +154,14 @@ suites = { TestFile("test_rope_rocm.py", 3), TestFile("test_awq_dequant.py", 2), ], - "per-commit-npu": [ - TestFile("test_ascend_attention_backend.py", 400), + "per-commit-1-ascend-npu": [ + TestFile("test_ascend_tp1_bf16.py", 400), + ], + "per-commit-2-ascend-npu": [ + TestFile("test_ascend_tp2_bf16.py", 400), + ], + "per-commit-4-ascend-npu": [ + TestFile("test_ascend_mla_w8a8int8.py", 400), ], "per-commit-2-gpu": [ TestFile("models/lora/test_lora_tp.py", 116), diff --git a/test/srt/test_ascend_attention_backend.py b/test/srt/test_ascend_attention_backend.py deleted file mode 100644 index e406fee3c..000000000 --- a/test/srt/test_ascend_attention_backend.py +++ /dev/null @@ -1,62 +0,0 @@ -""" -Usage: -python3 -m unittest test_ascend_attention_backend.TestAscendAttnBackend.test_gsm8k -""" - -import unittest -from types import SimpleNamespace -from urllib.parse import urlparse - -from sglang.srt.utils import kill_process_tree -from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k -from sglang.test.run_eval import run_eval -from sglang.test.test_utils import ( - DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - CustomTestCase, - is_in_ci, - popen_launch_server, - run_bench_offline_throughput, -) - -DEFAULT_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-7B-Instruct" - - -class TestAscendAttnBackend(CustomTestCase): - def test_gsm8k(self): - model = DEFAULT_MODEL_NAME_FOR_TEST - base_url = DEFAULT_URL_FOR_TEST - url = urlparse(base_url) - process = popen_launch_server( - model, - base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--attention-backend", - "ascend", - "--mem-fraction-static", - 0.8, - ], - ) - - try: - args = SimpleNamespace( - num_shots=5, - data_path=None, - num_questions=1319, - max_new_tokens=512, - parallel=128, - host=f"http://{url.hostname}", - port=int(url.port), - ) - - metrics = run_eval_few_shot_gsm8k(args) - self.assertGreaterEqual(metrics["accuracy"], 0.62) - self.assertLessEqual(metrics["latency"], 150) - finally: - kill_process_tree(process.pid) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/srt/test_ascend_mla_backend.py b/test/srt/test_ascend_mla_backend.py deleted file mode 100644 index 0db2f3b3e..000000000 --- a/test/srt/test_ascend_mla_backend.py +++ /dev/null @@ -1,96 +0,0 @@ -""" -Usage: -python3 -m unittest test_ascend_mla_backend.TestAscendMLABackend.test_gsm8k -""" - -import os -import unittest -from types import SimpleNamespace -from urllib.parse import urlparse - -from sglang.srt.utils import kill_process_tree -from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k -from sglang.test.run_eval import run_eval -from sglang.test.test_utils import ( - DEFAULT_MLA_MODEL_NAME_FOR_TEST, - DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - CustomTestCase, - is_in_ci, - popen_launch_server, - run_bench_offline_throughput, -) - -if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ: - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3" -DEFAULT_PORT_FOR_SRT_TEST_RUNNER = ( - 7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100 -) -DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}" -DEFAULT_MODEL_NAME_FOR_TEST = "/models/DeepSeek-V2-Lite-Chat" -if not os.path.exists(DEFAULT_MODEL_NAME_FOR_TEST): - DEFAULT_MODEL_NAME_FOR_TEST = DEFAULT_MLA_MODEL_NAME_FOR_TEST - - -class TestAscendMLABackend(CustomTestCase): - def test_latency(self): - output_throughput = run_bench_offline_throughput( - DEFAULT_MODEL_NAME_FOR_TEST, - [ - "--attention-backend", - "ascend", - "--mem-fraction-static", - 0.7, - "--tp-size", - "4", - "--trust-remote-code", - "--disable-cuda-graph", - ], - ) - - print(f"{output_throughput=}") - - if is_in_ci(): - self.assertGreater(output_throughput, 18) - - def test_gsm8k(self): - model = DEFAULT_MODEL_NAME_FOR_TEST - base_url = DEFAULT_URL_FOR_TEST - url = urlparse(base_url) - process = popen_launch_server( - model, - base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--attention-backend", - "ascend", - "--mem-fraction-static", - 0.7, - "--tp-size", - "4", - "--trust-remote-code", - "--disable-cuda-graph", - ], - ) - - try: - args = SimpleNamespace( - num_shots=5, - data_path=None, - num_questions=128, - max_new_tokens=512, - parallel=128, - host=f"http://{url.hostname}", - port=int(url.port), - ) - - metrics = run_eval_few_shot_gsm8k(args) - self.assertGreaterEqual(metrics["accuracy"], 0.62) - self.assertGreaterEqual(metrics["output_throughput"], 50) - finally: - kill_process_tree(process.pid) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/srt/test_ascend_mla_w8a8int8.py b/test/srt/test_ascend_mla_w8a8int8.py new file mode 100644 index 000000000..cdbc52023 --- /dev/null +++ b/test/srt/test_ascend_mla_w8a8int8.py @@ -0,0 +1,100 @@ +import unittest +from types import SimpleNamespace +from urllib.parse import urlparse + +from sglang.srt.utils import kill_process_tree +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + is_in_ci, + popen_launch_server, + run_bench_offline_throughput, +) + +TEST_MODEL_MATRIX = { + "/root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V2-Lite-W8A8": { + "accuracy": 0.34, + "latency": 1000, + "output_throughput": 6, + }, +} + + +class TestAscendMlaW8A8Int8(CustomTestCase): + + @classmethod + def setUpClass(cls): + cls.models = TEST_MODEL_MATRIX.keys() + cls.base_url = DEFAULT_URL_FOR_TEST + cls.url = urlparse(DEFAULT_URL_FOR_TEST) + cls.common_args = [ + "--trust-remote-code", + "--disable-cuda-graph", + "--mem-fraction-static", + 0.8, + "--attention-backend", + "ascend", + "--quantization", + "w8a8_int8", + "--tp-size", + 4, + ] + + def test_a_gsm8k(self): + for model in self.models: + with self.subTest(model=model): + print(f"##=== Testing accuracy: {model} ===##") + + process = popen_launch_server( + model, + self.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + *self.common_args, + ], + ) + + try: + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=1319, + max_new_tokens=512, + parallel=128, + host=f"http://{self.url.hostname}", + port=int(self.url.port), + ) + + metrics = run_eval_few_shot_gsm8k(args) + self.assertGreaterEqual( + metrics["accuracy"], + TEST_MODEL_MATRIX[model]["accuracy"], + ) + finally: + kill_process_tree(process.pid) + + def test_b_throughput(self): + for model in self.models: + with self.subTest(model=model): + print(f"##=== Testing throughput: {model} ===##") + + output_throughput = run_bench_offline_throughput( + model, + [ + *self.common_args, + ], + ) + + print(f"##=== {model} throughput: {output_throughput} ===##") + + if is_in_ci(): + self.assertGreater( + output_throughput, + TEST_MODEL_MATRIX[model]["output_throughput"], + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_ascend_tp1_bf16.py b/test/srt/test_ascend_tp1_bf16.py new file mode 100644 index 000000000..90fde7a80 --- /dev/null +++ b/test/srt/test_ascend_tp1_bf16.py @@ -0,0 +1,96 @@ +import unittest +from types import SimpleNamespace +from urllib.parse import urlparse + +from sglang.srt.utils import kill_process_tree +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + is_in_ci, + popen_launch_server, + run_bench_offline_throughput, +) + +TEST_MODEL_MATRIX = { + "Qwen/Qwen2.5-7B-Instruct": { + "accuracy": 0.85, + "latency": 150, + "output_throughput": 30, + }, +} + + +class TestAscendTp1Bf16(CustomTestCase): + + @classmethod + def setUpClass(cls): + cls.models = TEST_MODEL_MATRIX.keys() + cls.base_url = DEFAULT_URL_FOR_TEST + cls.url = urlparse(DEFAULT_URL_FOR_TEST) + cls.common_args = [ + "--trust-remote-code", + "--disable-cuda-graph", + "--mem-fraction-static", + 0.8, + "--attention-backend", + "ascend", + ] + + def test_a_gsm8k(self): + for model in self.models: + with self.subTest(model=model): + print(f"##=== Testing accuracy: {model} ===##") + + process = popen_launch_server( + model, + self.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + *self.common_args, + ], + ) + + try: + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=1319, + max_new_tokens=512, + parallel=128, + host=f"http://{self.url.hostname}", + port=int(self.url.port), + ) + + metrics = run_eval_few_shot_gsm8k(args) + self.assertGreaterEqual( + metrics["accuracy"], + TEST_MODEL_MATRIX[model]["accuracy"], + ) + finally: + kill_process_tree(process.pid) + + def test_b_throughput(self): + for model in self.models: + with self.subTest(model=model): + print(f"##=== Testing throughput: {model} ===##") + + output_throughput = run_bench_offline_throughput( + model, + [ + *self.common_args, + ], + ) + + print(f"##=== {model} throughput: {output_throughput} ===##") + + if is_in_ci(): + self.assertGreater( + output_throughput, + TEST_MODEL_MATRIX[model]["output_throughput"], + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_ascend_tp2_bf16.py b/test/srt/test_ascend_tp2_bf16.py new file mode 100644 index 000000000..d5e141c9f --- /dev/null +++ b/test/srt/test_ascend_tp2_bf16.py @@ -0,0 +1,98 @@ +import unittest +from types import SimpleNamespace +from urllib.parse import urlparse + +from sglang.srt.utils import kill_process_tree +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + is_in_ci, + popen_launch_server, + run_bench_offline_throughput, +) + +TEST_MODEL_MATRIX = { + "Qwen/Qwen2.5-7B-Instruct": { + "accuracy": 0.85, + "latency": 180, + "output_throughput": 20, + }, +} + + +class TestAscendTp2Bf16(CustomTestCase): + + @classmethod + def setUpClass(cls): + cls.models = TEST_MODEL_MATRIX.keys() + cls.base_url = DEFAULT_URL_FOR_TEST + cls.url = urlparse(DEFAULT_URL_FOR_TEST) + cls.common_args = [ + "--trust-remote-code", + "--disable-cuda-graph", + "--mem-fraction-static", + 0.8, + "--attention-backend", + "ascend", + "--tp-size", + 2, + ] + + def test_a_gsm8k(self): + for model in self.models: + with self.subTest(model=model): + print(f"##=== Testing accuracy: {model} ===##") + + process = popen_launch_server( + model, + self.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + *self.common_args, + ], + ) + + try: + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=1319, + max_new_tokens=512, + parallel=128, + host=f"http://{self.url.hostname}", + port=int(self.url.port), + ) + + metrics = run_eval_few_shot_gsm8k(args) + self.assertGreaterEqual( + metrics["accuracy"], + TEST_MODEL_MATRIX[model]["accuracy"], + ) + finally: + kill_process_tree(process.pid) + + def test_b_throughput(self): + for model in self.models: + with self.subTest(model=model): + print(f"##=== Testing throughput: {model} ===##") + + output_throughput = run_bench_offline_throughput( + model, + [ + *self.common_args, + ], + ) + + print(f"##=== {model} throughput: {output_throughput} ===##") + + if is_in_ci(): + self.assertGreater( + output_throughput, + TEST_MODEL_MATRIX[model]["output_throughput"], + ) + + +if __name__ == "__main__": + unittest.main()