[CI] Ascend NPU CI enhancement (#8294)
Co-authored-by: ronnie_zheng <zl19940307@163.com>
This commit is contained in:
70
.github/workflows/pr-test-npu.yml
vendored
70
.github/workflows/pr-test-npu.yml
vendored
@@ -22,7 +22,7 @@ concurrency:
|
|||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
unit-test-basic:
|
per-commit-1-ascend-npu:
|
||||||
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||||
github.event.pull_request.draft == false
|
github.event.pull_request.draft == false
|
||||||
runs-on: linux-arm64-npu-1
|
runs-on: linux-arm64-npu-1
|
||||||
@@ -44,13 +44,77 @@ jobs:
|
|||||||
timeout-minutes: 30
|
timeout-minutes: 30
|
||||||
env:
|
env:
|
||||||
SGLANG_USE_MODELSCOPE: true
|
SGLANG_USE_MODELSCOPE: true
|
||||||
|
SGLANG_IS_IN_CI: true
|
||||||
HF_ENDPOINT: https://hf-mirror.com
|
HF_ENDPOINT: https://hf-mirror.com
|
||||||
|
TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
|
||||||
run: |
|
run: |
|
||||||
cd test/srt
|
cd test/srt
|
||||||
python3 run_suite.py --suite per-commit-npu
|
python3 run_suite.py --suite per-commit-1-ascend-npu
|
||||||
|
|
||||||
|
per-commit-2-ascend-npu:
|
||||||
|
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||||
|
github.event.pull_request.draft == false
|
||||||
|
runs-on: linux-arm64-npu-2
|
||||||
|
container:
|
||||||
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1.alpha003-910b-ubuntu22.04-py3.11
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
bash scripts/npu_ci_install_dependency.sh
|
||||||
|
# copy required file from our daily cache
|
||||||
|
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
|
||||||
|
# copy download through proxy
|
||||||
|
curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
|
||||||
|
|
||||||
|
- name: Run test
|
||||||
|
timeout-minutes: 30
|
||||||
|
env:
|
||||||
|
SGLANG_USE_MODELSCOPE: true
|
||||||
|
SGLANG_IS_IN_CI: true
|
||||||
|
HF_ENDPOINT: https://hf-mirror.com
|
||||||
|
TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
|
||||||
|
run: |
|
||||||
|
cd test/srt
|
||||||
|
python3 run_suite.py --suite per-commit-2-ascend-npu
|
||||||
|
|
||||||
|
per-commit-4-ascend-npu:
|
||||||
|
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||||
|
github.event.pull_request.draft == false
|
||||||
|
runs-on: linux-arm64-npu-4
|
||||||
|
container:
|
||||||
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1.alpha003-910b-ubuntu22.04-py3.11
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
bash scripts/npu_ci_install_dependency.sh
|
||||||
|
# copy required file from our daily cache
|
||||||
|
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
|
||||||
|
# copy download through proxy
|
||||||
|
curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
|
||||||
|
|
||||||
|
- name: Run test
|
||||||
|
timeout-minutes: 30
|
||||||
|
env:
|
||||||
|
SGLANG_USE_MODELSCOPE: true
|
||||||
|
SGLANG_IS_IN_CI: true
|
||||||
|
HF_ENDPOINT: https://hf-mirror.com
|
||||||
|
TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
|
||||||
|
run: |
|
||||||
|
cd test/srt
|
||||||
|
python3 run_suite.py --suite per-commit-4-ascend-npu --timeout-per-file 3600
|
||||||
|
|
||||||
finish:
|
finish:
|
||||||
if: always()
|
if: always()
|
||||||
needs: [ unit-test-basic ]
|
needs:
|
||||||
|
- per-commit-1-ascend-npu
|
||||||
|
- per-commit-2-ascend-npu
|
||||||
|
- per-commit-4-ascend-npu
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Check all dependent job statuses
|
- name: Check all dependent job statuses
|
||||||
|
|||||||
@@ -398,8 +398,12 @@ def grouped_topk_gpu(
|
|||||||
.reshape(num_token, -1)
|
.reshape(num_token, -1)
|
||||||
) # [n, e]
|
) # [n, e]
|
||||||
tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e]
|
tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e]
|
||||||
|
# TODO: NPU can't support directly evaluating a comparison for now
|
||||||
topk_weights, topk_ids = torch.topk(
|
topk_weights, topk_ids = torch.topk(
|
||||||
tmp_scores, k=topk, dim=-1, sorted=num_fused_shared_experts > 0
|
tmp_scores,
|
||||||
|
k=topk,
|
||||||
|
dim=-1,
|
||||||
|
sorted=(True if num_fused_shared_experts > 0 else False),
|
||||||
)
|
)
|
||||||
if num_fused_shared_experts:
|
if num_fused_shared_experts:
|
||||||
topk_ids[:, -1] = torch.randint(
|
topk_ids[:, -1] = torch.randint(
|
||||||
@@ -489,8 +493,12 @@ def biased_grouped_topk_impl(
|
|||||||
tmp_scores = scores_for_choice.masked_fill(
|
tmp_scores = scores_for_choice.masked_fill(
|
||||||
~score_mask.bool(), float("-inf")
|
~score_mask.bool(), float("-inf")
|
||||||
) # [n, e]
|
) # [n, e]
|
||||||
|
# TODO: NPU can't support directly evaluating a comparison for now
|
||||||
_, topk_ids = torch.topk(
|
_, topk_ids = torch.topk(
|
||||||
tmp_scores, k=topk, dim=-1, sorted=num_fused_shared_experts > 0
|
tmp_scores,
|
||||||
|
k=topk,
|
||||||
|
dim=-1,
|
||||||
|
sorted=(True if num_fused_shared_experts > 0 else False),
|
||||||
)
|
)
|
||||||
topk_weights = scores.gather(1, topk_ids)
|
topk_weights = scores.gather(1, topk_ids)
|
||||||
|
|
||||||
|
|||||||
@@ -1,47 +1,59 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# Install the required dependencies from cache
|
CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
|
||||||
sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
|
PIP_INSTALL="pip install --no-cache-dir"
|
||||||
apt update -y
|
|
||||||
apt install -y build-essential cmake python3-pip python3-dev wget net-tools zlib1g-dev lld clang software-properties-common curl
|
|
||||||
|
|
||||||
# Setup pip cache
|
|
||||||
pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
|
# Update apt & pip sources
|
||||||
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
|
sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
|
||||||
python3 -m pip install --upgrade pip
|
pip config set global.index-url http://${CACHING_URL}/pypi/simple
|
||||||
pip uninstall sgl-kernel -y || true
|
pip config set global.trusted-host ${CACHING_URL}
|
||||||
|
|
||||||
|
|
||||||
|
# Install the required dependencies in CI.
|
||||||
|
apt update -y && apt install -y \
|
||||||
|
build-essential \
|
||||||
|
cmake \
|
||||||
|
wget \
|
||||||
|
curl \
|
||||||
|
net-tools \
|
||||||
|
zlib1g-dev \
|
||||||
|
lld \
|
||||||
|
clang \
|
||||||
|
locales \
|
||||||
|
ccache \
|
||||||
|
ca-certificates
|
||||||
|
update-ca-certificates
|
||||||
|
python3 -m ${PIP_INSTALL} --upgrade pip
|
||||||
|
|
||||||
|
|
||||||
### Download MemFabricV2
|
### Download MemFabricV2
|
||||||
MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
|
MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
|
||||||
MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com:443/sglang/${MF_WHL_NAME}"
|
MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${MF_WHL_NAME}"
|
||||||
wget "${MEMFABRIC_URL}" && pip install "./${MF_WHL_NAME}"
|
wget "${MEMFABRIC_URL}" && ${PIP_INSTALL} "./${MF_WHL_NAME}"
|
||||||
|
|
||||||
|
|
||||||
### Install vLLM
|
### Install vLLM
|
||||||
VLLM_TAG=v0.8.5
|
VLLM_TAG=v0.8.5
|
||||||
git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG
|
git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG
|
||||||
(cd vllm && VLLM_TARGET_DEVICE="empty" pip install -v -e .)
|
(cd vllm && VLLM_TARGET_DEVICE="empty" ${PIP_INSTALL} -v -e .)
|
||||||
|
|
||||||
|
|
||||||
### Install PyTorch and PTA
|
### Install PyTorch and PTA
|
||||||
PYTORCH_VERSION=2.6.0
|
PYTORCH_VERSION=2.6.0
|
||||||
TORCHVISION_VERSION=0.21.0
|
TORCHVISION_VERSION=0.21.0
|
||||||
PTA_VERSION=2.6.0rc1
|
PTA_VERSION=2.6.0
|
||||||
pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu
|
${PIP_INSTALL} torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu
|
||||||
pip install torch_npu==$PTA_VERSION
|
${PIP_INSTALL} torch_npu==$PTA_VERSION
|
||||||
|
|
||||||
|
|
||||||
### Install Triton-Ascend
|
### Install Triton-Ascend
|
||||||
TRITON_ASCEND_VERSION=3.2.0rc2
|
TRITON_ASCEND_NAME="triton_ascend-3.2.0.dev20250729-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl"
|
||||||
pip install attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11
|
TRITON_ASCEND_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${TRITON_ASCEND_NAME}"
|
||||||
pip install triton-ascend==$TRITON_ASCEND_VERSION
|
${PIP_INSTALL} attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11
|
||||||
|
wget "${TRITON_ASCEND_URL}" && ${PIP_INSTALL} "./${TRITON_ASCEND_NAME}"
|
||||||
|
|
||||||
|
|
||||||
pip install -e "python[srt_npu]"
|
### Install SGLang
|
||||||
|
${PIP_INSTALL} -v -e "python[srt_npu]"
|
||||||
|
|
||||||
### Modify PyTorch TODO: to be removed later
|
|
||||||
TORCH_LOCATION=$(python3 -c 'import torch; print(torch.__path__[0])')
|
|
||||||
sed -i 's/from triton.runtime.autotuner import OutOfResources/from triton.runtime.errors import OutOfResources/' "${TORCH_LOCATION}/_inductor/runtime/triton_heuristics.py"
|
|
||||||
|
|||||||
@@ -154,8 +154,14 @@ suites = {
|
|||||||
TestFile("test_rope_rocm.py", 3),
|
TestFile("test_rope_rocm.py", 3),
|
||||||
TestFile("test_awq_dequant.py", 2),
|
TestFile("test_awq_dequant.py", 2),
|
||||||
],
|
],
|
||||||
"per-commit-npu": [
|
"per-commit-1-ascend-npu": [
|
||||||
TestFile("test_ascend_attention_backend.py", 400),
|
TestFile("test_ascend_tp1_bf16.py", 400),
|
||||||
|
],
|
||||||
|
"per-commit-2-ascend-npu": [
|
||||||
|
TestFile("test_ascend_tp2_bf16.py", 400),
|
||||||
|
],
|
||||||
|
"per-commit-4-ascend-npu": [
|
||||||
|
TestFile("test_ascend_mla_w8a8int8.py", 400),
|
||||||
],
|
],
|
||||||
"per-commit-2-gpu": [
|
"per-commit-2-gpu": [
|
||||||
TestFile("models/lora/test_lora_tp.py", 116),
|
TestFile("models/lora/test_lora_tp.py", 116),
|
||||||
|
|||||||
@@ -1,62 +0,0 @@
|
|||||||
"""
|
|
||||||
Usage:
|
|
||||||
python3 -m unittest test_ascend_attention_backend.TestAscendAttnBackend.test_gsm8k
|
|
||||||
"""
|
|
||||||
|
|
||||||
import unittest
|
|
||||||
from types import SimpleNamespace
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
from sglang.srt.utils import kill_process_tree
|
|
||||||
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
|
|
||||||
from sglang.test.run_eval import run_eval
|
|
||||||
from sglang.test.test_utils import (
|
|
||||||
DEFAULT_MODEL_NAME_FOR_TEST,
|
|
||||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
|
||||||
DEFAULT_URL_FOR_TEST,
|
|
||||||
CustomTestCase,
|
|
||||||
is_in_ci,
|
|
||||||
popen_launch_server,
|
|
||||||
run_bench_offline_throughput,
|
|
||||||
)
|
|
||||||
|
|
||||||
DEFAULT_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-7B-Instruct"
|
|
||||||
|
|
||||||
|
|
||||||
class TestAscendAttnBackend(CustomTestCase):
|
|
||||||
def test_gsm8k(self):
|
|
||||||
model = DEFAULT_MODEL_NAME_FOR_TEST
|
|
||||||
base_url = DEFAULT_URL_FOR_TEST
|
|
||||||
url = urlparse(base_url)
|
|
||||||
process = popen_launch_server(
|
|
||||||
model,
|
|
||||||
base_url,
|
|
||||||
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
|
||||||
other_args=[
|
|
||||||
"--attention-backend",
|
|
||||||
"ascend",
|
|
||||||
"--mem-fraction-static",
|
|
||||||
0.8,
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
args = SimpleNamespace(
|
|
||||||
num_shots=5,
|
|
||||||
data_path=None,
|
|
||||||
num_questions=1319,
|
|
||||||
max_new_tokens=512,
|
|
||||||
parallel=128,
|
|
||||||
host=f"http://{url.hostname}",
|
|
||||||
port=int(url.port),
|
|
||||||
)
|
|
||||||
|
|
||||||
metrics = run_eval_few_shot_gsm8k(args)
|
|
||||||
self.assertGreaterEqual(metrics["accuracy"], 0.62)
|
|
||||||
self.assertLessEqual(metrics["latency"], 150)
|
|
||||||
finally:
|
|
||||||
kill_process_tree(process.pid)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
unittest.main()
|
|
||||||
@@ -1,96 +0,0 @@
|
|||||||
"""
|
|
||||||
Usage:
|
|
||||||
python3 -m unittest test_ascend_mla_backend.TestAscendMLABackend.test_gsm8k
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import unittest
|
|
||||||
from types import SimpleNamespace
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
from sglang.srt.utils import kill_process_tree
|
|
||||||
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
|
|
||||||
from sglang.test.run_eval import run_eval
|
|
||||||
from sglang.test.test_utils import (
|
|
||||||
DEFAULT_MLA_MODEL_NAME_FOR_TEST,
|
|
||||||
DEFAULT_MODEL_NAME_FOR_TEST,
|
|
||||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
|
||||||
DEFAULT_URL_FOR_TEST,
|
|
||||||
CustomTestCase,
|
|
||||||
is_in_ci,
|
|
||||||
popen_launch_server,
|
|
||||||
run_bench_offline_throughput,
|
|
||||||
)
|
|
||||||
|
|
||||||
if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
|
|
||||||
os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3"
|
|
||||||
DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
|
|
||||||
7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
|
|
||||||
)
|
|
||||||
DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
|
|
||||||
DEFAULT_MODEL_NAME_FOR_TEST = "/models/DeepSeek-V2-Lite-Chat"
|
|
||||||
if not os.path.exists(DEFAULT_MODEL_NAME_FOR_TEST):
|
|
||||||
DEFAULT_MODEL_NAME_FOR_TEST = DEFAULT_MLA_MODEL_NAME_FOR_TEST
|
|
||||||
|
|
||||||
|
|
||||||
class TestAscendMLABackend(CustomTestCase):
|
|
||||||
def test_latency(self):
|
|
||||||
output_throughput = run_bench_offline_throughput(
|
|
||||||
DEFAULT_MODEL_NAME_FOR_TEST,
|
|
||||||
[
|
|
||||||
"--attention-backend",
|
|
||||||
"ascend",
|
|
||||||
"--mem-fraction-static",
|
|
||||||
0.7,
|
|
||||||
"--tp-size",
|
|
||||||
"4",
|
|
||||||
"--trust-remote-code",
|
|
||||||
"--disable-cuda-graph",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
print(f"{output_throughput=}")
|
|
||||||
|
|
||||||
if is_in_ci():
|
|
||||||
self.assertGreater(output_throughput, 18)
|
|
||||||
|
|
||||||
def test_gsm8k(self):
|
|
||||||
model = DEFAULT_MODEL_NAME_FOR_TEST
|
|
||||||
base_url = DEFAULT_URL_FOR_TEST
|
|
||||||
url = urlparse(base_url)
|
|
||||||
process = popen_launch_server(
|
|
||||||
model,
|
|
||||||
base_url,
|
|
||||||
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
|
||||||
other_args=[
|
|
||||||
"--attention-backend",
|
|
||||||
"ascend",
|
|
||||||
"--mem-fraction-static",
|
|
||||||
0.7,
|
|
||||||
"--tp-size",
|
|
||||||
"4",
|
|
||||||
"--trust-remote-code",
|
|
||||||
"--disable-cuda-graph",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
args = SimpleNamespace(
|
|
||||||
num_shots=5,
|
|
||||||
data_path=None,
|
|
||||||
num_questions=128,
|
|
||||||
max_new_tokens=512,
|
|
||||||
parallel=128,
|
|
||||||
host=f"http://{url.hostname}",
|
|
||||||
port=int(url.port),
|
|
||||||
)
|
|
||||||
|
|
||||||
metrics = run_eval_few_shot_gsm8k(args)
|
|
||||||
self.assertGreaterEqual(metrics["accuracy"], 0.62)
|
|
||||||
self.assertGreaterEqual(metrics["output_throughput"], 50)
|
|
||||||
finally:
|
|
||||||
kill_process_tree(process.pid)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
unittest.main()
|
|
||||||
100
test/srt/test_ascend_mla_w8a8int8.py
Normal file
100
test/srt/test_ascend_mla_w8a8int8.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
import unittest
|
||||||
|
from types import SimpleNamespace
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from sglang.srt.utils import kill_process_tree
|
||||||
|
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
|
||||||
|
from sglang.test.test_utils import (
|
||||||
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
|
DEFAULT_URL_FOR_TEST,
|
||||||
|
CustomTestCase,
|
||||||
|
is_in_ci,
|
||||||
|
popen_launch_server,
|
||||||
|
run_bench_offline_throughput,
|
||||||
|
)
|
||||||
|
|
||||||
|
TEST_MODEL_MATRIX = {
|
||||||
|
"/root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V2-Lite-W8A8": {
|
||||||
|
"accuracy": 0.34,
|
||||||
|
"latency": 1000,
|
||||||
|
"output_throughput": 6,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class TestAscendMlaW8A8Int8(CustomTestCase):
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def setUpClass(cls):
|
||||||
|
cls.models = TEST_MODEL_MATRIX.keys()
|
||||||
|
cls.base_url = DEFAULT_URL_FOR_TEST
|
||||||
|
cls.url = urlparse(DEFAULT_URL_FOR_TEST)
|
||||||
|
cls.common_args = [
|
||||||
|
"--trust-remote-code",
|
||||||
|
"--disable-cuda-graph",
|
||||||
|
"--mem-fraction-static",
|
||||||
|
0.8,
|
||||||
|
"--attention-backend",
|
||||||
|
"ascend",
|
||||||
|
"--quantization",
|
||||||
|
"w8a8_int8",
|
||||||
|
"--tp-size",
|
||||||
|
4,
|
||||||
|
]
|
||||||
|
|
||||||
|
def test_a_gsm8k(self):
|
||||||
|
for model in self.models:
|
||||||
|
with self.subTest(model=model):
|
||||||
|
print(f"##=== Testing accuracy: {model} ===##")
|
||||||
|
|
||||||
|
process = popen_launch_server(
|
||||||
|
model,
|
||||||
|
self.base_url,
|
||||||
|
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
|
other_args=[
|
||||||
|
*self.common_args,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
args = SimpleNamespace(
|
||||||
|
num_shots=5,
|
||||||
|
data_path=None,
|
||||||
|
num_questions=1319,
|
||||||
|
max_new_tokens=512,
|
||||||
|
parallel=128,
|
||||||
|
host=f"http://{self.url.hostname}",
|
||||||
|
port=int(self.url.port),
|
||||||
|
)
|
||||||
|
|
||||||
|
metrics = run_eval_few_shot_gsm8k(args)
|
||||||
|
self.assertGreaterEqual(
|
||||||
|
metrics["accuracy"],
|
||||||
|
TEST_MODEL_MATRIX[model]["accuracy"],
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
kill_process_tree(process.pid)
|
||||||
|
|
||||||
|
def test_b_throughput(self):
|
||||||
|
for model in self.models:
|
||||||
|
with self.subTest(model=model):
|
||||||
|
print(f"##=== Testing throughput: {model} ===##")
|
||||||
|
|
||||||
|
output_throughput = run_bench_offline_throughput(
|
||||||
|
model,
|
||||||
|
[
|
||||||
|
*self.common_args,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"##=== {model} throughput: {output_throughput} ===##")
|
||||||
|
|
||||||
|
if is_in_ci():
|
||||||
|
self.assertGreater(
|
||||||
|
output_throughput,
|
||||||
|
TEST_MODEL_MATRIX[model]["output_throughput"],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
96
test/srt/test_ascend_tp1_bf16.py
Normal file
96
test/srt/test_ascend_tp1_bf16.py
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
import unittest
|
||||||
|
from types import SimpleNamespace
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from sglang.srt.utils import kill_process_tree
|
||||||
|
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
|
||||||
|
from sglang.test.test_utils import (
|
||||||
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
|
DEFAULT_URL_FOR_TEST,
|
||||||
|
CustomTestCase,
|
||||||
|
is_in_ci,
|
||||||
|
popen_launch_server,
|
||||||
|
run_bench_offline_throughput,
|
||||||
|
)
|
||||||
|
|
||||||
|
TEST_MODEL_MATRIX = {
|
||||||
|
"Qwen/Qwen2.5-7B-Instruct": {
|
||||||
|
"accuracy": 0.85,
|
||||||
|
"latency": 150,
|
||||||
|
"output_throughput": 30,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class TestAscendTp1Bf16(CustomTestCase):
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def setUpClass(cls):
|
||||||
|
cls.models = TEST_MODEL_MATRIX.keys()
|
||||||
|
cls.base_url = DEFAULT_URL_FOR_TEST
|
||||||
|
cls.url = urlparse(DEFAULT_URL_FOR_TEST)
|
||||||
|
cls.common_args = [
|
||||||
|
"--trust-remote-code",
|
||||||
|
"--disable-cuda-graph",
|
||||||
|
"--mem-fraction-static",
|
||||||
|
0.8,
|
||||||
|
"--attention-backend",
|
||||||
|
"ascend",
|
||||||
|
]
|
||||||
|
|
||||||
|
def test_a_gsm8k(self):
|
||||||
|
for model in self.models:
|
||||||
|
with self.subTest(model=model):
|
||||||
|
print(f"##=== Testing accuracy: {model} ===##")
|
||||||
|
|
||||||
|
process = popen_launch_server(
|
||||||
|
model,
|
||||||
|
self.base_url,
|
||||||
|
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
|
other_args=[
|
||||||
|
*self.common_args,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
args = SimpleNamespace(
|
||||||
|
num_shots=5,
|
||||||
|
data_path=None,
|
||||||
|
num_questions=1319,
|
||||||
|
max_new_tokens=512,
|
||||||
|
parallel=128,
|
||||||
|
host=f"http://{self.url.hostname}",
|
||||||
|
port=int(self.url.port),
|
||||||
|
)
|
||||||
|
|
||||||
|
metrics = run_eval_few_shot_gsm8k(args)
|
||||||
|
self.assertGreaterEqual(
|
||||||
|
metrics["accuracy"],
|
||||||
|
TEST_MODEL_MATRIX[model]["accuracy"],
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
kill_process_tree(process.pid)
|
||||||
|
|
||||||
|
def test_b_throughput(self):
|
||||||
|
for model in self.models:
|
||||||
|
with self.subTest(model=model):
|
||||||
|
print(f"##=== Testing throughput: {model} ===##")
|
||||||
|
|
||||||
|
output_throughput = run_bench_offline_throughput(
|
||||||
|
model,
|
||||||
|
[
|
||||||
|
*self.common_args,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"##=== {model} throughput: {output_throughput} ===##")
|
||||||
|
|
||||||
|
if is_in_ci():
|
||||||
|
self.assertGreater(
|
||||||
|
output_throughput,
|
||||||
|
TEST_MODEL_MATRIX[model]["output_throughput"],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
98
test/srt/test_ascend_tp2_bf16.py
Normal file
98
test/srt/test_ascend_tp2_bf16.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
import unittest
|
||||||
|
from types import SimpleNamespace
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from sglang.srt.utils import kill_process_tree
|
||||||
|
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
|
||||||
|
from sglang.test.test_utils import (
|
||||||
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
|
DEFAULT_URL_FOR_TEST,
|
||||||
|
CustomTestCase,
|
||||||
|
is_in_ci,
|
||||||
|
popen_launch_server,
|
||||||
|
run_bench_offline_throughput,
|
||||||
|
)
|
||||||
|
|
||||||
|
TEST_MODEL_MATRIX = {
|
||||||
|
"Qwen/Qwen2.5-7B-Instruct": {
|
||||||
|
"accuracy": 0.85,
|
||||||
|
"latency": 180,
|
||||||
|
"output_throughput": 20,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class TestAscendTp2Bf16(CustomTestCase):
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def setUpClass(cls):
|
||||||
|
cls.models = TEST_MODEL_MATRIX.keys()
|
||||||
|
cls.base_url = DEFAULT_URL_FOR_TEST
|
||||||
|
cls.url = urlparse(DEFAULT_URL_FOR_TEST)
|
||||||
|
cls.common_args = [
|
||||||
|
"--trust-remote-code",
|
||||||
|
"--disable-cuda-graph",
|
||||||
|
"--mem-fraction-static",
|
||||||
|
0.8,
|
||||||
|
"--attention-backend",
|
||||||
|
"ascend",
|
||||||
|
"--tp-size",
|
||||||
|
2,
|
||||||
|
]
|
||||||
|
|
||||||
|
def test_a_gsm8k(self):
|
||||||
|
for model in self.models:
|
||||||
|
with self.subTest(model=model):
|
||||||
|
print(f"##=== Testing accuracy: {model} ===##")
|
||||||
|
|
||||||
|
process = popen_launch_server(
|
||||||
|
model,
|
||||||
|
self.base_url,
|
||||||
|
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
|
other_args=[
|
||||||
|
*self.common_args,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
args = SimpleNamespace(
|
||||||
|
num_shots=5,
|
||||||
|
data_path=None,
|
||||||
|
num_questions=1319,
|
||||||
|
max_new_tokens=512,
|
||||||
|
parallel=128,
|
||||||
|
host=f"http://{self.url.hostname}",
|
||||||
|
port=int(self.url.port),
|
||||||
|
)
|
||||||
|
|
||||||
|
metrics = run_eval_few_shot_gsm8k(args)
|
||||||
|
self.assertGreaterEqual(
|
||||||
|
metrics["accuracy"],
|
||||||
|
TEST_MODEL_MATRIX[model]["accuracy"],
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
kill_process_tree(process.pid)
|
||||||
|
|
||||||
|
def test_b_throughput(self):
|
||||||
|
for model in self.models:
|
||||||
|
with self.subTest(model=model):
|
||||||
|
print(f"##=== Testing throughput: {model} ===##")
|
||||||
|
|
||||||
|
output_throughput = run_bench_offline_throughput(
|
||||||
|
model,
|
||||||
|
[
|
||||||
|
*self.common_args,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"##=== {model} throughput: {output_throughput} ===##")
|
||||||
|
|
||||||
|
if is_in_ci():
|
||||||
|
self.assertGreater(
|
||||||
|
output_throughput,
|
||||||
|
TEST_MODEL_MATRIX[model]["output_throughput"],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
Reference in New Issue
Block a user