diff --git a/.github/workflows/pr-test-h20.yml b/.github/workflows/pr-test-h20.yml index e283ea42f..1955dc2d8 100644 --- a/.github/workflows/pr-test-h20.yml +++ b/.github/workflows/pr-test-h20.yml @@ -59,7 +59,7 @@ jobs: cd test/srt python3 run_suite.py --suite per-commit-8-gpu-h20 - pr-test-finish: + pr-test-h20-finish: needs: [ check-changes, per-commit-8-gpu-h20, diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8f7455904..a295f2eb4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,7 +38,7 @@ repos: hooks: - id: codespell additional_dependencies: ['tomli'] - args: ['--toml', 'python/pyproject.toml', '-L', 'cann,thi'] + args: ['--toml', 'python/pyproject.toml', '-L', 'cann,thi,makro,wil,rouge'] exclude: | (?x)^( test/srt/test_reasoning_parser\.py| diff --git a/docs/platforms/ascend_npu.md b/docs/platforms/ascend_npu.md index 53fc009fb..a21a95b60 100644 --- a/docs/platforms/ascend_npu.md +++ b/docs/platforms/ascend_npu.md @@ -1,4 +1,4 @@ -# SGLang on Ascend NPUs +# Ascend NPUs You can install SGLang using any of the methods below. Please go through `System Settings` section to ensure the clusters are roaring at max performance. Feel free to leave an issue [here at sglang](https://github.com/sgl-project/sglang/issues) if you encounter any issues or have any problems. diff --git a/python/pyproject.toml b/python/pyproject.toml index 4e619d3e3..2543e7c1a 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -85,8 +85,11 @@ srt_hip = [ "wave-lang==1.0.1", ] -# CPU: torch wheel for CPU needs to be installed from https://download.pytorch.org/whl/cpu -srt_cpu = ["sglang[runtime_common]", "einops"] +# https://docs.sglang.ai/platforms/cpu_server.html +srt_cpu = ["sglang[runtime_common]"] + +# https://docs.sglang.ai/platforms/ascend_npu.html +srt_npu = ["sglang[runtime_common]"] # xpu is not enabled in public vllm and torch whl, # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm @@ -96,9 +99,6 @@ srt_xpu = ["sglang[runtime_common]"] # https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html srt_hpu = ["sglang[runtime_common]"] -# https://vllm-ascend.readthedocs.io/en/latest/installation.html -srt_npu = ["sglang[runtime_common]"] - openai = ["openai==1.99.1", "tiktoken"] anthropic = ["anthropic>=0.20.0"] litellm = ["litellm>=1.0.0"] diff --git a/python/sglang/bench_one_batch_server.py b/python/sglang/bench_one_batch_server.py index d925ae8ce..8ab952559 100644 --- a/python/sglang/bench_one_batch_server.py +++ b/python/sglang/bench_one_batch_server.py @@ -26,7 +26,7 @@ from sglang.bench_serving import get_tokenizer, sample_random_requests from sglang.profiler import run_profile from sglang.srt.entrypoints.http_server import launch_server from sglang.srt.server_args import ServerArgs -from sglang.srt.utils import kill_process_tree +from sglang.srt.utils import is_blackwell, kill_process_tree from sglang.test.test_utils import is_in_ci, write_github_step_summary @@ -363,7 +363,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs): acc_length, trace_link, ) in result: - hourly_cost = 2 * server_args.tp_size # $2/hour for one H100 + if is_blackwell(): + hourly_cost_per_gpu = 4 # $4/hour for one B200 + else: + hourly_cost_per_gpu = 2 # $2/hour for one H100 + + hourly_cost = hourly_cost_per_gpu * server_args.tp_size input_util = 0.7 accept_length = round(acc_length, 2) if acc_length is not None else "n/a" line = ( diff --git a/python/sglang/profiler.py b/python/sglang/profiler.py index d872ca320..3503ae7fc 100644 --- a/python/sglang/profiler.py +++ b/python/sglang/profiler.py @@ -9,6 +9,7 @@ import argparse import json import os import time +import urllib.parse from argparse import ArgumentParser from pathlib import Path from typing import List, Optional