diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 265e1374b..3b33b319d 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -266,7 +266,7 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
-          # USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+          USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
 
       - name: Benchmark single latency + torch.compile (TP=2)
         timeout-minutes: 10
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 6078abdd2..3336823b3 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -44,7 +44,7 @@ runtime_common = [
 
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.0.4",
+    "sgl-kernel==0.0.4.post1",
     "flashinfer_python==0.2.2.post1",
     "torch==2.5.1",
     "vllm>=0.6.4.post1,<=0.7.2",
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 5aafcc270..6c52709f7 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -480,7 +480,7 @@ class ServerArgs:
             "--chunked-prefill-size",
             type=int,
             default=ServerArgs.chunked_prefill_size,
-            help="The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill",
+            help="The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill.",
         )
         parser.add_argument(
             "--max-prefill-tokens",
@@ -505,7 +505,7 @@ class ServerArgs:
             "--cpu-offload-gb",
             type=int,
             default=ServerArgs.cpu_offload_gb,
-            help="How many GBs of RAM to reserve for CPU offloading",
+            help="How many GBs of RAM to reserve for CPU offloading.",
         )
 
     # Other runtime options
diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh
index 408adbaf8..b1cf2eb44 100755
--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci_install_dependency.sh
@@ -26,4 +26,4 @@ pip install transformers==4.45.2 sentence_transformers accelerate peft pandas da
 pip install cuda-python nvidia-cuda-nvrtc-cu12
 
 # reinstall sgl-kernel
-pip install sgl-kernel==0.0.4 --force-reinstall --no-deps
+pip install sgl-kernel==0.0.4.post1 --force-reinstall --no-deps