Sync from v0.13
@@ -1,25 +1,30 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Containing tests that check for regressions in vLLM's behavior.

 It should include tests that are reported by users and making sure they
 will never happen again.

 """
+
 import gc

+import pytest
 import torch

 from vllm import LLM, SamplingParams


+@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
 def test_duplicated_ignored_sequence_group():
     """https://github.com/vllm-project/vllm/issues/1655"""

-    sampling_params = SamplingParams(temperature=0.01,
-                                     top_p=0.1,
-                                     max_tokens=256)
-    llm = LLM(model="facebook/opt-125m",
-              max_num_batched_tokens=4096,
-              tensor_parallel_size=1)
+    sampling_params = SamplingParams(temperature=0.01, top_p=0.1, max_tokens=256)
+    llm = LLM(
+        model="distilbert/distilgpt2",
+        max_num_batched_tokens=4096,
+        tensor_parallel_size=1,
+    )
     prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
     outputs = llm.generate(prompts, sampling_params=sampling_params)

@@ -27,12 +32,12 @@ def test_duplicated_ignored_sequence_group():


 def test_max_tokens_none():
-    sampling_params = SamplingParams(temperature=0.01,
-                                     top_p=0.1,
-                                     max_tokens=None)
-    llm = LLM(model="facebook/opt-125m",
-              max_num_batched_tokens=4096,
-              tensor_parallel_size=1)
+    sampling_params = SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None)
+    llm = LLM(
+        model="distilbert/distilgpt2",
+        max_num_batched_tokens=4096,
+        tensor_parallel_size=1,
+    )
     prompts = ["Just say hello!"]
     outputs = llm.generate(prompts, sampling_params=sampling_params)

@@ -40,7 +45,7 @@ def test_max_tokens_none():


 def test_gc():
-    llm = LLM("facebook/opt-125m", enforce_eager=True)
+    llm = LLM(model="distilbert/distilgpt2", enforce_eager=True)
     del llm

     gc.collect()
@@ -53,6 +58,22 @@ def test_gc():
     assert allocated < 50 * 1024 * 1024


-if __name__ == "__main__":
-    import pytest
-    pytest.main([__file__])
+def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
+    # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_MODELSCOPE", "True")
+        # Don't use HF_TOKEN for ModelScope repos, otherwise it will fail
+        # with 400 Client Error: Bad Request.
+        m.setenv("HF_TOKEN", "")
+        llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")
+
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+        outputs = llm.generate(prompts, sampling_params)
+        assert len(outputs) == 4
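
Note: the hunks above only show the edges of test_gc; the lines in between are not part of this diff. As a rough standalone sketch of the pattern that test relies on to confirm GPU memory is actually released once the engine is dropped, something like the following works. The helper name and the torch.cuda.memory_allocated / torch.cuda.empty_cache calls are assumptions of this sketch, not lines from the commit; only the LLM(..., enforce_eager=True) construction and the 50 MiB bound appear in the hunks.

import gc

import torch

from vllm import LLM


def check_gpu_memory_released(threshold_bytes: int = 50 * 1024 * 1024) -> None:
    # Hypothetical helper, not part of the commit: build an engine the same
    # way the diff does, then drop it and check what stays resident on the GPU.
    llm = LLM(model="distilbert/distilgpt2", enforce_eager=True)
    del llm

    # Drop Python-level references, then return cached CUDA blocks to the driver.
    gc.collect()
    torch.cuda.empty_cache()

    # Whatever is still allocated should be tiny compared to the model weights
    # and KV cache; the visible hunk asserts it stays under 50 MiB.
    allocated = torch.cuda.memory_allocated()
    assert allocated < threshold_bytes, f"{allocated} bytes still allocated"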
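
The new test_model_from_modelscope exercises the VLLM_USE_MODELSCOPE switch through pytest's monkeypatch. For reproducing the same path outside the test suite, a minimal sketch follows; it assumes only what the hunk shows (the VLLM_USE_MODELSCOPE and HF_TOKEN environment variables and the qwen/Qwen1.5-0.5B-Chat ModelScope repo), plus vLLM's usual RequestOutput layout for reading the completion text.

import os

# Route model downloads through ModelScope instead of the Hugging Face Hub,
# and clear HF_TOKEN to avoid the 400 Bad Request the test comments mention.
# Set both before constructing the engine so they are picked up.
os.environ["VLLM_USE_MODELSCOPE"] = "True"
os.environ["HF_TOKEN"] = ""

from vllm import LLM, SamplingParams

llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")
outputs = llm.generate(
    ["Hello, my name is"],
    SamplingParams(temperature=0.8, top_p=0.95),
)
# Each RequestOutput carries one or more CompletionOutput entries with .text.
print(outputs[0].outputs[0].text)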