Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/examples/offline_inference/qwen_1m.py
+++ b/examples/offline_inference/qwen_1m.py
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from urllib.request import urlopen
+
+from vllm import LLM, SamplingParams
+
+# Set at import time, so it is already in the environment when main() builds
+# the engine with max_model_len=1048576. Presumably this is what lets vLLM
+# accept that length -- TODO confirm against vLLM's environment-variable docs.
+os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
+
+
+def load_prompt(
+    url: str = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt",
+    timeout: float = 5,
+) -> str:
+    """Download a long-context test prompt and return it as text.
+
+    Args:
+        url: Location of the prompt file. Defaults to the 600k test case;
+            the other published lengths are listed in the comment below.
+        timeout: Timeout in seconds handed to ``urlopen``.
+
+    Returns:
+        The response body decoded as UTF-8.
+
+    Raises:
+        urllib.error.URLError: If the request fails (includes ``HTTPError``).
+        UnicodeDecodeError: If the body is not valid UTF-8.
+    """
+    # Test cases with various lengths can be found at:
+    #
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/64k.txt
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/200k.txt
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/1m.txt
+
+    with urlopen(url, timeout=timeout) as response:
+        return response.read().decode("utf-8")
+
+
+# Processing the prompt.
+def process_requests(llm: LLM, prompts: list[str]) -> None:
+    """Generate a completion for each prompt and print a one-line summary.
+
+    For every result returned by ``llm.generate`` this prints the number of
+    prompt token ids and the ``repr`` of the first output's text.
+
+    Args:
+        llm: Engine used to run generation.
+        prompts: Prompt strings to send to the engine in one call.
+    """
+    # Sampling settings for this example; at most 256 new tokens per prompt.
+    params = SamplingParams(
+        temperature=0.7,
+        top_p=0.8,
+        top_k=20,
+        repetition_penalty=1.05,
+        detokenize=True,
+        max_tokens=256,
+    )
+    for request_output in llm.generate(prompts, params):
+        token_ids = request_output.prompt_token_ids
+        completion = request_output.outputs[0].text
+        print(f"Prompt length: {len(token_ids)}, Generated text: {completion!r}")
+
+
+# Create an LLM.
+def initialize_engine() -> LLM:
+    """Build the vLLM engine used by this example.
+
+    Requests ``Qwen/Qwen2.5-7B-Instruct-1M`` with a 1,048,576-token maximum
+    model length, a tensor-parallel size of 4, eager mode enforced, and
+    chunked prefill enabled with up to 131,072 batched tokens.
+
+    Returns:
+        The constructed ``LLM`` instance.
+    """
+    return LLM(
+        model="Qwen/Qwen2.5-7B-Instruct-1M",
+        max_model_len=1_048_576,
+        tensor_parallel_size=4,
+        enforce_eager=True,
+        enable_chunked_prefill=True,
+        max_num_batched_tokens=131_072,
+    )
+
+
+def main() -> None:
+    """Run the example: fetch the prompt, build the engine, generate, print."""
+    # Fetch the prompt first: the download runs with a short timeout, so a
+    # network failure should surface before the engine (a 7B model with
+    # tensor_parallel_size=4) is constructed rather than after it.
+    prompt = load_prompt()
+    llm = initialize_engine()
+    process_requests(llm, [prompt])
+
+
+# Run the example only when this file is executed as a script.
+if __name__ == "__main__":
+    main()