[Misc][V0 Deprecation] Add __main__ guard to all offline examples (#1837)
### What this PR does / why we need it?
Add a `__main__` guard to all offline examples. The examples set `VLLM_WORKER_MULTIPROC_METHOD=spawn`, and Python's `spawn` start method re-imports the entry module in each worker process, so module-level engine setup must be moved under an `if __name__ == "__main__":` guard.
- vLLM version: v0.9.2
- vLLM main: 76b494444f
---------
Signed-off-by: shen-shanshan <467638484@qq.com>
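For context, a minimal sketch of the guarded pattern this PR applies (model name and parameters are taken from the example in the diff below; everything else is illustrative):

```python
import os

from vllm import LLM, SamplingParams

# Safe at module level: just sets an environment variable, cheap to re-run
# when a spawned worker process re-imports this file.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"


def main():
    # Engine construction and generation live here, so they run only in the
    # parent process, never in a re-imported spawn worker.
    llm = LLM(model="Qwen/Qwen2.5-0.5B", enforce_eager=True)
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=32, temperature=0.0))
    for output in outputs:
        print(f"Prompt: {output.prompt!r}, "
              f"Generated text: {output.outputs[0].text!r}")


if __name__ == "__main__":
    main()
```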
@@ -17,34 +17,45 @@
 # limitations under the License.
 #
 
 import os
 
 from vllm import LLM, SamplingParams
 
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-    "China is",
-]
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
-# Create a sampling params object.
-sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-# Create an LLM.
-llm = LLM(
-    model="Qwen/Qwen2.5-0.5B",
-    block_size=128,
-    max_model_len=1024,  # max length of prompt
-    tensor_parallel_size=1,  # number of NPUs to be used
-    max_num_seqs=26,  # max batch size
-    enforce_eager=True,  # Force PyTorch eager execution to debug intermediate tensors (disables graph optimizations)
-    trust_remote_code=True,  # If the model is a custom model not yet available in the HuggingFace transformers library
-    num_scheduler_steps=8,
-    gpu_memory_utilization=0.5)
-
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+        "China is",
+    ]
+
+    # Create a sampling params object.
+    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+    # Create an LLM.
+    llm = LLM(
+        model="Qwen/Qwen2.5-0.5B",
+        block_size=128,
+        max_model_len=1024,  # max length of prompt
+        tensor_parallel_size=1,  # number of NPUs to be used
+        max_num_seqs=26,  # max batch size
+        enforce_eager=True,  # Force PyTorch eager execution to debug intermediate tensors (disables graph optimizations)
+        trust_remote_code=True,  # If the model is a custom model not yet available in the HuggingFace transformers library
+        num_scheduler_steps=8,
+        gpu_memory_utilization=0.5)
+
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()