Fix e2e data parallel test: add resource release code (#1881)
### What this PR does / why we need it?
Fix the e2e data parallel test: add resource release code and give the engines more time to pause their processing loops before exiting.
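For readers skimming the change, here is a minimal, self-contained sketch of the teardown pattern the test gains. The function name `release_vllm_resources` and the CUDA fallback branch are illustrative additions, not part of the patch; the patch itself adds an NPU-only helper named `cleanup_env_and_memory`, shown in the diff below.

```python
import contextlib
import gc

import torch
from vllm.distributed.parallel_state import (destroy_distributed_environment,
                                             destroy_model_parallel)


def release_vllm_resources() -> None:
    """Release everything a finished vLLM engine may still hold on to."""
    # Tear down vLLM's model-parallel groups and its distributed environment.
    destroy_model_parallel()
    destroy_distributed_environment()
    # Destroy the default torch.distributed group; suppress the AssertionError
    # raised when no group was ever initialized.
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    # Drop unreachable Python objects, then return cached device memory.
    gc.collect()
    if hasattr(torch, "npu"):        # Ascend NPU, provided by torch_npu
        torch.npu.empty_cache()
        torch.npu.reset_peak_memory_stats()
    elif torch.cuda.is_available():  # same idea on CUDA machines
        torch.cuda.empty_cache()
```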
### Does this PR introduce _any_ user-facing change?
No
- vLLM version: v0.9.2
- vLLM main: 5895afd780
Signed-off-by: leo-pony <nengjunma@outlook.com>
@@ -56,14 +56,19 @@ Multi-node:
 import os
 from time import sleep
+import contextlib
+import gc

 import torch

 from vllm import LLM, SamplingParams
 from vllm.utils import get_open_port
+from vllm.distributed.parallel_state import ( # noqa E402
+    destroy_distributed_environment, destroy_model_parallel)

 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"


 def parse_args():
     import argparse
@@ -110,6 +115,15 @@ def parse_args():
     return parser.parse_args()


+def cleanup_env_and_memory():
+    destroy_model_parallel()
+    destroy_distributed_environment()
+    with contextlib.suppress(AssertionError):
+        torch.distributed.destroy_process_group()
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()
+
 def main(
     model,
     dp_size,
@@ -185,8 +199,9 @@ def main(
               f"Generated text: {generated_text!r}")

     # Give engines time to pause their processing loops before exiting.
-    sleep(1)
-
+    sleep(5)
+    del llm
+    cleanup_env_and_memory()

 if __name__ == "__main__":
     args = parse_args()
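One detail worth noting in the new helper: `torch.distributed.destroy_process_group()` asserts when no default process group exists, so the patch wraps it in `contextlib.suppress(AssertionError)`, making the teardown safe to call even if distributed initialization never completed. A tiny standalone illustration of that guard (no vLLM or NPU required; it assumes a PyTorch version where a missing default group raises `AssertionError`):

```python
import contextlib

import torch.distributed as dist

# Safe even when torch.distributed was never initialized: destroy_process_group()
# asserts on a missing default group, and suppress() swallows that error.
with contextlib.suppress(AssertionError):
    dist.destroy_process_group()
print("teardown attempted, process still alive:", not dist.is_initialized())
```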