add qwen3

This commit is contained in:
Chranos
2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

View File

@@ -0,0 +1,52 @@
import torch
import sys
import ray
import gc
import contextlib
import os
os.environ['CONTEXT_PARALLEL_EN'] = "True"
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
def cleanup():
    """Release occupied resources and reset parallel_state.

    Tears down, in sequence: the model-parallel groups (via the
    Cambricon MLU hijack of vLLM's parallel_state), vLLM's distributed
    environment, the torch process group, cached device memory, and any
    live Ray session.  NOTE(review): the teardown order looks
    intentional (model-parallel state before the distributed
    environment that backs it) — confirm before reordering.
    """
    # Project-local hijack of vLLM's distributed.parallel_state for MLU.
    from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import destroy_model_parallel
    destroy_model_parallel()
    from vllm.distributed import destroy_distributed_environment
    destroy_distributed_environment()
    # destroy_process_group can raise AssertionError when no process
    # group was ever initialized; that is harmless during cleanup.
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
    if not current_platform.is_cpu():
        # Return cached accelerator memory so the next LLM starts fresh.
        torch.cuda.empty_cache()
    if ray.is_initialized():
        ray.shutdown()
def run_vllm(prompts, sampling_params, tp, cp):
    """Build an LLM engine with the given tensor-/context-parallel sizes
    and return the generations for *prompts*.

    Args:
        prompts: list of prompt strings to complete.
        sampling_params: a vllm SamplingParams instance.
        tp: tensor_parallel_size passed to the engine.
        cp: context_parallel_size passed to the engine.

    Returns:
        The list of RequestOutput objects from ``LLM.generate``.
    """
    engine = LLM(
        model="/data/AE/llm/models/Llama-2-7b-hf/",
        enforce_eager=True,
        tensor_parallel_size=tp,
        context_parallel_size=cp,
        distributed_executor_backend='ray',
    )
    return engine.generate(prompts, sampling_params)
def test_context_parallel():
    """Check that context parallelism does not change generation output.

    Runs the same greedy-ish generation once with cp=2 and once with
    cp=1 (tp=1 both times), releasing all distributed resources between
    runs, and asserts the generated texts match exactly.
    """
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, max_tokens=16)
    texts_per_run = []
    # cp=2 first, then the cp=1 reference run; cleanup between runs so
    # the second engine initializes from a clean distributed state.
    for cp_size in (2, 1):
        request_outputs = run_vllm(prompts, sampling_params, tp=1, cp=cp_size)
        cleanup()
        texts_per_run.append([req.outputs[0].text for req in request_outputs])
    assert texts_per_run[0] == texts_per_run[1]

View File

@@ -0,0 +1,51 @@
import torch
import sys
import ray
import gc
import contextlib
import os
os.environ['CONTEXT_PARALLEL_EN'] = "True"
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
def cleanup():
    """Release occupied resources and reset parallel_state.

    Tears down, in sequence: the model-parallel groups (via the
    Cambricon MLU hijack of vLLM's parallel_state), vLLM's distributed
    environment, the torch process group, cached device memory, and any
    live Ray session.  NOTE(review): the teardown order looks
    intentional (model-parallel state before the distributed
    environment that backs it) — confirm before reordering.
    """
    # Project-local hijack of vLLM's distributed.parallel_state for MLU.
    from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import destroy_model_parallel
    destroy_model_parallel()
    from vllm.distributed import destroy_distributed_environment
    destroy_distributed_environment()
    # destroy_process_group can raise AssertionError when no process
    # group was ever initialized; that is harmless during cleanup.
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
    if not current_platform.is_cpu():
        # Return cached accelerator memory so the next LLM starts fresh.
        torch.cuda.empty_cache()
    if ray.is_initialized():
        ray.shutdown()
def run_vllm(prompts, sampling_params, tp, cp, use_kv8=False):
    """Run LLM with int8 KV cache and return the generations.

    Args:
        prompts: list of prompt strings to complete.
        sampling_params: a vllm SamplingParams instance.
        tp: tensor_parallel_size passed to the engine.
        cp: context_parallel_size passed to the engine.
        use_kv8: currently unused — see NOTE below.

    Returns:
        The list of RequestOutput objects from ``LLM.generate``.
    """
    kwargs = dict()
    kwargs['model'] = "/data/AE/llm/models/Llama-2-7b-hf/"
    # BUG FIX: the original line ended with a trailing comma
    # (`kwargs['enforce_eager']=True,`), which stored the tuple (True,)
    # instead of the boolean True in the kwargs dict.
    kwargs['enforce_eager'] = True
    kwargs['tensor_parallel_size'] = tp
    kwargs['context_parallel_size'] = cp
    kwargs['distributed_executor_backend'] = 'ray'
    # NOTE(review): kv_cache_dtype is set unconditionally, so the
    # `use_kv8` parameter is ignored — confirm whether it should gate
    # this line.  Left unconditional to preserve the behavior the
    # existing caller (which never passes use_kv8) relies on.
    kwargs['kv_cache_dtype'] = 'int8'
    llm = LLM(**kwargs)
    outputs = llm.generate(prompts, sampling_params)
    return outputs
def test_context_parallel_with_kv8():
"""Compare the output results of cp1 and cp2 with kv cache int8."""
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, max_tokens=16)
outputs_1 = run_vllm(prompts, sampling_params, tp=1, cp=2)
cleanup()