add qwen3
This commit is contained in:
@@ -0,0 +1,52 @@
|
||||
import torch
|
||||
import sys
|
||||
import ray
|
||||
import gc
|
||||
import contextlib
|
||||
import os
|
||||
os.environ['CONTEXT_PARALLEL_EN'] = "True"
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
def cleanup():
    """Release occupied resources and reset parallel_state between LLM runs."""
    # Tear down the MLU-hijacked model-parallel groups first.
    from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import destroy_model_parallel
    destroy_model_parallel()

    # Then tear down vLLM's own distributed environment.
    from vllm.distributed import destroy_distributed_environment
    destroy_distributed_environment()

    # destroy_process_group asserts when no group was ever initialised; that
    # situation is harmless here, so swallow only that AssertionError.
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()

    gc.collect()
    if not current_platform.is_cpu():
        torch.cuda.empty_cache()

    if ray.is_initialized():
        ray.shutdown()
|
||||
|
||||
def run_vllm(prompts, sampling_params, tp, cp, model_dir="/data/AE/llm/models/Llama-2-7b-hf/"):
    """Run one vLLM generation pass with the given parallel sizes.

    Generalized: the model path is now a parameter (the default keeps the
    previously hard-coded Llama-2-7b path, so existing callers are
    unaffected), matching the sibling ``run_vllm`` that already takes
    ``model_dir``.

    Args:
        prompts: list of prompt strings.
        sampling_params: vllm.SamplingParams controlling generation.
        tp: tensor-parallel size.
        cp: context-parallel size.
        model_dir: filesystem path of the model weights.

    Returns:
        The outputs returned by ``llm.generate``.
    """
    llm = LLM(model=model_dir,
              enforce_eager=True,
              tensor_parallel_size=tp,
              context_parallel_size=cp,
              distributed_executor_backend='ray')
    outputs = llm.generate(prompts, sampling_params)
    return outputs
|
||||
|
||||
def test_context_parallel():
    """Check that cp=2 reproduces the cp=1 generations exactly."""
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, max_tokens=16)

    # Run the context-parallel configuration first, then the baseline,
    # cleaning up distributed state between the two.
    generated = []
    for cp in (2, 1):
        outputs = run_vllm(prompts, sampling_params, tp=1, cp=cp)
        cleanup()
        generated.append([out.outputs[0].text for out in outputs])

    assert generated[0] == generated[1]
|
||||
@@ -0,0 +1,51 @@
|
||||
import torch
|
||||
import sys
|
||||
import ray
|
||||
import gc
|
||||
import contextlib
|
||||
import os
|
||||
os.environ['CONTEXT_PARALLEL_EN'] = "True"
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
def cleanup():
    """Tear down parallel groups, the distributed env, and cached device memory."""
    from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import destroy_model_parallel
    destroy_model_parallel()

    from vllm.distributed import destroy_distributed_environment
    destroy_distributed_environment()

    # Ignore the AssertionError raised when no process group exists.
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()

    gc.collect()
    if not current_platform.is_cpu():
        torch.cuda.empty_cache()

    if ray.is_initialized():
        ray.shutdown()
|
||||
|
||||
def run_vllm(prompts, sampling_params, tp, cp, use_kv8=False):
    """Run one vLLM generation pass with an int8 KV cache.

    Bug fixed: the original ``kwargs['enforce_eager'] = True,`` had a stray
    trailing comma, so the engine received the tuple ``(True,)`` instead of
    the bool ``True``.

    NOTE(review): ``use_kv8`` is accepted but never consulted — the KV-cache
    dtype is unconditionally 'int8', and the only caller in this file relies
    on that (it never passes use_kv8=True). Confirm intent before wiring the
    flag up; gating on it here would silently change that caller's behavior.

    Args:
        prompts: list of prompt strings.
        sampling_params: vllm.SamplingParams controlling generation.
        tp: tensor-parallel size.
        cp: context-parallel size.
        use_kv8: currently unused (see NOTE above).

    Returns:
        The outputs returned by ``llm.generate``.
    """
    kwargs = {
        'model': "/data/AE/llm/models/Llama-2-7b-hf/",
        'enforce_eager': True,  # was (True,) due to the trailing comma
        'tensor_parallel_size': tp,
        'context_parallel_size': cp,
        'distributed_executor_backend': 'ray',
        'kv_cache_dtype': 'int8',
    }

    llm = LLM(**kwargs)
    outputs = llm.generate(prompts, sampling_params)
    return outputs
|
||||
|
||||
def test_context_parallel_with_kv8():
    """Compare the output results of cp1 and cp2 with kv cache int8.

    NOTE(review): in this view the test stops after the cp=2 run — no
    baseline run or comparison is visible. Confirm against the full file
    that the remainder (cp=1 run and the equality assert) exists.
    """
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, max_tokens=16)
    outputs_1 = run_vllm(prompts, sampling_params, tp=1, cp=2)
    cleanup()
|
||||
@@ -0,0 +1,76 @@
|
||||
import torch
|
||||
import sys
|
||||
import ray
|
||||
import gc
|
||||
import contextlib
|
||||
import numpy as np
|
||||
import os
|
||||
os.environ['EXPERT_PARALLEL_EN'] = "True"
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
def string_list_to_float(text_list: list):
    """Encode a list of strings as a 2-D float32 array of character codes.

    Each string is right-padded with spaces to the length of the longest
    string, then every character is replaced by its ordinal value.

    Robustness fix: the original raised ``ValueError`` on an empty list
    (``max()`` over an empty sequence); an empty input now yields an empty
    (0, 0) array.

    Args:
        text_list: list of strings (may be empty).

    Returns:
        np.ndarray of shape (len(text_list), max_len), dtype float32.
    """
    if not text_list:
        # max() over an empty generator would raise ValueError.
        return np.zeros((0, 0), dtype='float32')
    max_len = max(len(s) for s in text_list)
    rows = [[ord(char) for char in s.ljust(max_len)] for s in text_list]
    return np.array(rows, dtype='float32')
|
||||
|
||||
def compute_diff_text(baseline_text: list, compare_text: list):
    """Compute relative error metrics between two lists of generated text.

    Both lists are encoded to float arrays of character codes, then:
      diff1 = relative L1 error, diff2 = relative L2 (RMS) error,
    each normalized by the baseline.

    Returns:
        (diff1, diff2) as floats.
    """
    base = string_list_to_float(baseline_text)
    comp = string_list_to_float(compare_text)
    abs_err = np.abs(base - comp)
    diff1 = np.sum(abs_err) / np.sum(np.abs(base))
    diff2 = np.sqrt(np.sum(abs_err * abs_err) / np.sum(base * base))
    return diff1, diff2
|
||||
|
||||
def cleanup():
    """Release occupied resources and reset parallel_state."""
    # Hijacked model-parallel state first, then vLLM's distributed env.
    from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import destroy_model_parallel
    destroy_model_parallel()

    from vllm.distributed import destroy_distributed_environment
    destroy_distributed_environment()

    # No process group may exist at this point; suppress only that assertion.
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()

    gc.collect()
    if not current_platform.is_cpu():
        torch.cuda.empty_cache()

    if ray.is_initialized():
        ray.shutdown()
|
||||
|
||||
def run_vllm(prompts, sampling_params, tp, mtp=-1, mep=-1, model_dir="/data/AE/llm/models/Qwen1.5-MoE-A2.7B/"):
    """Run one vLLM generation pass with optional MoE parallel sizes.

    Args:
        prompts: list of prompt strings.
        sampling_params: vllm.SamplingParams controlling generation.
        tp: tensor-parallel size.
        mtp: MoE tensor-parallel size (-1 presumably means "not set" — confirm).
        mep: MoE expert-parallel size (-1 presumably means "not set" — confirm).
        model_dir: filesystem path of the model weights.

    Returns:
        The outputs returned by ``llm.generate``.
    """
    engine = LLM(
        model=model_dir,
        enforce_eager=True,
        tensor_parallel_size=tp,
        moe_tp_size=mtp,
        moe_ep_size=mep,
    )
    return engine.generate(prompts, sampling_params)
|
||||
|
||||
def test_expert_parallel():
    """Compare tp=2 generations with moe_tp_size 1 vs 2 on Qwen1.5-MoE."""
    qwen2_moe_model_dir = "/data/AE/llm/models/Qwen1.5-MoE-A2.7B"
    eps = 1e-6
    prompts = ["Hello, my name is"]
    sampling_params = SamplingParams(temperature=0.8, max_tokens=1)

    # Run both MoE tensor-parallel configurations, resetting state in between.
    texts = []
    for mtp in (1, 2):
        outputs = run_vllm(prompts, sampling_params, tp=2, mtp=mtp,
                           model_dir=qwen2_moe_model_dir)
        cleanup()
        texts.append([out.outputs[0].text for out in outputs])

    generated_text_1, generated_text_2 = texts
    diff1, diff2 = compute_diff_text(generated_text_1, generated_text_2)
    assert diff1 <= eps and diff2 <= eps, (
        f"qwen2_moe generated_1({generated_text_1}) and generated_2{generated_text_2} diff error")
|
||||
Reference in New Issue
Block a user