update qwen
This commit is contained in:
17
Dockerfile_kunlunxin
Normal file
17
Dockerfile_kunlunxin
Normal file
@@ -0,0 +1,17 @@
|
||||
# Base image: Cambricon MLU370 runtime with PyTorch 2.5 / torch_mlu 1.24.1
# (Ubuntu 22.04, Python 3.10).
# NOTE(review): the file is named Dockerfile_kunlunxin but the base image and
# README target Cambricon MLU370 — confirm the intended hardware/name.
FROM harbor.4pd.io/hardcore-tech/mlu370x4-pytorchv25.01-torch2.5.0-torchmlu1.24.1-ubuntu22.04-py310:0.0.1

WORKDIR /workspace

# COPY is preferred over ADD for plain local files/directories.
COPY . /workspace

# transformers is installed from the bundled source zip instead of
# git+https://github.com/huggingface/transformers so builds are reproducible
# offline.
RUN pip install --no-cache-dir ./transformers-main.zip

# One layer for the remaining dependencies; --no-cache-dir keeps the image
# small, and quoting the extras spec prevents the shell treating [decord]
# as a glob pattern.
RUN pip install --no-cache-dir \
        accelerate \
        "qwen-vl-utils[decord]==0.0.8" \
        flask==3.1.1

EXPOSE 80

ENTRYPOINT ["python3", "Qwen2.5-VL-32B-Instruct-test.py"]
|
||||
|
||||
|
||||
|
||||
259
Qwen2.5-VL-32B-Instruct-test.py
Normal file
259
Qwen2.5-VL-32B-Instruct-test.py
Normal file
@@ -0,0 +1,259 @@
|
||||
import torch
|
||||
import time
|
||||
import os
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
from flask import Flask, request, jsonify
|
||||
|
||||
# Optional Cambricon MLU backend: import torch_mlu when present, otherwise
# fall back to CPU-only operation.
try:
    import torch_mlu

    print(f"成功导入 torch_mlu,版本: {getattr(torch_mlu, '__version__', 'unknown')}")

    def check_mlu_available():
        """Probe the MLU backend by allocating a tiny tensor on-device."""
        try:
            torch.randn(2, 2).mlu()
        except Exception as e:
            print(f"MLU不可用: {e}")
            return False
        return True

    MLU_AVAILABLE = check_mlu_available()
    print(f"MLU设备可用: {MLU_AVAILABLE}")
except ImportError as e:
    # torch_mlu missing entirely: record the CPU fallback state.
    torch_mlu = None
    MLU_AVAILABLE = False
    print(f"警告: 未找到 torch_mlu 模块: {e}")
|
||||
|
||||
|
||||
# torch.mlu exists only after a successful torch_mlu import; the original
# unconditional call raised AttributeError on CPU-only hosts, crashing the
# service at import time and defeating the fallback set up above.
if hasattr(torch, "mlu"):
    print(f"MLU count: {torch.mlu.device_count()}")
else:
    print("MLU count: 0")
|
||||
|
||||
|
||||
# Cap intra-op CPU parallelism (OpenMP and torch) at four threads.
_CPU_THREADS = 4
os.environ["OMP_NUM_THREADS"] = str(_CPU_THREADS)
torch.set_num_threads(_CPU_THREADS)
|
||||
|
||||
# Qwen-VL specific components; fail fast with an actionable message when the
# third-party packages are missing. Chaining with `from exc` preserves the
# original ImportError (which package, which path) instead of discarding it.
try:
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info
except ImportError as exc:
    raise ImportError("请安装依赖: pip install transformers qwen-vl-utils") from exc
|
||||
|
||||
# Directory holding the model weights; override via the MODEL_PATH env var.
MODEL_PATH = os.getenv("MODEL_PATH", "/models")
|
||||
|
||||
class QwenVLMLUClassifier:
    """Qwen2.5-VL wrapper whose layers are dispatched across devices by
    accelerate's device_map; exposes a single `predict` entry point."""

    def __init__(self, model_path: str):
        # Result of the module-level MLU availability probe.
        self.use_mlu = MLU_AVAILABLE
        print(f"初始化模型,使用设备: {'MLU' if self.use_mlu else 'CPU'}")

        # Load the processor (tokenizer + image preprocessor).
        print(f"从 {model_path} 加载处理器...")
        self.processor = AutoProcessor.from_pretrained(model_path)

        print("加载模型中...")

        # Key point: device_map="auto" lets accelerate shard the model across
        # all visible cards automatically.
        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",  # auto-dispatch across every available MLU card
            # Note: do not enable flash_attention_2 (unsupported on MLU).
        )

        # Never move the model manually once device_map placed it:
        # self.model = self.model.mlu() would break the dispatch hooks.

        self.model = self.model.eval()
        print("模型加载完成,已根据 device_map 分配到设备")

        # Debug aid: print which device each layer landed on.
        print("模型各层设备分布:")
        for name, module in self.model.named_modules():
            if hasattr(module, "weight") and module.weight is not None:
                print(f"{name}: {module.weight.device}")
            elif hasattr(module, "_hf_hook") and hasattr(module._hf_hook, "execution_device"):
                print(f"{name}: {module._hf_hook.execution_device}")
            # NOTE(review): this break fires on the FIRST leaf module reached,
            # so only a prefix of the model is printed — presumably meant to
            # limit debug output; confirm (a `continue` would print all leaves).
            if len(list(module.children())) == 0:
                break

    def predict(self, image: Image.Image, prompt: str = "Describe this image.") -> dict:
        """Run one image+prompt through the model.

        Returns a dict with keys ``response``, ``device_used``,
        ``processing_time`` and ``status`` ("success"), or on failure
        ``response``/``error``/``device_used``/``processing_time``/``status``
        ("error"). Never raises: all exceptions are folded into the error dict.
        """
        start_time = time.perf_counter()

        try:
            # Build the chat-format message payload expected by the processor.
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]

            # Render the chat template to a prompt string (not tokenized yet).
            text = self.processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            # Extract image/video tensors from the message payload.
            image_inputs, video_inputs = process_vision_info(messages)

            # Build the combined model inputs.
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                #truncation=True,  # (disabled) enable truncation
                #max_length=2048,  # (disabled) cap the sequence length
                max_pixels=384*384,  # bound the image size (key for memory)
                return_tensors="pt",
            )

            # Downcast the grid-size index fields from int64 to int32 when
            # present — presumably an MLU int64 limitation; confirm.
            for key in ['image_grid_thw', 'video_grid_thw']:
                if key in inputs and inputs[key].dtype == torch.long:
                    # Clamp into int32 range before the cast.
                    val = inputs[key]
                    if val.max() >= 2147483647 or val.min() < -2147483648:
                        print(f"Warning: {key} out of int32 range, clamping...")
                        val = val.clamp(-2147483648, 2147483647)
                    inputs[key] = val.to(torch.int32)

            # Move inputs to the model's entry device only; generate() then
            # dispatches across the device_map internally — do not send
            # tensors to a specific "mlu:x" by hand.
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            with torch.no_grad():
                ts = time.time()
                generated_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=128,
                    # To force synchronized multi-card generation, optionally:
                    # synced_gpus=True,
                )
                print(f"生成耗时: {time.time() - ts:.3f}s")

            # Strip the prompt tokens so only newly generated tokens remain.
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
            ]
            output_texts = self.processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )
            response = output_texts[0].strip()

            processing_time = round(time.perf_counter() - start_time, 4)

            return {
                "response": response,
                # NOTE(review): hard-coded "mlu" even when use_mlu is False —
                # confirm whether this should mirror the CPU fallback.
                "device_used": "mlu",
                "processing_time": processing_time,
                "status": "success"
            }

        except Exception as e:
            # Error envelope so the HTTP layer can report failures uniformly.
            return {
                "response": "",
                "error": str(e),
                "device_used": "mlu" if self.use_mlu else "cpu",
                "processing_time": 0.0,
                "status": "error"
            }
|
||||
|
||||
|
||||
# Flask application instance.
app = Flask(__name__)
# Global model wrapper; populated in __main__ once loading succeeds.
# Routes treat None as "model not loaded".
classifier = None
|
||||
|
||||
@app.before_request
def ensure_model_loaded():
    """Reject /predict requests with a 500 until the model has been loaded."""
    global classifier
    if classifier is not None or request.endpoint != 'predict':
        return None
    return jsonify({"status": "error", "message": "模型未加载"}), 500
|
||||
|
||||
@app.route('/predict', methods=['POST'])
def predict():
    """Run the VL model on an uploaded image plus an optional form prompt."""
    upload = request.files.get('image')
    if upload is None:
        return jsonify({
            "status": "error",
            "message": "请求中未包含图片"
        }), 400

    try:
        img = Image.open(BytesIO(upload.read())).convert("RGB")
        user_prompt = request.form.get('prompt', 'Describe this image.')
        result = classifier.predict(img, user_prompt)
    except Exception as e:
        # Decoding or inference dispatch failed outright.
        return jsonify({
            "status": "error",
            "error": f"处理失败: {str(e)}"
        }), 500

    if result["status"] != "success":
        return jsonify({
            "status": "error",
            "error": result["error"],
            "device_used": result["device_used"]
        }), 500

    return jsonify({
        "status": "success",
        "response": result["response"],
        "device_used": result["device_used"],
        "processing_time": result["processing_time"]
    })
|
||||
|
||||
@app.route('/health', methods=['GET'])
def health_check():
    """Health probe reporting model-load status and MLU availability."""
    model_ready = classifier is not None
    return jsonify({
        "status": "healthy" if model_ready else "unhealthy",
        "model_loaded": model_ready,
        "using_mlu": classifier.use_mlu if classifier else False,
        "mlu_available": MLU_AVAILABLE,
        "timestamp": time.time()
    })
|
||||
|
||||
@app.route('/info', methods=['GET'])
def device_info():
    """Report model path, library versions, and current device usage."""
    mlu_version = getattr(torch_mlu, '__version__', None) if torch_mlu else None
    payload = {
        "model": MODEL_PATH,
        "torch_version": torch.__version__,
        "torch_mlu_version": mlu_version,
        "mlu_available": MLU_AVAILABLE,
        "using_device": "mlu" if classifier and classifier.use_mlu else "cpu",
        "model_loaded": classifier is not None,
        "timestamp": time.time()
    }
    return jsonify(payload)
|
||||
|
||||
@app.route('/test', methods=['GET'])
def test_mlu():
    """Smoke-test the MLU with a trivial on-device tensor addition."""
    if not MLU_AVAILABLE:
        return jsonify({"status": "error", "message": "MLU不可用"}), 500
    try:
        tensor = torch.randn(2, 2).mlu()
        doubled = tensor + tensor
        payload = {"status": "success", "result": doubled.cpu().tolist()}
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500
    return jsonify(payload)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Load the model eagerly before serving so /predict never races the load.
    print(f"正在加载模型: {MODEL_PATH}")
    try:
        classifier = QwenVLMLUClassifier(MODEL_PATH)
        print("模型加载成功,启动 Flask 服务...")
        app.run(host='0.0.0.0', port=80, debug=False)
    except Exception as e:
        print(f" 模型加载失败: {e}")
        # raise SystemExit instead of the interactive `exit()` helper, which
        # is injected by site.py and not guaranteed in all run modes (e.g. -S).
        raise SystemExit(1)
|
||||
38
README.md
Normal file
38
README.md
Normal file
@@ -0,0 +1,38 @@
|
||||
# enginex-mlu370-vl-qwen
|
||||
|
||||
运行于寒武纪 mlu370 算力卡的【视觉多模态理解】引擎,支持 Qwen2.5-VL-7B-Instruct、Qwen2.5-VL-32B-Instruct、Qwen2.5-VL-72B-Instruct 模型
|
||||
|
||||
## QuickStart
|
||||
|
||||
1、从 modelscope上下载模型 Qwen2.5-VL-7B-Instruct、Qwen2.5-VL-32B-Instruct、Qwen2.5-VL-72B-Instruct
|
||||
到目录 /mnt/contest_ceph/zhoushasha/models/Qwen/Qwen2.5-VL-7B-Instruct
|
||||
|
||||
2、使用Dockerfile生成镜像
|
||||
使用 Dockerfile 生成 镜像
|
||||
```bash
|
||||
docker build -f Dockerfile_kunlunxin -t test-cambricon:Qwen2.5-VL-32B-Instruct .
|
||||
```
|
||||
|
||||
3、启动docker
|
||||
```bash
|
||||
docker run -it --privileged \
|
||||
-p 10091:80 \
|
||||
--device=/dev/cambricon_dev0:/dev/cambricon_dev0 \
|
||||
--device=/dev/cambricon_dev1:/dev/cambricon_dev1 \
|
||||
--device=/dev/cambricon_dev2:/dev/cambricon_dev2 \
|
||||
--device=/dev/cambricon_dev3:/dev/cambricon_dev3 \
|
||||
--device=/dev/cambricon_ctl \
|
||||
--device=/dev/cambricon_ipcm0:/dev/cambricon_ipcm0 \
|
||||
--device=/dev/cambricon_ipcm1:/dev/cambricon_ipcm1 \
|
||||
--device=/dev/cambricon_ipcm2:/dev/cambricon_ipcm2 \
|
||||
--device=/dev/cambricon_ipcm3:/dev/cambricon_ipcm3 \
|
||||
-v /mnt/contest_ceph/zhoushasha/models/Qwen/Qwen2.5-VL-32B-Instruct:/models:ro \
|
||||
test-cambricon:Qwen2.5-VL-32B-Instruct
|
||||
```
|
||||
|
||||
4、测试服务
|
||||
```bash
|
||||
curl -X POST http://localhost:10091/predict \
|
||||
-F "image=@demo.jpeg" \
|
||||
-F "prompt=What is happening in this image?"
|
||||
```
|
||||
Reference in New Issue
Block a user