commit eaab9c7f3d61b6303508807a8916047a2f4efaa2
Author: zhousha <736730048@qq.com>
Date:   Thu Oct 16 18:33:26 2025 +0800

    update qwen

diff --git a/Dockerfile_kunlunxin b/Dockerfile_kunlunxin
new file mode 100644
index 0000000..4751e59
--- /dev/null
+++ b/Dockerfile_kunlunxin
@@ -0,0 +1,17 @@
+FROM harbor.4pd.io/hardcore-tech/mlu370x4-pytorchv25.01-torch2.5.0-torchmlu1.24.1-ubuntu22.04-py310:0.0.1
+
+WORKDIR /workspace
+ADD . /workspace
+
+#RUN pip install git+https://github.com/huggingface/transformers
+RUN pip install --no-cache-dir ./transformers-main.zip
+
+RUN pip install accelerate
+RUN pip install qwen-vl-utils[decord]==0.0.8
+RUN pip install flask==3.1.1
+
+EXPOSE 80
+ENTRYPOINT ["python3", "Qwen2.5-VL-32B-Instruct-test.py"]
+
+
+
diff --git a/Qwen2.5-VL-32B-Instruct-test.py b/Qwen2.5-VL-32B-Instruct-test.py
new file mode 100644
index 0000000..11cc354
--- /dev/null
+++ b/Qwen2.5-VL-32B-Instruct-test.py
@@ -0,0 +1,259 @@
+import torch
+import time
+import os
+from PIL import Image
+from io import BytesIO
+from flask import Flask, request, jsonify
+
+# Cambricon MLU support: import the torch_mlu extension and probe whether an
+# MLU device actually works; fall back to CPU-only mode otherwise.
+try:
+    import torch_mlu
+    print(f"成功导入 torch_mlu,版本: {getattr(torch_mlu, '__version__', 'unknown')}")
+
+    def check_mlu_available():
+        # A tiny tensor round-trip is the cheapest real availability probe.
+        try:
+            test_tensor = torch.randn(2, 2).mlu()
+            return True
+        except Exception as e:
+            print(f"MLU不可用: {e}")
+            return False
+
+    MLU_AVAILABLE = check_mlu_available()
+    print(f"MLU设备可用: {MLU_AVAILABLE}")
+except ImportError as e:
+    torch_mlu = None
+    MLU_AVAILABLE = False
+    print(f"警告: 未找到 torch_mlu 模块: {e}")
+
+
+# BUG FIX: torch.mlu only exists after a successful `import torch_mlu`.
+# The original unconditional torch.mlu.device_count() crashed at import time
+# on hosts without torch_mlu, defeating the CPU fallback set up above.
+print(f"MLU count: {torch.mlu.device_count() if MLU_AVAILABLE else 0}")
+
+
+# Cap CPU thread usage so tokenization/pre-processing does not oversubscribe.
+os.environ["OMP_NUM_THREADS"] = "4"
+torch.set_num_threads(4)
+
+# Qwen-VL specific components (require a transformers build with Qwen2.5-VL).
+try:
+    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+    from qwen_vl_utils import process_vision_info
+except ImportError:
+    raise ImportError("请安装依赖: pip install transformers qwen-vl-utils")
+
+# Model directory; overridable via the MODEL_PATH environment variable.
+MODEL_PATH = os.environ.get("MODEL_PATH", "/models")
+
+class QwenVLMLUClassifier:
+
+    def __init__(self, model_path: str):
+        """Load the Qwen2.5-VL processor and model, sharded across devices via device_map."""
+        self.use_mlu = MLU_AVAILABLE
+        print(f"初始化模型,使用设备: {'MLU' if self.use_mlu else 'CPU'}")
+
+        # Load the processor (tokenizer + image processor).
+        print(f"从 {model_path} 加载处理器...")
+        self.processor = AutoProcessor.from_pretrained(model_path)
+
+        print("加载模型中...")
+
+        # device_map="auto" lets accelerate shard the model over all visible cards.
+        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            device_map="auto",  # auto-place layers on every available MLU card
+            # NOTE: do not request flash_attention_2 (not supported on MLU).
+        )
+
+        # Do NOT move the model manually; device_map already placed every layer.
+        # ❌ self.model = self.model.mlu()  # would break the device_map placement
+
+        self.model = self.model.eval()
+        print("模型加载完成,已根据 device_map 分配到设备")
+
+        # Debug aid: dump which device each leaf module landed on.
+        # BUG FIX: the original loop `break`-ed at the first leaf module, so it
+        # printed almost nothing despite its own "only leaf nodes" comment.
+        print("模型各层设备分布:")
+        for name, module in self.model.named_modules():
+            if list(module.children()):
+                continue  # only report leaf modules
+            if hasattr(module, "weight") and module.weight is not None:
+                print(f"{name}: {module.weight.device}")
+            elif hasattr(module, "_hf_hook") and hasattr(module._hf_hook, "execution_device"):
+                print(f"{name}: {module._hf_hook.execution_device}")
+
+    def predict(self, image: Image.Image, prompt: str = "Describe this image.") -> dict:
+        """Run one image+prompt generation.
+
+        Returns a dict with keys: response, device_used, processing_time, status
+        (and `error` when status == "error"). Never raises: all exceptions are
+        folded into an error-status dict so the HTTP layer can report them.
+        """
+        start_time = time.perf_counter()
+
+        try:
+            # Build the chat-format messages expected by the Qwen2.5-VL template.
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": image},
+                        {"type": "text", "text": prompt},
+                    ],
+                }
+            ]
+
+            # Render the chat template (string only; tokenization happens below).
+            text = self.processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+
+            # Extract image/video tensors from the messages.
+            image_inputs, video_inputs = process_vision_info(messages)
+
+            # Build model inputs.
+            inputs = self.processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                #truncation=True,
+                #max_length=2048,
+                max_pixels=384*384,  # cap image resolution to bound memory use
+                return_tensors="pt",
+            )
+
+            # Downcast grid-shape fields to int32 when present — presumably an
+            # MLU kernel dtype requirement (NOTE(review): confirm against torch_mlu).
+            for key in ['image_grid_thw', 'video_grid_thw']:
+                if key in inputs and inputs[key].dtype == torch.long:
+                    val = inputs[key]
+                    # Clamp defensively before narrowing to 32 bits.
+                    if val.max() >= 2147483647 or val.min() < -2147483648:
+                        print(f"Warning: {key} out of int32 range, clamping...")
+                        val = val.clamp(-2147483648, 2147483647)
+                    inputs[key] = val.to(torch.int32)
+
+            # Move inputs to the model's entry device; accelerate hooks dispatch
+            # activations across the remaining shards during generate().
+            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+
+            with torch.no_grad():
+                ts = time.time()
+                generated_ids = self.model.generate(
+                    **inputs,
+                    max_new_tokens=128,
+                )
+                print(f"生成耗时: {time.time() - ts:.3f}s")
+
+            # Strip the prompt tokens; decode only the newly generated tail.
+            generated_ids_trimmed = [
+                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
+            ]
+            output_texts = self.processor.batch_decode(
+                generated_ids_trimmed,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False
+            )
+            response = output_texts[0].strip()
+
+            processing_time = round(time.perf_counter() - start_time, 4)
+
+            return {
+                "response": response,
+                # BUG FIX: the success path hard-coded "mlu" even when running in
+                # CPU fallback; report the real device like the error path does.
+                "device_used": "mlu" if self.use_mlu else "cpu",
+                "processing_time": processing_time,
+                "status": "success"
+            }
+
+        except Exception as e:
+            return {
+                "response": "",
+                "error": str(e),
+                "device_used": "mlu" if self.use_mlu else "cpu",
+                "processing_time": 0.0,
+                "status": "error"
+            }
+
+
+# Flask application; `classifier` is populated in __main__ after model load.
+app = Flask(__name__)
+classifier = None
+
+@app.before_request
+def ensure_model_loaded():
+    # Reject /predict calls until the model has finished loading.
+    global classifier
+    if request.endpoint == 'predict' and classifier is None:
+        return jsonify({"status": "error", "message": "模型未加载"}), 500
+
+@app.route('/predict', methods=['POST'])
+def predict():
+    # Expects multipart form data: required file field `image`, optional `prompt`.
+    if 'image' not in request.files:
+        return jsonify({
+            "status": "error",
+            "message": "请求中未包含图片"
+        }), 400
+
+    try:
+        image_file = request.files['image']
+        image = Image.open(BytesIO(image_file.read())).convert("RGB")
+        prompt = request.form.get('prompt', 'Describe this image.')
+        result = classifier.predict(image, prompt)
+
+        if result["status"] == "success":
+            return jsonify({
+                "status": "success",
+                "response": result["response"],
+                "device_used": result["device_used"],
+                "processing_time": result["processing_time"]
+            })
+        else:
+            return jsonify({
+                "status": "error",
+                "error": result["error"],
+                "device_used": result["device_used"]
+            }), 500
+
+    except Exception as e:
+        return jsonify({
+            "status": "error",
+            "error": f"处理失败: {str(e)}"
+        }), 500
+
+@app.route('/health', methods=['GET'])
+def health_check():
+    # Liveness/readiness probe: "healthy" only once the model is loaded.
+    return jsonify({
+        "status": "healthy" if classifier is not None else "unhealthy",
+        "model_loaded": classifier is not None,
+        "using_mlu": classifier.use_mlu if classifier else False,
+        "mlu_available": MLU_AVAILABLE,
+        "timestamp": time.time()
+    })
+
+@app.route('/info', methods=['GET'])
+def device_info():
+    # Static environment/deployment metadata for debugging.
+    return jsonify({
+        "model": MODEL_PATH,
+        "torch_version": torch.__version__,
+        "torch_mlu_version": getattr(torch_mlu, '__version__', None) if torch_mlu else None,
+        "mlu_available": MLU_AVAILABLE,
+        "using_device": "mlu" if classifier and classifier.use_mlu else "cpu",
+        "model_loaded": classifier is not None,
+        "timestamp": time.time()
+    })
+
+@app.route('/test', methods=['GET'])
+def test_mlu():
+    # On-demand MLU smoke test: tiny tensor add on device, result copied back.
+    if not MLU_AVAILABLE:
+        return jsonify({"status": "error", "message": "MLU不可用"}), 500
+    try:
+        x = torch.randn(2, 2).mlu()
+        y = x + x
+        return jsonify({"status": "success", "result": y.cpu().tolist()})
+    except Exception as e:
+        return jsonify({"status": "error", "message": str(e)}), 500
+
+
+if __name__ == "__main__":
+    print(f"正在加载模型: {MODEL_PATH}")
+    try:
+        classifier = QwenVLMLUClassifier(MODEL_PATH)
+        print("模型加载成功,启动 Flask 服务...")
+        app.run(host='0.0.0.0', port=80, debug=False)
+    except Exception as e:
+        print(f" 模型加载失败: {e}")
+        exit(1)
diff --git a/README.md b/README.md
new file mode 100644
index 
0000000..a126a74
--- /dev/null
+++ b/README.md
@@ -0,0 +1,38 @@
+# enginex-mlu370-vl-qwen
+
+运行于寒武纪 mlu370 算力卡的【视觉多模态理解】引擎,支持 Qwen2.5-VL-7B-Instruct、Qwen2.5-VL-32B-Instruct、Qwen2.5-VL-72B-Instruct 模型
+
+## QuickStart
+
+1、从 modelscope 上下载模型 Qwen2.5-VL-7B-Instruct、Qwen2.5-VL-32B-Instruct、Qwen2.5-VL-72B-Instruct
+到目录 /mnt/contest_ceph/zhoushasha/models/Qwen/Qwen2.5-VL-7B-Instruct
+
+2、使用 Dockerfile 生成镜像
+使用仓库中的 Dockerfile_kunlunxin 构建镜像:
+```bash
+docker build -f Dockerfile_kunlunxin -t test-cambricon:Qwen2.5-VL-32B-Instruct .
+```
+
+3、启动 docker
+```bash
+docker run -it --privileged \
+    -p 10091:80 \
+    --device=/dev/cambricon_dev0:/dev/cambricon_dev0 \
+    --device=/dev/cambricon_dev1:/dev/cambricon_dev1 \
+    --device=/dev/cambricon_dev2:/dev/cambricon_dev2 \
+    --device=/dev/cambricon_dev3:/dev/cambricon_dev3 \
+    --device=/dev/cambricon_ctl \
+    --device=/dev/cambricon_ipcm0:/dev/cambricon_ipcm0 \
+    --device=/dev/cambricon_ipcm1:/dev/cambricon_ipcm1 \
+    --device=/dev/cambricon_ipcm2:/dev/cambricon_ipcm2 \
+    --device=/dev/cambricon_ipcm3:/dev/cambricon_ipcm3 \
+    -v /mnt/contest_ceph/zhoushasha/models/Qwen/Qwen2.5-VL-32B-Instruct:/models:ro \
+    test-cambricon:Qwen2.5-VL-32B-Instruct
+```
+
+4、测试服务
+```bash
+curl -X POST http://localhost:10091/predict \
+    -F "image=@demo.jpeg" \
+    -F "prompt=What is happening in this image?"
+```
diff --git a/demo.jpeg b/demo.jpeg
new file mode 100644
index 0000000..9fdc040
Binary files /dev/null and b/demo.jpeg differ