update qwen
This commit is contained in:
17
Dockerfile_kunlunxin
Normal file
17
Dockerfile_kunlunxin
Normal file
@@ -0,0 +1,17 @@
|
||||
# Base image: Cambricon MLU370 runtime with PyTorch 2.5 / torch_mlu 1.24.1
# (Ubuntu 22.04, Python 3.10).
# NOTE(review): the file is named Dockerfile_kunlunxin but the base image and
# README target Cambricon MLU370 — confirm the intended hardware/name.
FROM harbor.4pd.io/hardcore-tech/mlu370x4-pytorchv25.01-torch2.5.0-torchmlu1.24.1-ubuntu22.04-py310:0.0.1

WORKDIR /workspace

# COPY is preferred over ADD for plain local files/directories.
COPY . /workspace

# transformers is installed from the bundled source zip instead of
# git+https://github.com/huggingface/transformers so builds are reproducible
# offline.
RUN pip install --no-cache-dir ./transformers-main.zip

# One layer for the remaining dependencies; --no-cache-dir keeps the image
# small, and quoting the extras spec prevents the shell treating [decord]
# as a glob pattern.
RUN pip install --no-cache-dir \
        accelerate \
        "qwen-vl-utils[decord]==0.0.8" \
        flask==3.1.1

EXPOSE 80

ENTRYPOINT ["python3", "Qwen2.5-VL-32B-Instruct-test.py"]
|
||||
|
||||
|
||||
|
||||
259
Qwen2.5-VL-32B-Instruct-test.py
Normal file
259
Qwen2.5-VL-32B-Instruct-test.py
Normal file
@@ -0,0 +1,259 @@
|
||||
import torch
|
||||
import time
|
||||
import os
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
from flask import Flask, request, jsonify
|
||||
|
||||
# Optional Cambricon MLU backend: import torch_mlu when present, otherwise
# fall back to CPU-only operation.
try:
    import torch_mlu

    print(f"成功导入 torch_mlu,版本: {getattr(torch_mlu, '__version__', 'unknown')}")

    def check_mlu_available():
        """Probe the MLU backend by allocating a tiny tensor on-device."""
        try:
            torch.randn(2, 2).mlu()
        except Exception as e:
            print(f"MLU不可用: {e}")
            return False
        return True

    MLU_AVAILABLE = check_mlu_available()
    print(f"MLU设备可用: {MLU_AVAILABLE}")
except ImportError as e:
    # torch_mlu missing entirely: record the CPU fallback state.
    torch_mlu = None
    MLU_AVAILABLE = False
    print(f"警告: 未找到 torch_mlu 模块: {e}")
|
||||
|
||||
|
||||
# torch.mlu exists only after a successful torch_mlu import; the original
# unconditional call raised AttributeError on CPU-only hosts, crashing the
# service at import time and defeating the fallback set up above.
if hasattr(torch, "mlu"):
    print(f"MLU count: {torch.mlu.device_count()}")
else:
    print("MLU count: 0")
|
||||
|
||||
|
||||
# Cap intra-op CPU parallelism (OpenMP and torch) at four threads.
_CPU_THREADS = 4
os.environ["OMP_NUM_THREADS"] = str(_CPU_THREADS)
torch.set_num_threads(_CPU_THREADS)
|
||||
|
||||
# Qwen-VL specific components; fail fast with an actionable message when the
# third-party packages are missing. Chaining with `from exc` preserves the
# original ImportError (which package, which path) instead of discarding it.
try:
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info
except ImportError as exc:
    raise ImportError("请安装依赖: pip install transformers qwen-vl-utils") from exc
|
||||
|
||||
# Directory holding the model weights; override via the MODEL_PATH env var.
MODEL_PATH = os.getenv("MODEL_PATH", "/models")
|
||||
|
||||
class QwenVLMLUClassifier:
    """Qwen2.5-VL wrapper whose layers are dispatched across devices by
    accelerate's device_map; exposes a single `predict` entry point."""

    def __init__(self, model_path: str):
        # Result of the module-level MLU availability probe.
        self.use_mlu = MLU_AVAILABLE
        print(f"初始化模型,使用设备: {'MLU' if self.use_mlu else 'CPU'}")

        # Load the processor (tokenizer + image preprocessor).
        print(f"从 {model_path} 加载处理器...")
        self.processor = AutoProcessor.from_pretrained(model_path)

        print("加载模型中...")

        # Key point: device_map="auto" lets accelerate shard the model across
        # all visible cards automatically.
        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",  # auto-dispatch across every available MLU card
            # Note: do not enable flash_attention_2 (unsupported on MLU).
        )

        # Never move the model manually once device_map placed it:
        # self.model = self.model.mlu() would break the dispatch hooks.

        self.model = self.model.eval()
        print("模型加载完成,已根据 device_map 分配到设备")

        # Debug aid: print which device each layer landed on.
        print("模型各层设备分布:")
        for name, module in self.model.named_modules():
            if hasattr(module, "weight") and module.weight is not None:
                print(f"{name}: {module.weight.device}")
            elif hasattr(module, "_hf_hook") and hasattr(module._hf_hook, "execution_device"):
                print(f"{name}: {module._hf_hook.execution_device}")
            # NOTE(review): this break fires on the FIRST leaf module reached,
            # so only a prefix of the model is printed — presumably meant to
            # limit debug output; confirm (a `continue` would print all leaves).
            if len(list(module.children())) == 0:
                break

    def predict(self, image: Image.Image, prompt: str = "Describe this image.") -> dict:
        """Run one image+prompt through the model.

        Returns a dict with keys ``response``, ``device_used``,
        ``processing_time`` and ``status`` ("success"), or on failure
        ``response``/``error``/``device_used``/``processing_time``/``status``
        ("error"). Never raises: all exceptions are folded into the error dict.
        """
        start_time = time.perf_counter()

        try:
            # Build the chat-format message payload expected by the processor.
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]

            # Render the chat template to a prompt string (not tokenized yet).
            text = self.processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            # Extract image/video tensors from the message payload.
            image_inputs, video_inputs = process_vision_info(messages)

            # Build the combined model inputs.
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                #truncation=True,  # (disabled) enable truncation
                #max_length=2048,  # (disabled) cap the sequence length
                max_pixels=384*384,  # bound the image size (key for memory)
                return_tensors="pt",
            )

            # Downcast the grid-size index fields from int64 to int32 when
            # present — presumably an MLU int64 limitation; confirm.
            for key in ['image_grid_thw', 'video_grid_thw']:
                if key in inputs and inputs[key].dtype == torch.long:
                    # Clamp into int32 range before the cast.
                    val = inputs[key]
                    if val.max() >= 2147483647 or val.min() < -2147483648:
                        print(f"Warning: {key} out of int32 range, clamping...")
                        val = val.clamp(-2147483648, 2147483647)
                    inputs[key] = val.to(torch.int32)

            # Move inputs to the model's entry device only; generate() then
            # dispatches across the device_map internally — do not send
            # tensors to a specific "mlu:x" by hand.
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            with torch.no_grad():
                ts = time.time()
                generated_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=128,
                    # To force synchronized multi-card generation, optionally:
                    # synced_gpus=True,
                )
                print(f"生成耗时: {time.time() - ts:.3f}s")

            # Strip the prompt tokens so only newly generated tokens remain.
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
            ]
            output_texts = self.processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )
            response = output_texts[0].strip()

            processing_time = round(time.perf_counter() - start_time, 4)

            return {
                "response": response,
                # NOTE(review): hard-coded "mlu" even when use_mlu is False —
                # confirm whether this should mirror the CPU fallback.
                "device_used": "mlu",
                "processing_time": processing_time,
                "status": "success"
            }

        except Exception as e:
            # Error envelope so the HTTP layer can report failures uniformly.
            return {
                "response": "",
                "error": str(e),
                "device_used": "mlu" if self.use_mlu else "cpu",
                "processing_time": 0.0,
                "status": "error"
            }
|
||||
|
||||
|
||||
# Flask application instance.
app = Flask(__name__)
# Global model wrapper; populated in __main__ once loading succeeds.
# Routes treat None as "model not loaded".
classifier = None
|
||||
|
||||
@app.before_request
def ensure_model_loaded():
    """Reject /predict requests with a 500 until the model has been loaded."""
    global classifier
    if classifier is not None or request.endpoint != 'predict':
        return None
    return jsonify({"status": "error", "message": "模型未加载"}), 500
|
||||
|
||||
@app.route('/predict', methods=['POST'])
def predict():
    """Run the VL model on an uploaded image plus an optional form prompt."""
    upload = request.files.get('image')
    if upload is None:
        return jsonify({
            "status": "error",
            "message": "请求中未包含图片"
        }), 400

    try:
        img = Image.open(BytesIO(upload.read())).convert("RGB")
        user_prompt = request.form.get('prompt', 'Describe this image.')
        result = classifier.predict(img, user_prompt)
    except Exception as e:
        # Decoding or inference dispatch failed outright.
        return jsonify({
            "status": "error",
            "error": f"处理失败: {str(e)}"
        }), 500

    if result["status"] != "success":
        return jsonify({
            "status": "error",
            "error": result["error"],
            "device_used": result["device_used"]
        }), 500

    return jsonify({
        "status": "success",
        "response": result["response"],
        "device_used": result["device_used"],
        "processing_time": result["processing_time"]
    })
|
||||
|
||||
@app.route('/health', methods=['GET'])
def health_check():
    """Health probe reporting model-load status and MLU availability."""
    model_ready = classifier is not None
    return jsonify({
        "status": "healthy" if model_ready else "unhealthy",
        "model_loaded": model_ready,
        "using_mlu": classifier.use_mlu if classifier else False,
        "mlu_available": MLU_AVAILABLE,
        "timestamp": time.time()
    })
|
||||
|
||||
@app.route('/info', methods=['GET'])
def device_info():
    """Report model path, library versions, and current device usage."""
    mlu_version = getattr(torch_mlu, '__version__', None) if torch_mlu else None
    payload = {
        "model": MODEL_PATH,
        "torch_version": torch.__version__,
        "torch_mlu_version": mlu_version,
        "mlu_available": MLU_AVAILABLE,
        "using_device": "mlu" if classifier and classifier.use_mlu else "cpu",
        "model_loaded": classifier is not None,
        "timestamp": time.time()
    }
    return jsonify(payload)
|
||||
|
||||
@app.route('/test', methods=['GET'])
def test_mlu():
    """Smoke-test the MLU with a trivial on-device tensor addition."""
    if not MLU_AVAILABLE:
        return jsonify({"status": "error", "message": "MLU不可用"}), 500
    try:
        tensor = torch.randn(2, 2).mlu()
        doubled = tensor + tensor
        payload = {"status": "success", "result": doubled.cpu().tolist()}
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500
    return jsonify(payload)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Load the model eagerly before serving so /predict never races the load.
    print(f"正在加载模型: {MODEL_PATH}")
    try:
        classifier = QwenVLMLUClassifier(MODEL_PATH)
        print("模型加载成功,启动 Flask 服务...")
        app.run(host='0.0.0.0', port=80, debug=False)
    except Exception as e:
        print(f" 模型加载失败: {e}")
        # raise SystemExit instead of the interactive `exit()` helper, which
        # is injected by site.py and not guaranteed in all run modes (e.g. -S).
        raise SystemExit(1)
|
||||
38
README.md
Normal file
38
README.md
Normal file
@@ -0,0 +1,38 @@
|
||||
# enginex-mlu370-vl-qwen
|
||||
|
||||
运行于寒武纪 mlu370 算力卡的【视觉多模态理解】引擎,支持 Qwen2.5-VL-7B-Instruct、Qwen2.5-VL-32B-Instruct、Qwen2.5-VL-72B-Instruct 模型
|
||||
|
||||
## QuickStart
|
||||
|
||||
1、从 modelscope上下载模型 Qwen2.5-VL-7B-Instruct、Qwen2.5-VL-32B-Instruct、Qwen2.5-VL-72B-Instruct
|
||||
到目录 /mnt/contest_ceph/zhoushasha/models/Qwen/Qwen2.5-VL-7B-Instruct
|
||||
|
||||
2、使用Dockerfile生成镜像
|
||||
使用 Dockerfile 生成 镜像
|
||||
```bash
|
||||
docker build -f Dockerfile_kunlunxin -t test-cambricon:Qwen2.5-VL-32B-Instruct .
|
||||
```
|
||||
|
||||
3、启动docker
|
||||
```bash
|
||||
docker run -it --privileged \
|
||||
-p 10091:80 \
|
||||
--device=/dev/cambricon_dev0:/dev/cambricon_dev0 \
|
||||
--device=/dev/cambricon_dev1:/dev/cambricon_dev1 \
|
||||
--device=/dev/cambricon_dev2:/dev/cambricon_dev2 \
|
||||
--device=/dev/cambricon_dev3:/dev/cambricon_dev3 \
|
||||
--device=/dev/cambricon_ctl \
|
||||
--device=/dev/cambricon_ipcm0:/dev/cambricon_ipcm0 \
|
||||
--device=/dev/cambricon_ipcm1:/dev/cambricon_ipcm1 \
|
||||
--device=/dev/cambricon_ipcm2:/dev/cambricon_ipcm2 \
|
||||
--device=/dev/cambricon_ipcm3:/dev/cambricon_ipcm3 \
|
||||
-v /mnt/contest_ceph/zhoushasha/models/Qwen/Qwen2.5-VL-32B-Instruct:/models:ro \
|
||||
test-cambricon:Qwen2.5-VL-32B-Instruct
|
||||
```
|
||||
|
||||
4、测试服务
|
||||
```bash
|
||||
curl -X POST http://localhost:10091/predict \
|
||||
-F "image=@demo.jpeg" \
|
||||
-F "prompt=What is happening in this image?"
|
||||
```
|
||||
Reference in New Issue
Block a user