update
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
*.zip
|
||||
BIN
026_0010.jpg
Executable file
BIN
026_0010.jpg
Executable file
Binary file not shown.
|
After Width: | Height: | Size: 26 KiB |
20
Dockerfile_mlu370
Normal file
20
Dockerfile_mlu370
Normal file
@@ -0,0 +1,20 @@
|
||||
# Serving image for the MLU370 visual-classification engine.
# Base image bundles Cambricon MLU370 drivers + PyTorch 2.1 on Ubuntu 22.04 / py310.
FROM harbor.4pd.io/mic-llm-x/combricon-mlu370x4-base:v0.2.0-tgiv1.4.3-btv0.6.0-pt2.1-x86_64-ubuntu22.04-py310

# Fail the build early if the base image's PyTorch virtualenv is missing.
RUN if [ ! -d "/torch/venv3/pytorch" ]; then echo "虚拟环境路径缺失!"; exit 1; fi

WORKDIR /workspace/

# Service entry point script and the pre-downloaded BEiT model weights.
COPY ./model_test_caltech_http_mlu370.py /workspace/
COPY ./microsoft_beit_base_patch16_224_pt22k_ft22k /model

# Install transformers 4.46.3 plus the HTTP serving stack.
RUN pip install transformers==4.46.3 uvicorn\[standard\] fastapi flask==3.1.1

EXPOSE 80

ENTRYPOINT ["python3", "model_test_caltech_http_mlu370.py"]
|
||||
|
||||
|
||||
|
||||
36
README.md
36
README.md
@@ -1,2 +1,36 @@
|
||||
# enginex-mlu370-vc
|
||||
# enginex-mlu370-vc-cnn
|
||||
|
||||
运行于【寒武纪】系列算力卡的【视觉分类】引擎,基于 CNN 架构,支持 BEiT、MobileViT 等流行模型
|
||||
|
||||
## QuickStart
|
||||
|
||||
1、从 modelscope上下载视觉分类的模型,例如 microsoft/beit-base-patch16-224
|
||||
```bash
|
||||
modelscope download --model microsoft/beit-base-patch16-224 README.md --local_dir /mnt/contest_ceph/zhoushasha/models/microsoft/beit_base_patch16_224_pt22k_ft22k
|
||||
```
|
||||
|
||||
2、使用Dockerfile生成镜像
|
||||
从仓库的【软件包】栏目下载基础镜像 combricon-mlu370x4-base:v0.2.0-tgiv1.4.3-btv0.6.0-pt2.1-x86_64-ubuntu22.04-py310
|
||||
使用 Dockerfile_mlu370 生成 镜像
|
||||
```bash
|
||||
docker build -f Dockerfile_mlu370 -t combricon-mlu370x4-base-zhoushasha:v4.0 .
|
||||
```
|
||||
注意 Dockerfile_mlu370 中已预先将模型 microsoft_beit_base_patch16_224_pt22k_ft22k 放在了 /model 下面
|
||||
|
||||
3、启动docker
|
||||
```bash
|
||||
docker run -it \
|
||||
-p 10091:80 \
|
||||
--privileged \
|
||||
--name test_mlu370 \
|
||||
--device /dev/cambricon_ctl \
|
||||
--device /dev/cambricon[0-3] \
|
||||
-v /usr/bin/cnmon:/usr/bin/cnmon \
|
||||
combricon-mlu370x4-base-zhoushasha:v4.0
|
||||
```
|
||||
|
||||
4、测试服务
|
||||
```bash
|
||||
curl -X POST http://localhost:10091/v1/private/s782b4996 \
|
||||
  -F "image=@/home/zhoushasha/models/026_0010.jpg"
|
||||
```
|
||||
|
||||
378
model_test_caltech_http_mlu370.py
Normal file
378
model_test_caltech_http_mlu370.py
Normal file
@@ -0,0 +1,378 @@
|
||||
import torch
|
||||
import time
|
||||
import os
|
||||
from PIL import Image
|
||||
from transformers import AutoImageProcessor, AutoModelForImageClassification
|
||||
from flask import Flask, request, jsonify
|
||||
from io import BytesIO
|
||||
|
||||
# Import Cambricon MLU support. ``torch_mlu`` extends torch with the
# ``.mlu()`` device methods; if it is absent or fails to initialise, the
# service degrades to CPU-only mode (MLU_AVAILABLE=False, ct=None).
try:
    import torch_mlu
    print(f"成功导入torch_mlu,版本: {getattr(torch_mlu, '__version__', 'unknown')}")

    def check_mlu_available():
        """Return True iff a tensor can actually be allocated on an MLU device."""
        try:
            torch.randn(2, 2).mlu()
            return True
        except Exception:  # fix: bare ``except:`` also swallowed KeyboardInterrupt/SystemExit
            return False

    def get_mlu_device_count():
        """Probe device indices 0..7 in order and count the usable MLU devices."""
        max_devices_to_check = 8
        available_devices = 0
        for i in range(max_devices_to_check):
            try:
                torch.randn(2, 2).mlu(i)
                available_devices += 1
                print(f"MLU设备 {i} 可用")
            except Exception:  # fix: narrowed from a bare ``except:``
                # Device indices are assumed contiguous; stop at the first failure.
                break
        return available_devices

    def get_device_name(device_index):
        """Return a human-readable label for the MLU device at *device_index*."""
        # The original wrapped this f-string in try/except, but formatting an
        # int cannot raise — the fallback branch was unreachable and is removed.
        return f"MLU-Device-{device_index}"

    class MLUModel:
        """Minimal stand-in for the ``torch_mlu`` helper module (``ct``)."""

        @staticmethod
        def is_mlu_available():
            return check_mlu_available()

        @staticmethod
        def device_count():
            return get_mlu_device_count()

        @staticmethod
        def get_device_name(device_index):
            return get_device_name(device_index)

    ct = MLUModel()
    MLU_AVAILABLE = check_mlu_available()
    print(f"MLU设备可用: {MLU_AVAILABLE}")
    print(f"检测到 {ct.device_count()} 个MLU设备")

except ImportError:
    # torch_mlu is not installed at all: plain CPU environment.
    torch_mlu = None
    ct = None
    MLU_AVAILABLE = False
    print("警告: 未找到torch_mlu模块,无法使用MLU设备")
except Exception as e:
    # torch_mlu imported but failed during initialisation (driver/runtime issue).
    torch_mlu = None
    ct = None
    MLU_AVAILABLE = False
    print(f"MLU初始化警告: {str(e)}")
|
||||
|
||||
# Cap intra-op parallelism at 4 threads for every math backend the stack
# might use (OpenMP, MKL, numexpr, OpenBLAS, Accelerate), then tell torch
# the same thing explicitly.
_THREAD_ENV_VARS = (
    "OMP_NUM_THREADS",
    "MKL_NUM_THREADS",
    "NUMEXPR_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
)
for _var in _THREAD_ENV_VARS:
    os.environ[_var] = "4"
torch.set_num_threads(4)
|
||||
|
||||
class MLUImageClassifier:
    """Image classifier running a HuggingFace model on a Cambricon MLU,
    falling back to CPU when no MLU device is usable."""

    def __init__(self, model_path: str):
        # Decide the device up front; everything else keys off ``self.use_mlu``.
        self.use_mlu = self._check_mlu_availability()
        print(f"使用设备: {'MLU' if self.use_mlu else 'CPU'}")

        # Load the preprocessing pipeline and the classification model.
        self.processor = AutoImageProcessor.from_pretrained(model_path)
        self.model = self._load_model(model_path)
        self.id2label = self.model.config.id2label

        # Sanity-check that the model is reachable on the chosen device.
        self._verify_model_device()

    def _check_mlu_availability(self):
        """Return True when torch_mlu is importable and a tensor op succeeds on MLU."""
        if torch_mlu is None:
            print("MLU不可用: torch_mlu模块未找到")
            return False
        try:
            test_tensor = torch.randn(2, 2).mlu()
            # Exercise an actual kernel, not just allocation.
            test_result = test_tensor + test_tensor
            print("MLU设备可用性测试通过")
            return True
        except Exception as e:
            print(f"MLU设备测试失败: {e}")
            return False

    def _load_model(self, model_path: str) -> AutoModelForImageClassification:
        """Load the model in float32, move it to MLU (or CPU), return it in eval mode.

        Raises:
            RuntimeError: when the model cannot be loaded on any device.
        """
        try:
            # Materialise on CPU first; moving afterwards is more stable than
            # loading directly onto the accelerator.
            model = AutoModelForImageClassification.from_pretrained(
                model_path,
                torch_dtype=torch.float32,
            )

            if self.use_mlu:
                model = model.cpu()  # ensure a clean starting point
                model = model.mlu()
                print("模型成功加载到MLU设备")
            else:
                model = model.cpu()
                print("模型加载到CPU设备")

            return model.eval()

        except Exception as e:
            print(f"模型加载失败: {str(e)}")
            # Attempt a CPU fallback. NOTE: if ``from_pretrained`` itself
            # failed, ``model`` is unbound here; the resulting error is caught
            # below and surfaced as a RuntimeError.
            try:
                model = model.cpu()
                print("Fallback到CPU模式")
                return model.eval()
            except Exception:  # fix: was a bare ``except:``
                raise RuntimeError(f"模型加载完全失败: {str(e)}") from e

    def _verify_model_device(self):
        """Touch one parameter to verify the model is usable on its device."""
        try:
            param = next(self.model.parameters())
            if self.use_mlu:
                # A trivial arithmetic op fails fast if the device is broken.
                test_output = param + 0
                print("MLU模型验证成功")
            else:
                print("CPU模型验证成功")
        except StopIteration:
            print("警告: 模型没有可训练参数")
        except Exception as e:
            print(f"模型验证警告: {e}")

    def _predict_with_mlu(self, image) -> dict:
        """Run classification on a PIL image; returns a result dict, never raises.

        On failure the dict carries ``class_id == -1`` and an ``"error"`` key.
        """
        try:
            start_time = time.perf_counter()

            # Preprocess to model-ready tensors.
            inputs = self.processor(images=image, return_tensors="pt")

            if self.use_mlu:
                # Move every tensor in the processor output to the MLU.
                inputs_mlu = {}
                for key, value in inputs.items():
                    if hasattr(value, 'mlu'):
                        inputs_mlu[key] = value.mlu()
                    else:
                        inputs_mlu[key] = value

                with torch.no_grad():
                    # Warm-up pass (kernel compilation / caching).
                    ts = time.time()
                    outputs = self.model(**inputs_mlu)
                    print('mlu370 T1', time.time() - ts, flush=True)

                    # NOTE(review): 800 extra forward passes per request is a
                    # benchmarking artefact — consider removing for production.
                    ts = time.time()
                    for i in range(800):
                        outputs = self.model(**inputs_mlu)
                    print('mlu370 T2', time.time() - ts, flush=True)
            else:
                # CPU inference path.
                with torch.no_grad():
                    ts = time.time()
                    outputs = self.model(**inputs)
                    print('cpu T1', time.time() - ts, flush=True)

                    ts = time.time()
                    outputs = self.model(**inputs)
                    print('cpu T2', time.time() - ts, flush=True)

            # Top-1 class and its softmax probability.
            logits = outputs.logits
            probs = torch.nn.functional.softmax(logits, dim=-1)
            max_prob, max_idx = probs.max(dim=-1)
            class_idx = max_idx.item()

            processing_time = round(time.perf_counter() - start_time, 6)

            return {
                "class_id": class_idx,
                "class_name": self.id2label.get(class_idx, f"class_{class_idx}"),
                "confidence": float(max_prob.item()),
                "device_used": "mlu" if self.use_mlu else "cpu",
                "processing_time": processing_time
            }
        except Exception as e:
            return {
                "class_id": -1,
                "class_name": "error",
                "confidence": 0.0,
                "device_used": "mlu" if self.use_mlu else "cpu",
                "processing_time": 0.0,
                "error": str(e)
            }

    def predict(self, image) -> dict:
        """Public prediction entry point; see :meth:`_predict_with_mlu`."""
        return self._predict_with_mlu(image)
|
||||
|
||||
# HTTP application object.
app = Flask(__name__)

# Load the model exactly once, at import time. If loading fails the service
# still starts, but every endpoint reports the degraded state via
# ``classifier is None``.
MODEL_PATH = os.environ.get("MODEL_PATH", "/model")
try:
    print(f"从路径加载模型: {MODEL_PATH}")
    classifier = MLUImageClassifier(MODEL_PATH)
    print("模型加载成功")
except Exception as e:
    print(f"服务初始化失败: {str(e)}")
    classifier = None
|
||||
|
||||
def _error_result(message: str, device_used: str) -> dict:
    """Build the standard error payload returned by the prediction endpoint."""
    return {
        "class_id": -1,
        "class_name": "error",
        "confidence": 0.0,
        "device_used": device_used,
        "processing_time": 0.0,
        "error": message,
    }


@app.route('/v1/private/s782b4996', methods=['POST'])
def predict():
    """Accept one uploaded image (multipart field ``image``) and return the
    classification result as JSON.

    Responses:
        200 -- {"status": "success", "prediction": {...}}
        400 -- the request carried no ``image`` file
        500 -- service not initialised, or inference failed
    """
    if classifier is None:
        return jsonify({
            "status": "error",
            "prediction": _error_result("服务未初始化成功", "unknown"),
        }), 500

    # classifier is known non-None from here on.
    device = "mlu" if classifier.use_mlu else "cpu"

    if 'image' not in request.files:
        return jsonify({
            "status": "error",
            "prediction": _error_result("请求中未包含图片", device),
        }), 400

    try:
        image_file = request.files['image']
        image = Image.open(BytesIO(image_file.read())).convert("RGB")
        result = classifier.predict(image)

        # ``predict`` never raises; failures are signalled via an "error" key.
        if 'error' in result:
            return jsonify({
                "status": "error",
                "prediction": result
            }), 500
        return jsonify({
            "status": "success",
            "prediction": result
        })
    except Exception as e:
        return jsonify({
            "status": "error",
            "prediction": _error_result(f"处理图片失败: {str(e)}", device),
        }), 500
|
||||
|
||||
@app.route('/health', methods=['GET'])
def health_check():
    """Health-check endpoint: reports MLU availability and model state."""
    available = False
    info = {}

    can_probe = torch_mlu is not None and hasattr(ct, 'is_mlu_available')
    if can_probe:
        try:
            available = ct.is_mlu_available()
            info = {
                "device_count": ct.device_count(),
                "devices": [ct.get_device_name(i) for i in range(ct.device_count())]
            }
        except Exception as e:
            info["error"] = str(e)

    model_ready = classifier is not None
    return jsonify({
        "status": "healthy" if model_ready else "degraded",
        "mlu_available": available,
        "mlu_info": info,
        "model_loaded": model_ready,
        "using_mlu": classifier.use_mlu if classifier else False,
        "timestamp": time.time()
    })
|
||||
|
||||
@app.route('/test', methods=['GET'])
def test_mlu():
    """Smoke-test endpoint: performs one tensor addition on the MLU."""
    try:
        if torch_mlu is None:
            return jsonify({
                "status": "error",
                "message": "torch_mlu模块未找到",
                "mlu_working": False
            }), 500

        # Allocate on the MLU, add, then copy the result back to host memory.
        test_tensor = torch.randn(3, 3).mlu()
        result_tensor = test_tensor + test_tensor
        result_cpu = result_tensor.cpu()

        payload = {
            "status": "success",
            "message": "MLU测试通过",
            "result_shape": str(result_cpu.shape),
            "mlu_working": True
        }
        return jsonify(payload)
    except Exception as e:
        return jsonify({
            "status": "error",
            "message": f"MLU测试失败: {str(e)}",
            "mlu_working": False
        }), 500
|
||||
|
||||
@app.route('/info', methods=['GET'])
def device_info():
    """Device/runtime information endpoint."""
    return jsonify({
        "pytorch_version": torch.__version__,
        "torch_mlu_available": torch_mlu is not None,
        "mlu_devices_count": ct.device_count() if torch_mlu and hasattr(ct, 'device_count') else 0,
        "model_loaded": classifier is not None,
        "using_mlu": classifier.use_mlu if classifier else False,
        "system_time": time.time()
    })
|
||||
|
||||
if __name__ == "__main__":
    # Serve with Flask's built-in server; the container maps host port 80.
    print("启动MLU图像分类服务...")
    app.run(host="0.0.0.0", port=80, debug=False)
|
||||
Reference in New Issue
Block a user