diff --git a/Dockerfile b/Dockerfile
index ffc8f28..766e0f0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM wjie520/vllm_kunlun:base_v0.0.2
+FROM vllm_kunlun:custom_base_v0.0.3
 
 WORKDIR /workspace
 
diff --git a/README.md b/README.md
index bf0b8ef..69a8e16 100644
--- a/README.md
+++ b/README.md
@@ -11,11 +11,12 @@ One of the key features of this project is efficient memory coordination, enabli
 
 ### Build from Dockerfile
 
-Clone this repository:
+1. Get or build the base image (with customized xpytorch, ops, etc.) and tag it `vllm_kunlun:custom_base_v0.0.3`, the tag the `Dockerfile` expects. See the [installation guide](https://vllm-kunlun.readthedocs.io/en/latest/installation.html).
 
-```bash
-docker build -t $build_image -f ./Dockerfile .
-```
+2. Clone this repository and build the image:
+    ```bash
+    docker build -t $build_image -f ./Dockerfile .
+    ```
 
 ## Usage
 
diff --git a/vllm_kunlun/platforms/kunlun.py b/vllm_kunlun/platforms/kunlun.py
index b5074ef..dada458 100644
--- a/vllm_kunlun/platforms/kunlun.py
+++ b/vllm_kunlun/platforms/kunlun.py
@@ -8,9 +8,6 @@ import vllm.envs as envs
 from vllm.logger import init_logger
 
 
-# fix bfloat16 double size issue
-torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
-
 logger = init_logger(__name__)
 
 class KunlunPlatform(Platform):