diff --git a/docs/source/tutorials/hardwares/310p.md b/docs/source/tutorials/hardwares/310p.md index 2cbbc40b..1e32995b 100644 --- a/docs/source/tutorials/hardwares/310p.md +++ b/docs/source/tutorials/hardwares/310p.md @@ -49,6 +49,25 @@ export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 ### Online Inference on NPU +```{warning} +For Atlas 300I (310P), do not rely on `max-model-len` auto detection +(i.e., by omitting `--max-model-len`), because it may cause OOM. + +Reason (current 310P attention path): +- `AscendAttentionMetadataBuilder310` passes `model_config.max_model_len` + to `AttentionMaskBuilder310`. +- `AttentionMaskBuilder310` builds a full causal mask with shape + `[max_model_len, max_model_len]` in float16, then casts it to FRACTAL_NZ. +- In 310P `attention_v1` prefill/chunked-prefill + (`_npu_flash_attention` / `_npu_paged_attention_splitfuse`), + this explicit mask tensor is consumed directly, and there is no + compressed-mask path. + +So if auto detection resolves to a large context length, the mask allocation +(`O(max_model_len^2)`) can exceed NPU memory and trigger OOM. +Always set a conservative explicit value, for example `--max-model-len 4096`. +``` + Run the following script to start the vLLM server on NPU (Qwen3-0.6B:1 card, Qwen2.5-7B-Instruct:2 cards, Pangu-Pro-MoE-72B: 8 cards): :::::{tab-set} @@ -64,9 +83,9 @@ Run the following command to start the vLLM server: :substitutions: vllm serve Qwen/Qwen3-0.6B \ --tensor-parallel-size 1 \ + --max-model-len 4096 \ --enforce-eager \ - --dtype float16 \ - --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}' + --dtype float16 ``` Once your server is started, you can query the model with input prompts. 
@@ -94,9 +113,9 @@ Run the following command to start the vLLM server: :substitutions: vllm serve Qwen/Qwen2.5-7B-Instruct \ --tensor-parallel-size 2 \ + --max-model-len 4096 \ --enforce-eager \ - --dtype float16 \ - --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}' + --dtype float16 ``` Once your server is started, you can query the model with input prompts. @@ -124,9 +143,9 @@ Run the following command to start the vLLM server: :substitutions: vllm serve Qwen/Qwen2.5-VL-3B-Instruct \ --tensor-parallel-size 1 \ + --max-model-len 4096 \ --enforce-eager \ - --dtype float16 \ - --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}' + --dtype float16 ``` Once your server is started, you can query the model with input prompts. @@ -183,9 +202,9 @@ sampling_params = SamplingParams(max_completion_tokens=100, temperature=0.0) llm = LLM( model="Qwen/Qwen3-0.6B", tensor_parallel_size=1, + max_model_len=4096, enforce_eager=True, # For 300I series, only eager mode is supported. dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series - compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series ) # Generate texts from the prompts. outputs = llm.generate(prompts, sampling_params) @@ -226,9 +245,9 @@ sampling_params = SamplingParams(max_completion_tokens=100, temperature=0.0) llm = LLM( model="Qwen/Qwen2.5-7B-Instruct", tensor_parallel_size=2, + max_model_len=4096, enforce_eager=True, # For 300I series, only eager mode is supported. dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series - compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series ) # Generate texts from the prompts. 
outputs = llm.generate(prompts, sampling_params) @@ -269,9 +288,9 @@ sampling_params = SamplingParams(max_completion_tokens=100, top_p=0.95, top_k=50 llm = LLM( model="Qwen/Qwen2.5-VL-3B-Instruct", tensor_parallel_size=1, + max_model_len=4096, enforce_eager=True, # For 300I series, only eager mode is supported. dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series - compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series ) # Generate texts from the prompts. outputs = llm.generate(prompts, sampling_params)