[Docs] Update Atlas 300I series doc and fix CI lint (#1537)
### What this PR does / why we need it?
- Update Atlas 300I series doc: clean up unused parameters and enable optimized ops
- Fix codespell CI

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

---------

Signed-off-by: leo-pony <nengjunma@outlook.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
.github/workflows/doc_codespell.yaml (vendored):
```diff
@@ -28,6 +28,6 @@ jobs:
       - name: Run codespell check
         run: |
           CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**')
-          CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn')
+          CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,assertIn')
 
           codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}"
```
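The same check can be reproduced before pushing; a minimal local sketch, assuming `codespell` is installed (`pip install codespell`) and the repository root is the working directory:

```bash
# Mirror the CI step locally; arrays copied from the workflow above.
CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**')
CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,assertIn')
codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}"
```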
Atlas 300I series tutorial, online serving (Qwen3-0.6B):

````diff
@@ -61,31 +61,24 @@ Run the following command to start the vLLM server:
 ```{code-block} bash
 :substitutions:
 export VLLM_USE_V1=1
-export MODEL="Qwen/Qwen3-0.6B"
-python -m vllm.entrypoints.api_server \
-    --model $MODEL \
+vllm serve Qwen/Qwen3-0.6B \
     --tensor-parallel-size 1 \
-    --max-num-batched-tokens 2048 \
-    --gpu-memory-utilization 0.5 \
-    --max-num-seqs 4 \
     --enforce-eager \
-    --trust-remote-code \
-    --max-model-len 1024 \
-    --disable-custom-all-reduce \
     --dtype float16 \
-    --port 8000 \
-    --compilation-config '{"custom_ops":["+rms_norm", "+rotary_embedding"]}'
+    --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}'
 ```
 
 Once your server is started, you can query the model with input prompts
 
 ```bash
-curl http://localhost:8000/generate \
+curl http://localhost:8000/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "prompt": "Hello, my name is ?",
-        "max_tokens": 20,
-        "temperature": 0
+        "prompt": "The future of AI is",
+        "max_tokens": 64,
+        "top_p": 0.95,
+        "top_k": 50,
+        "temperature": 0.6
     }'
 ```
 ::::
````
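Before querying, it can be worth confirming the server is actually up; a small sketch against vLLM's OpenAI-compatible API, assuming the default port 8000 from the command above:

```bash
# List the served models; a successful response confirms the server is ready.
curl -s http://localhost:8000/v1/models | python3 -m json.tool
```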
````diff
@@ -98,31 +91,24 @@ Run the following command to start the vLLM server:
 ```{code-block} bash
 :substitutions:
 export VLLM_USE_V1=1
-export MODEL="Qwen/Qwen2.5-7B-Instruct"
-python -m vllm.entrypoints.api_server \
-    --model $MODEL \
+vllm serve Qwen/Qwen2.5-7B-Instruct \
     --tensor-parallel-size 2 \
-    --max-num-batched-tokens 2048 \
-    --gpu-memory-utilization 0.5 \
-    --max-num-seqs 4 \
     --enforce-eager \
-    --trust-remote-code \
-    --max-model-len 1024 \
-    --disable-custom-all-reduce \
     --dtype float16 \
-    --port 8000 \
-    --compilation-config '{"custom_ops":["+rms_norm", "+rotary_embedding"]}'
+    --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}'
 ```
 
 Once your server is started, you can query the model with input prompts
 
 ```bash
-curl http://localhost:8000/generate \
+curl http://localhost:8000/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "prompt": "Hello, my name is ?",
-        "max_tokens": 20,
-        "temperature": 0
+        "prompt": "The future of AI is",
+        "max_tokens": 64,
+        "top_p": 0.95,
+        "top_k": 50,
+        "temperature": 0.6
     }'
 ```
 
````
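Since Qwen2.5-7B-Instruct is a chat-tuned model, the OpenAI-compatible chat endpoint is a natural alternative to `/v1/completions`; a hedged sketch, assuming the same server on port 8000:

```bash
# Chat-style request; the "model" field names the model being served.
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "Qwen/Qwen2.5-7B-Instruct",
        "messages": [{"role": "user", "content": "What does the future of AI look like?"}],
        "max_tokens": 64,
        "temperature": 0.6
    }'
```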
```diff
@@ -206,14 +192,10 @@ sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
 # Create an LLM.
 llm = LLM(
     model="Qwen/Qwen3-0.6B",
-    max_model_len=4096,
-    max_num_seqs=4,
-    trust_remote_code=True,
     tensor_parallel_size=1,
     enforce_eager=True, # For 300I series, only eager mode is supported.
     dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series
-    disable_custom_all_reduce=True, # IMPORTANT cause 300I series needed
-    compilation_config={"custom_ops":["+rms_norm", "+rotary_embedding"]}, # IMPORTANT cause 300I series needed custom ops
+    compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series
 )
 # Generate texts from the prompts.
 outputs = llm.generate(prompts, sampling_params)
```
```diff
@@ -253,14 +235,10 @@ sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
 # Create an LLM.
 llm = LLM(
     model="Qwen/Qwen2.5-7B-Instruct",
-    max_model_len=4096,
-    max_num_seqs=4,
-    trust_remote_code=True,
     tensor_parallel_size=2,
     enforce_eager=True, # For 300I series, only eager mode is supported.
     dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series
-    disable_custom_all_reduce=True, # IMPORTANT cause 300I series needed
-    compilation_config={"custom_ops":["+rms_norm", "+rotary_embedding"]}, # IMPORTANT cause 300I series needed custom ops
+    compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series
 )
 # Generate texts from the prompts.
 outputs = llm.generate(prompts, sampling_params)
```
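To run the two-card offline example, one approach is to pin the visible NPUs before launching; a sketch in which the script filename is hypothetical and `ASCEND_RT_VISIBLE_DEVICES` is the Ascend runtime's device-selection variable:

```bash
# Restrict vLLM to NPUs 0 and 1, then run the offline-inference script
# (save the Python example above as offline_inference_npu.py first).
export ASCEND_RT_VISIBLE_DEVICES=0,1
python offline_inference_npu.py
```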
Lint script, codespell section:

```diff
@@ -145,7 +145,7 @@ CODESPELL_EXCLUDES=(
 )
 
 CODESPELL_IGNORE_WORDS=(
-    '-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn'
+    '-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,assertIn'
 )
 
 # check spelling of specified files
```