diff --git a/.github/workflows/doc_codespell.yaml b/.github/workflows/doc_codespell.yaml index 3b7a9d2..930603c 100644 --- a/.github/workflows/doc_codespell.yaml +++ b/.github/workflows/doc_codespell.yaml @@ -28,6 +28,6 @@ jobs: - name: Run codespell check run: | CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**') - CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn') + CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,assertIn') codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}" diff --git a/docs/source/tutorials/single_node_300i.md b/docs/source/tutorials/single_node_300i.md index 5f3c86b..7ea2bd9 100644 --- a/docs/source/tutorials/single_node_300i.md +++ b/docs/source/tutorials/single_node_300i.md @@ -61,31 +61,24 @@ Run the following command to start the vLLM server: ```{code-block} bash :substitutions: export VLLM_USE_V1=1 -export MODEL="Qwen/Qwen3-0.6B" -python -m vllm.entrypoints.api_server \ - --model $MODEL \ +vllm serve Qwen/Qwen3-0.6B \ --tensor-parallel-size 1 \ - --max-num-batched-tokens 2048 \ - --gpu-memory-utilization 0.5 \ - --max-num-seqs 4 \ --enforce-eager \ - --trust-remote-code \ - --max-model-len 1024 \ - --disable-custom-all-reduce \ --dtype float16 \ - --port 8000 \ - --compilation-config '{"custom_ops":["+rms_norm", "+rotary_embedding"]}' + --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}' ``` Once your server is started, you can query the model with input prompts ```bash -curl http://localhost:8000/generate \ +curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "prompt": "Hello, my name is ?", - "max_tokens": 20, - "temperature": 0 + "prompt": "The future of AI is", + "max_tokens": 64, + "top_p": 0.95, + "top_k": 50, + "temperature": 0.6 }' ``` :::: @@ -98,31 +91,24 @@ Run the following command to start the vLLM server: ```{code-block} bash :substitutions: export VLLM_USE_V1=1 -export MODEL="Qwen/Qwen2.5-7B-Instruct" -python -m vllm.entrypoints.api_server \ - --model $MODEL \ +vllm serve Qwen/Qwen2.5-7B-Instruct \ --tensor-parallel-size 2 \ - --max-num-batched-tokens 2048 \ - --gpu-memory-utilization 0.5 \ - --max-num-seqs 4 \ --enforce-eager \ - --trust-remote-code \ - --max-model-len 1024 \ - --disable-custom-all-reduce \ --dtype float16 \ - --port 8000 \ - --compilation-config '{"custom_ops":["+rms_norm", "+rotary_embedding"]}' + --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}' ``` Once your server is started, you can query the model with input prompts ```bash -curl http://localhost:8000/generate \ +curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "prompt": "Hello, my name is ?", - "max_tokens": 20, - "temperature": 0 + "prompt": "The future of AI is", + "max_tokens": 64, + "top_p": 0.95, + "top_k": 50, + "temperature": 0.6 }' ``` @@ -206,14 +192,10 @@ sampling_params = SamplingParams(max_tokens=100, temperature=0.0) # Create an LLM. llm = LLM( model="Qwen/Qwen3-0.6B", - max_model_len=4096, - max_num_seqs=4, - trust_remote_code=True, tensor_parallel_size=1, enforce_eager=True, # For 300I series, only eager mode is supported. dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series - disable_custom_all_reduce=True, # IMPORTANT cause 300I series needed - compilation_config={"custom_ops":["+rms_norm", "+rotary_embedding"]}, # IMPORTANT cause 300I series needed custom ops + compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series ) # Generate texts from the prompts. outputs = llm.generate(prompts, sampling_params) @@ -253,14 +235,10 @@ sampling_params = SamplingParams(max_tokens=100, temperature=0.0) # Create an LLM. llm = LLM( model="Qwen/Qwen2.5-7B-Instruct", - max_model_len=4096, - max_num_seqs=4, - trust_remote_code=True, tensor_parallel_size=2, enforce_eager=True, # For 300I series, only eager mode is supported. dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series - disable_custom_all_reduce=True, # IMPORTANT cause 300I series needed - compilation_config={"custom_ops":["+rms_norm", "+rotary_embedding"]}, # IMPORTANT cause 300I series needed custom ops + compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series ) # Generate texts from the prompts. outputs = llm.generate(prompts, sampling_params) diff --git a/format.sh b/format.sh index 2a00ce3..de083c1 100755 --- a/format.sh +++ b/format.sh @@ -145,7 +145,7 @@ CODESPELL_EXCLUDES=( ) CODESPELL_IGNORE_WORDS=( - '-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn' + '-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,assertIn' ) # check spelling of specified files