diff --git a/docs/source/tutorials/310p.md b/docs/source/tutorials/310p.md
index a157cd1e..c32347c2 100644
--- a/docs/source/tutorials/310p.md
+++ b/docs/source/tutorials/310p.md
@@ -3,8 +3,6 @@
 ```{note}
 1. This Atlas 300I series is currently experimental. In future versions, there may be behavioral changes related to model coverage and performance improvement.
 2. Currently, the 310I series only supports eager mode and the float16 data type.
-3. There are some known issues for running vLLM on 310p series, you can refer to vllm-ascend [#3316](https://github.com/vllm-project/vllm-ascend/issues/3316) and
-   [#2795](https://github.com/vllm-project/vllm-ascend/issues/2795). You can use v0.10.0rc1 version first.
 ```
 
 ## Run vLLM on Atlas 300I Series
@@ -145,47 +143,6 @@ curl http://localhost:8000/v1/completions \
     }'
 ```
 
-::::
-
-::::{tab-item} Pangu-Pro-MoE-72B
-:sync: pangu
-
-Download the model:
-
-```bash
-git lfs install
-git clone https://gitcode.com/ascend-tribe/pangu-pro-moe-model.git
-```
-
-Run the following command to start the vLLM server:
-
-```{code-block} bash
-   :substitutions:
-
-vllm serve /home/pangu-pro-moe-mode/ \
---tensor-parallel-size 4 \
---enable-expert-parallel \
---dtype "float16" \
---trust-remote-code \
---enforce-eager
-
-```
-
-Once your server is started, you can query the model with input prompts.
-
-```bash
-export question="你是谁?"
-curl http://localhost:8000/v1/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-        "prompt": "[unused9]系统:[unused10][unused9]用户:'${question}'[unused10][unused9]助手:",
-        "max_completion_tokens": 64,
-        "top_p": 0.95,
-        "top_k": 50,
-        "temperature": 0.6
-    }'
-```
-
 ::::
 
 :::::
@@ -326,72 +283,6 @@ del llm
 clean_up()
 ```
 
-::::
-
-::::{tab-item} Pangu-Pro-MoE-72B
-:sync: pangu
-
-Download the model:
-
-```bash
-git lfs install
-git clone https://gitcode.com/ascend-tribe/pangu-pro-moe-model.git
-```
-
-```{code-block} python
-   :substitutions:
-
-import gc
-from transformers import AutoTokenizer
-import torch
-
-from vllm import LLM, SamplingParams
-from vllm.distributed.parallel_state import (destroy_distributed_environment,
-                                             destroy_model_parallel)
-
-def clean_up():
-    destroy_model_parallel()
-    destroy_distributed_environment()
-    gc.collect()
-    torch.npu.empty_cache()
-
-
-if __name__ == "__main__":
-
-    tokenizer = AutoTokenizer.from_pretrained("/home/pangu-pro-moe-mode/", trust_remote_code=True)
-    tests = [
-        "Hello, my name is",
-        "The future of AI is",
-    ]
-    prompts = []
-    for text in tests:
-        messages = [
-            {"role": "system", "content": ""},  # Optionally customize system content
-            {"role": "user", "content": text}
-        ]
-        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)  # 推荐使用官方的template
-        prompts.append(prompt)
-    sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40)
-
-    llm = LLM(model="/home/pangu-pro-moe-mode/",
-              tensor_parallel_size=8,
-              distributed_executor_backend="mp",
-              enable_expert_parallel=True,
-              dtype="float16",
-              max_model_len=1024,
-              trust_remote_code=True,
-              enforce_eager=True)
-
-    outputs = llm.generate(prompts, sampling_params)
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-    del llm
-    clean_up()
-```
-
 ::::
 
 :::::
diff --git a/vllm_ascend/_310p/worker_310p.py b/vllm_ascend/_310p/worker_310p.py
index acb75a2b..e8615b34 100644
--- a/vllm_ascend/_310p/worker_310p.py
+++ b/vllm_ascend/_310p/worker_310p.py
@@ -34,4 +34,4 @@ class NPUWorker310(NPUWorker):
 
     def _warm_up_atb(self):
         # 310p device do not support torch_npu._npu_matmul_add_fp32 atb ops
-        logger.info("Skip warm-up atb ops for 310P device")
+        logger.info("Skip warm-up atb ops for 310P device.")
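
Reviewer note: with the Pangu-Pro-MoE-72B tabs removed, the online-serving path left in this tutorial is the OpenAI-compatible `/v1/completions` endpoint shown in the retained `curl` context above. A minimal Python sketch of the same request follows; the host, port, model name, and sampling values are placeholders and must match whatever was passed to `vllm serve`.

```python
# Minimal sketch of querying the OpenAI-compatible completions endpoint that
# the retained curl example targets. Host, port, model name, and sampling
# values are assumptions; adjust them to match your `vllm serve` invocation.
import requests

payload = {
    "model": "Qwen/Qwen2.5-7B-Instruct",  # placeholder: must match the served model
    "prompt": "The future of AI is",
    "max_tokens": 64,
    "temperature": 0.6,
}
resp = requests.post("http://localhost:8000/v1/completions", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```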
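
Reviewer note: for context on the one-line logging change in `worker_310p.py`, here is a self-contained sketch of how the 310P override sits against its base class. The base-class body and the logging setup are stand-ins (the real `NPUWorker` and `logger` come from vllm / vllm-ascend); only the `NPUWorker310._warm_up_atb` override mirrors the diff.

```python
# Illustrative sketch only, not the vllm-ascend source: the base class and
# logger below are stand-ins so the override can be run in isolation.
import logging

logger = logging.getLogger(__name__)


class NPUWorker:
    """Stand-in for the real NPUWorker base class."""

    def _warm_up_atb(self):
        # On devices that support it, the base worker would warm up ATB ops here
        # (e.g. via torch_npu._npu_matmul_add_fp32); omitted in this sketch.
        logger.info("Warming up atb ops.")


class NPUWorker310(NPUWorker):
    """310P-specific worker, mirroring the override shown in the diff."""

    def _warm_up_atb(self):
        # 310P devices do not support the torch_npu._npu_matmul_add_fp32 ATB op,
        # so the warm-up step is skipped and a log line is emitted instead.
        logger.info("Skip warm-up atb ops for 310P device.")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    NPUWorker310()._warm_up_atb()  # logs the skip message instead of warming up
```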