xc-llm-ascend/vllm_ascend/ops/vocab_parallel_embedding.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Tuple

import torch
from vllm.distributed import tensor_model_parallel_all_reduce
from vllm.model_executor.layers.vocab_parallel_embedding import \
    VocabParallelEmbedding


def get_masked_input_and_mask(
        input_: torch.Tensor, org_vocab_start_index: int,
        org_vocab_end_index: int, num_org_vocab_padding: int,
        added_vocab_start_index: int,
        added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
    # torch.compile will fuse all of the pointwise ops below
    # into a single kernel, making it very fast
    org_vocab_mask = (input_ >= org_vocab_start_index) & (
        input_ < org_vocab_end_index)
    # Adapt: avoid create added_vocab_mask when added_vocab_start_index == added_vocab_end_index.
    if added_vocab_start_index == added_vocab_end_index:
        valid_offset = (org_vocab_start_index * org_vocab_mask)
        vocab_mask = org_vocab_mask
    else:
        added_vocab_mask = (input_ >= added_vocab_start_index) & (
            input_ < added_vocab_end_index)
        added_offset = added_vocab_start_index - (
            org_vocab_end_index -
            org_vocab_start_index) - num_org_vocab_padding
        valid_offset = (org_vocab_start_index *
                        org_vocab_mask) + (added_offset * added_vocab_mask)
        vocab_mask = org_vocab_mask | added_vocab_mask
    # Adapt end.
    input_ = vocab_mask * (input_ - valid_offset)
    return input_, ~vocab_mask


def vocab_parallel_embedding_forward(self, input_):
    if self.tp_size > 1:
        # Build the mask.
        masked_input, input_mask = get_masked_input_and_mask(
            input_, self.shard_indices.org_vocab_start_index,
            self.shard_indices.org_vocab_end_index,
            self.shard_indices.num_org_vocab_padding,
            self.shard_indices.added_vocab_start_index,
            self.shard_indices.added_vocab_end_index)
    else:
        masked_input = input_
    # Get the embeddings.
    output_parallel = self.quant_method.embedding(self, masked_input.long())
    # Mask the output embedding.
    if self.tp_size > 1:
        output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)
    # Reduce across all the model parallel GPUs.
    output = tensor_model_parallel_all_reduce(output_parallel)
    return output


VocabParallelEmbedding.forward = vocab_parallel_embedding_forward
port deepseekv2 and mtp to main branch (#429) ### What this PR does / why we need it? This PR ports all the deepseek graph mode code and mtp code from v0.7.3 to the main branch --------- Signed-off-by: SidaoY <1024863041@qq.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: Yizhou Liu <liuyizhou5@h-partners.com> Signed-off-by: mengwei805 <mengwei25@huawei.com> Signed-off-by: libaokui <libaokui@huawei.com> Signed-off-by: q00832892 <qiaoyang19@huawei.com> Signed-off-by: ganyi <pleaplusone.gy@gmail.com> Co-authored-by: SidaoY <1024863041@qq.com> Co-authored-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: Yizhou Liu <liuyizhou5@h-partners.com> Co-authored-by: mengwei805 <mengwei25@huawei.com> Co-authored-by: libaokui <libaokui@huawei.com> 2025-04-19 17:38:18 +08:00			`#`
			`# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.`
			`# This file is a part of the vllm-ascend project.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`from typing import Tuple`

			`import torch`
			`from vllm.distributed import tensor_model_parallel_all_reduce`
			`from vllm.model_executor.layers.vocab_parallel_embedding import \`
			`VocabParallelEmbedding`


			`def get_masked_input_and_mask(`
			`input_: torch.Tensor, org_vocab_start_index: int,`
			`org_vocab_end_index: int, num_org_vocab_padding: int,`
			`added_vocab_start_index: int,`
			`added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:`
			`# torch.compile will fuse all of the pointwise ops below`
			`# into a single kernel, making it very fast`
[1/N][CI] Move linting system to pre-commits hooks (#1256) ### What this PR does / why we need it? Follow vllm-project/vllm lint way: https://github.com/vllm-project/vllm/blob/main/.pre-commit-config.yaml Enable pre-commit to avoid some low level error AMAP. This pr is one step of #1241, The purpose is make linting system more clear and convenient, on this step, Mainly did the following things: yapf, actionlint, ruff, typos, isort, mypy, png-lint, signoff-commit, enforce-import-regex-instead-of-re. TODO: - clang-format(check for csrc with google style) need clean code, disable for now - pymarkdown need clean code, disable for now - shellcheck need clean code, disable for now ### Does this PR introduce _any_ user-facing change? Only developer UX change: https://vllm-ascend--1256.org.readthedocs.build/en/1256/developer_guide/contributing.html#run-lint-locally ``` pip install -r requirements-lint.txt && pre-commit install bash format.sh ``` ### How was this patch tested? CI passed with new added/existing test. Co-authored-by: Yikun [yikunkero@gmail.com](mailto:yikunkero@gmail.com) Co-authored-by: wangli [wangli858794774@gmail.com](mailto:wangli858794774@gmail.com) - vLLM version: v0.9.1 - vLLM main: https://github.com/vllm-project/vllm/commit/5358cce5ffbd4011f8fea2188995a249b43b8bfe --------- Signed-off-by: wangli <wangli858794774@gmail.com> 2025-07-10 14:17:15 +08:00			`org_vocab_mask = (input_ >= org_vocab_start_index) & (`
			`input_ < org_vocab_end_index)`
[V1] MTP supports torchair (#2145) ### What this PR does / why we need it? Support MTP with： - [x] V0 Scheduler - [x] TorchAir - [x] Single DP - [x] Multi DP - [x] Disaggregate PD Known issues： - [ ] Not support V1 Scheduler (chunked prefill), will be supported in a few weeks - [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now, need to comment out the line 171-175 in file `vllm/vllm/v1/metrics/loggers.py` ``` if (len(self.engine_indexes) > 1 and vllm_config.speculative_config is not None): raise NotImplementedError("Prometheus metrics with Spec Decoding " "with >1 EngineCore per AsyncLLM is not " "supported yet.") ``` To start an online server with torchair enabled, here is an example: ``` python -m vllm.entrypoints.openai.api_server \ --model="/weights/DeepSeek-R1_w8a8/" \ --trust-remote-code \ --max-model-len 40000 \ --tensor-parallel-size 4 \ --data_parallel_size 4 \ --max-num-seqs 16 \ --no-enable-prefix-caching \ --enable_expert_parallel \ --served-model-name deepseekr1 \ --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \ --quantization ascend \ --host 0.0.0.0 \ --port 1234 \ --additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \ --gpu_memory_utilization 0.9 ``` offline example with torchair enabled ``` from vllm import LLM, SamplingParams prompts = [ "Hello, my name is", "The president of the United States is", "The capital of France is", "The future of AI is", ] # Create a sampling params object. sampling_params = SamplingParams(max_tokens=16, temperature=0) # Create an LLM. llm = LLM( model="/home/data/DeepSeek-R1_w8a8/", tensor_parallel_size=16, max_num_seqs=16, gpu_memory_utilization=0.9, distributed_executor_backend="mp", enable_expert_parallel=True, speculative_config={ "method": "deepseek_mtp", "num_speculative_tokens": 1, }, trust_remote_code=True, enforce_eager=False, max_model_len=2000, additional_config = { 'torchair_graph_config': { 'enabled': True, "graph_batch_sizes": [16], 'enable_multistream_shared_expert': False, }, "ascend_scheduler_config": { "enabled": True }, # 'expert_tensor_parallel_size': 16, } ) # Generate texts from the prompts. # llm.start_profile() outputs = llm.generate(prompts, sampling_params) # llm.stop_profile() for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/302962e806e9820643ae25987e8e38ed035e05d3 --------- Signed-off-by: xuyexiong <xuyexiong@huawei.com> 2025-08-06 19:37:43 +08:00			`# Adapt: avoid create added_vocab_mask when added_vocab_start_index == added_vocab_end_index.`
			`if added_vocab_start_index == added_vocab_end_index:`
			`valid_offset = (org_vocab_start_index * org_vocab_mask)`
			`vocab_mask = org_vocab_mask`
			`else:`
			`added_vocab_mask = (input_ >= added_vocab_start_index) & (`
			`input_ < added_vocab_end_index)`
			`added_offset = added_vocab_start_index - (`
			`org_vocab_end_index -`
			`org_vocab_start_index) - num_org_vocab_padding`
			`valid_offset = (org_vocab_start_index *`
			`org_vocab_mask) + (added_offset * added_vocab_mask)`
			`vocab_mask = org_vocab_mask \| added_vocab_mask`
			`# Adapt end.`
port deepseekv2 and mtp to main branch (#429) ### What this PR does / why we need it? This PR ports all the deepseek graph mode code and mtp code from v0.7.3 to the main branch --------- Signed-off-by: SidaoY <1024863041@qq.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: Yizhou Liu <liuyizhou5@h-partners.com> Signed-off-by: mengwei805 <mengwei25@huawei.com> Signed-off-by: libaokui <libaokui@huawei.com> Signed-off-by: q00832892 <qiaoyang19@huawei.com> Signed-off-by: ganyi <pleaplusone.gy@gmail.com> Co-authored-by: SidaoY <1024863041@qq.com> Co-authored-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: Yizhou Liu <liuyizhou5@h-partners.com> Co-authored-by: mengwei805 <mengwei25@huawei.com> Co-authored-by: libaokui <libaokui@huawei.com> 2025-04-19 17:38:18 +08:00			`input_ = vocab_mask * (input_ - valid_offset)`
			`return input_, ~vocab_mask`


			`def vocab_parallel_embedding_forward(self, input_):`
			`if self.tp_size > 1:`
			`# Build the mask.`
			`masked_input, input_mask = get_masked_input_and_mask(`
			`input_, self.shard_indices.org_vocab_start_index,`
			`self.shard_indices.org_vocab_end_index,`
			`self.shard_indices.num_org_vocab_padding,`
			`self.shard_indices.added_vocab_start_index,`
			`self.shard_indices.added_vocab_end_index)`
			`else:`
			`masked_input = input_`
			`# Get the embeddings.`
			`output_parallel = self.quant_method.embedding(self, masked_input.long())`
			`# Mask the output embedding.`
			`if self.tp_size > 1:`
			`output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)`
			`# Reduce across all the model parallel GPUs.`
			`output = tensor_model_parallel_all_reduce(output_parallel)`
			`return output`


			`VocabParallelEmbedding.forward = vocab_parallel_embedding_forward`