xc-llm-ascend/vllm_ascend/ops/vocab_parallel_embedding.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Tuple

import torch
from vllm.distributed import tensor_model_parallel_all_reduce
from vllm.model_executor.layers.vocab_parallel_embedding import \
    VocabParallelEmbedding


def get_masked_input_and_mask(
        input_: torch.Tensor, org_vocab_start_index: int,
        org_vocab_end_index: int, num_org_vocab_padding: int,
        added_vocab_start_index: int,
        added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Remap token ids to shard-local indices and flag ids outside the shard.

    An id is considered local when it falls in either half-open range
    ``[org_vocab_start_index, org_vocab_end_index)`` or
    ``[added_vocab_start_index, added_vocab_end_index)``.

    Args:
        input_: Tensor of token ids.
        org_vocab_start_index: First id of the original-vocab range.
        org_vocab_end_index: One past the last id of the original-vocab range.
        num_org_vocab_padding: Number of padding slots that follow the
            original-vocab rows; added-vocab ids are placed after them.
        added_vocab_start_index: First id of the added-vocab range.
        added_vocab_end_index: One past the last id of the added-vocab range.

    Returns:
        A ``(local_ids, outside_mask)`` pair. ``local_ids`` holds the
        remapped index for ids inside either range and 0 everywhere else.
        ``outside_mask`` is True exactly where the id is in neither range.
    """
    # Scalar bookkeeping first: ids from the added range are shifted so they
    # land right after the original-vocab rows and their padding.
    org_span = org_vocab_end_index - org_vocab_start_index
    added_shift = added_vocab_start_index - org_span - num_org_vocab_padding

    # Everything below is a pointwise tensor op, which lets torch.compile
    # fuse the whole computation into a single, very fast kernel.
    in_org_range = (input_ >= org_vocab_start_index) & (
        input_ < org_vocab_end_index)
    in_added_range = (input_ >= added_vocab_start_index) & (
        input_ < added_vocab_end_index)
    in_shard = in_org_range | in_added_range

    # Per-element amount to subtract; it is 0 where neither range matches.
    per_id_shift = (org_vocab_start_index *
                    in_org_range) + (added_shift * in_added_range)

    # Multiplying by the mask forces ids outside the shard to index 0.
    local_ids = in_shard * (input_ - per_id_shift)
    return local_ids, ~in_shard


def vocab_parallel_embedding_forward(self, input_):
    """Forward pass used as a replacement for ``VocabParallelEmbedding.forward``.

    When ``self.tp_size`` is greater than 1, the ids are first remapped with
    ``get_masked_input_and_mask`` using the bounds stored in
    ``self.shard_indices``, and the embedding rows of ids flagged by the
    returned mask are zeroed in place after the lookup. Otherwise the ids
    are looked up unchanged. In both cases the lookup result is passed
    through ``tensor_model_parallel_all_reduce``.

    Args:
        input_: Tensor of token ids.

    Returns:
        The result of ``tensor_model_parallel_all_reduce`` applied to the
        (possibly masked) embedding lookup.
    """
    is_sharded = self.tp_size > 1

    if is_sharded:
        # Translate ids to shard-local indices and remember which ids do
        # not belong to this shard.
        bounds = self.shard_indices
        local_ids, outside_shard = get_masked_input_and_mask(
            input_,
            bounds.org_vocab_start_index,
            bounds.org_vocab_end_index,
            bounds.num_org_vocab_padding,
            bounds.added_vocab_start_index,
            bounds.added_vocab_end_index,
        )
    else:
        local_ids = input_

    # Embedding lookup; the indices are cast to int64 before the call.
    partial_embeddings = self.quant_method.embedding(self, local_ids.long())

    if is_sharded:
        # Zero the rows of ids outside this shard so they add nothing to
        # the reduction below.
        partial_embeddings.masked_fill_(outside_shard.unsqueeze(-1), 0)

    # Combine the partial results across the tensor-model-parallel group.
    return tensor_model_parallel_all_reduce(partial_embeddings)


# Monkey-patch: install the function defined above as the ``forward`` method
# of the imported VocabParallelEmbedding class. This is a module-level
# statement, so the override takes effect as soon as this module is imported.
VocabParallelEmbedding.forward = vocab_parallel_embedding_forward
port deepseekv2 and mtp to main branch (#429) ### What this PR does / why we need it? This PR ports all the deepseek graph mode code and mtp code from v0.7.3 to the main branch --------- Signed-off-by: SidaoY <1024863041@qq.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: Yizhou Liu <liuyizhou5@h-partners.com> Signed-off-by: mengwei805 <mengwei25@huawei.com> Signed-off-by: libaokui <libaokui@huawei.com> Signed-off-by: q00832892 <qiaoyang19@huawei.com> Signed-off-by: ganyi <pleaplusone.gy@gmail.com> Co-authored-by: SidaoY <1024863041@qq.com> Co-authored-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: Yizhou Liu <liuyizhou5@h-partners.com> Co-authored-by: mengwei805 <mengwei25@huawei.com> Co-authored-by: libaokui <libaokui@huawei.com> 2025-04-19 17:38:18 +08:00			`#`
			`# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.`
			`# This file is a part of the vllm-ascend project.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`from typing import Tuple`

			`import torch`
			`from vllm.distributed import tensor_model_parallel_all_reduce`
			`from vllm.model_executor.layers.vocab_parallel_embedding import \`
			`VocabParallelEmbedding`


			`def get_masked_input_and_mask(`
			`input_: torch.Tensor, org_vocab_start_index: int,`
			`org_vocab_end_index: int, num_org_vocab_padding: int,`
			`added_vocab_start_index: int,`
			`added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:`
			`# torch.compile will fuse all of the pointwise ops below`
			`# into a single kernel, making it very fast`
[1/N][CI] Move linting system to pre-commits hooks (#1256) ### What this PR does / why we need it? Follow vllm-project/vllm lint way: https://github.com/vllm-project/vllm/blob/main/.pre-commit-config.yaml Enable pre-commit to avoid some low level error AMAP. This pr is one step of #1241, The purpose is make linting system more clear and convenient, on this step, Mainly did the following things: yapf, actionlint, ruff, typos, isort, mypy, png-lint, signoff-commit, enforce-import-regex-instead-of-re. TODO: - clang-format(check for csrc with google style) need clean code, disable for now - pymarkdown need clean code, disable for now - shellcheck need clean code, disable for now ### Does this PR introduce _any_ user-facing change? Only developer UX change: https://vllm-ascend--1256.org.readthedocs.build/en/1256/developer_guide/contributing.html#run-lint-locally ``` pip install -r requirements-lint.txt && pre-commit install bash format.sh ``` ### How was this patch tested? CI passed with new added/existing test. Co-authored-by: Yikun [yikunkero@gmail.com](mailto:yikunkero@gmail.com) Co-authored-by: wangli [wangli858794774@gmail.com](mailto:wangli858794774@gmail.com) - vLLM version: v0.9.1 - vLLM main: https://github.com/vllm-project/vllm/commit/5358cce5ffbd4011f8fea2188995a249b43b8bfe --------- Signed-off-by: wangli <wangli858794774@gmail.com> 2025-07-10 14:17:15 +08:00			`org_vocab_mask = (input_ >= org_vocab_start_index) & (`
			`input_ < org_vocab_end_index)`
port deepseekv2 and mtp to main branch (#429) ### What this PR does / why we need it? This PR ports all the deepseek graph mode code and mtp code from v0.7.3 to the main branch --------- Signed-off-by: SidaoY <1024863041@qq.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: Yizhou Liu <liuyizhou5@h-partners.com> Signed-off-by: mengwei805 <mengwei25@huawei.com> Signed-off-by: libaokui <libaokui@huawei.com> Signed-off-by: q00832892 <qiaoyang19@huawei.com> Signed-off-by: ganyi <pleaplusone.gy@gmail.com> Co-authored-by: SidaoY <1024863041@qq.com> Co-authored-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: Yizhou Liu <liuyizhou5@h-partners.com> Co-authored-by: mengwei805 <mengwei25@huawei.com> Co-authored-by: libaokui <libaokui@huawei.com> 2025-04-19 17:38:18 +08:00			`added_vocab_mask = (input_ >= added_vocab_start_index) & (`
			`input_ < added_vocab_end_index)`
			`added_offset = added_vocab_start_index - (`
			`org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding`
			`valid_offset = (org_vocab_start_index *`
			`org_vocab_mask) + (added_offset * added_vocab_mask)`
			`vocab_mask = org_vocab_mask \| added_vocab_mask`
			`input_ = vocab_mask * (input_ - valid_offset)`
			`return input_, ~vocab_mask`


			`def vocab_parallel_embedding_forward(self, input_):`
			`if self.tp_size > 1:`
			`# Build the mask.`
			`masked_input, input_mask = get_masked_input_and_mask(`
			`input_, self.shard_indices.org_vocab_start_index,`
			`self.shard_indices.org_vocab_end_index,`
			`self.shard_indices.num_org_vocab_padding,`
			`self.shard_indices.added_vocab_start_index,`
			`self.shard_indices.added_vocab_end_index)`
			`else:`
			`masked_input = input_`
			`# Get the embeddings.`
			`output_parallel = self.quant_method.embedding(self, masked_input.long())`
			`# Mask the output embedding.`
			`if self.tp_size > 1:`
			`output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)`
			`# Reduce across all the model parallel GPUs.`
			`output = tensor_model_parallel_all_reduce(output_parallel)`
			`return output`


			`VocabParallelEmbedding.forward = vocab_parallel_embedding_forward`