support multistep decode (#299)

Add multi-step scheduler support for vllm-ascend

Signed-off-by: new-TonyWang <wangtonyyu222@gmail.com>
This commit is contained in:
Tony
2025-03-11 19:20:06 +08:00
committed by GitHub
parent feb6bdb12e
commit 4c9d78a035
5 changed files with 1067 additions and 10 deletions

View File

@@ -16,7 +16,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import torch
from vllm.logger import init_logger
logger = init_logger(__name__)
@@ -33,3 +33,23 @@ def try_register_lib(lib_name: str, lib_info: str = ""):
logger.info(lib_info)
except Exception:
pass
# Cached handle to the active NPU stream; populated lazily on first use.
_current_stream = None
def current_stream() -> torch.npu.Stream:
    """
    Drop-in replacement for `torch.npu.current_stream()`.

    Calling `torch.npu.current_stream()` is surprisingly costly because it
    builds a fresh stream object on every invocation. Instead, we cache the
    stream once and hand back the same object thereafter; `torch.npu.set_stream`
    is patched elsewhere to keep this cache in sync when the stream changes.
    """
    global _current_stream
    # Fast path: the cache is already primed.
    if _current_stream is not None:
        return _current_stream
    # First call before any stream was explicitly set: fall back to the
    # default stream and remember it.
    _current_stream = torch.npu.current_stream()
    return _current_stream