init

This commit is contained in:

124
torch_vacc/__init__.py
Normal file
@@ -0,0 +1,124 @@
import atexit
import ctypes
import os
import sys
import types

import torch
import torch.distributed

from .version import __version__


def register_runtime_libraries() -> None:
    try:
        libpython_so = f"libpython{sys.version_info.major}.{sys.version_info.minor}.so"
        base_prefix = getattr(sys, "base_prefix", sys.prefix)
        if not base_prefix.startswith("/usr"):  # like conda or virtualenv
            ctypes.CDLL(os.path.join(base_prefix, "lib", libpython_so))

        this_path = os.path.dirname(os.path.realpath(__file__))
        rt_dll_dpath = os.path.join(this_path, "_vacc_libs")
        ctypes.CDLL(os.path.join(rt_dll_dpath, "libodsp.so"))
        ctypes.CDLL(os.path.join(rt_dll_dpath, "libvaccrt.so"))
        ctypes.CDLL(os.path.join(rt_dll_dpath, "libvnnl.so"))
        ctypes.CDLL(os.path.join(rt_dll_dpath, "libvccl.so"))
        ctypes.CDLL(os.path.join(rt_dll_dpath, "libvacc_core.so"))
    except Exception as e:
        raise RuntimeError("Vastai runtime library not loaded.") from e


register_runtime_libraries()

from ._vacc_libs import _torch_vacc as _C

try:
    _C._init_torch_vacc_module()
except Exception as e:
    raise RuntimeError("Failed to init torch_vacc.") from e


def _apply_patches(monkey_patches):
    def _getattr(module_list, root_module=torch):
        if len(module_list) <= 1:
            return root_module

        if hasattr(root_module, module_list[0]):
            return _getattr(module_list[1:], getattr(root_module, module_list[0]))
        else:
            empty_module_name = f"{root_module.__name__}.{module_list[0]}"
            sys.modules[empty_module_name] = types.ModuleType(empty_module_name)
            setattr(root_module, module_list[0], sys.modules.get(empty_module_name))
            return _getattr(module_list[1:], getattr(root_module, module_list[0]))

    for patch_pair in monkey_patches:
        dest, patch = patch_pair
        dest_module = _getattr(dest.split("."), root_module=torch)
        last_module_level = dest.split(".")[-1]
        if not isinstance(patch, types.ModuleType):
            setattr(dest_module, last_module_level, patch)
            continue

        if not hasattr(dest_module, last_module_level) or not hasattr(patch, "__all__"):
            setattr(dest_module, last_module_level, patch)
            sys.modules[f"{dest_module.__name__}.{last_module_level}"] = patch
            continue

        assert hasattr(patch, "__all__"), "Patch module must have __all__ definition."
        dest_module = getattr(dest_module, last_module_level)
        for attr in patch.__all__:
            setattr(dest_module, attr, getattr(patch, attr))
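
# Illustrative sketch (not part of the original commit): _apply_patches takes
# (dotted-path, replacement) pairs rooted at `torch`. A hypothetical module
# patch with __all__ has only its listed attributes copied over:
#
#   fake = types.ModuleType("fake_amp")      # hypothetical replacement module
#   fake.autocast = my_autocast
#   fake.__all__ = ["autocast"]
#   _apply_patches([["cuda.amp", fake]])     # copies fake.autocast onto torch.cuda.amp
#
# whereas a non-module patch, e.g. _apply_patches([["cuda.is_available", lambda: True]]),
# is assigned directly onto the parent (here torch.cuda).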

import torch_vacc.vacc as vacc

# register "vacc" module/functions to torch
torch._register_device_module("vacc", vacc)
unsupported_dtype = [
    torch.quint8,
    torch.quint4x2,
    torch.quint2x4,
    torch.qint32,
    torch.qint8,
]
torch.utils.generate_methods_for_privateuse1_backend(
    for_tensor=True,
    for_module=True,
    for_storage=True,  # TODO(qingsong): do we support storage?
    unsupported_dtype=unsupported_dtype,
)

# register legacy *DtypeTensor into torch.vacc
_C._initialize_python_bindings()

# init seed generators, vacc default generator
vacc.init()


def is_vccl_available() -> bool:
    return True


torch.distributed.is_vccl_available = is_vccl_available


def set_global_log_level(log_level):
    _C.set_global_log_level(log_level.upper())


def print_vacc_ops():
    _C._print_vacc_ops()


def vacc_ops_list():
    return _C._vacc_ops_list().split(",")


def print_vacc_selective_ops():
    _C._print_vacc_selective_ops()


def _vacc_shutdown():
    _C._vacc_module_shutdown()


atexit.register(_vacc_shutdown)
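
A quick smoke test of the module above (an illustrative sketch, not part of the commit; assumes at least one VACC device is present and the runtime libraries load):

    import torch
    import torch_vacc

    if torch.vacc.is_available():
        x = torch.ones(2, 2).vacc()   # Tensor.vacc() comes from generate_methods_for_privateuse1_backend
        y = x + x                     # dispatched to the vacc backend
        print(y.device)               # e.g. vacc:0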
BIN
torch_vacc/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/__pycache__/version.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/_vacc_libs/_torch_vacc.cpython-312-x86_64-linux-gnu.so
Executable file
Binary file not shown.
BIN
torch_vacc/_vacc_libs/libfn-log.so
Executable file
Binary file not shown.
BIN
torch_vacc/_vacc_libs/libodsp.so
Executable file
Binary file not shown.
BIN
torch_vacc/_vacc_libs/libvacc_core.so
Executable file
Binary file not shown.
BIN
torch_vacc/_vacc_libs/libvaccrt.so
Executable file
Binary file not shown.
BIN
torch_vacc/_vacc_libs/libvccl.so
Executable file
Binary file not shown.
BIN
torch_vacc/_vacc_libs/libvnnl.so
Executable file
Binary file not shown.
0
torch_vacc/contrib/__init__.py
Normal file
BIN
torch_vacc/contrib/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/contrib/__pycache__/transfer_to_vacc.cpython-312.pyc
Normal file
Binary file not shown.
398
torch_vacc/contrib/transfer_to_vacc.py
Normal file
@@ -0,0 +1,398 @@
import os
import warnings
import logging as logger
from functools import wraps
import torch
import torch_vacc

'''
try:
    import torchair
except ImportError:
    IS_TORCHAIR_INSTALLED = False
else:
    IS_TORCHAIR_INSTALLED = True
'''

warnings.filterwarnings(action="once")


torch_fn_white_list = [
    "_cudnn_init_dropout_state",
    "_empty_affine_quantized",
    "_empty_per_channel_affine_quantized",
    "_pin_memory",
    "_sparse_coo_tensor_unsafe",
    "_sparse_csr_tensor_unsafe",
    "logspace",
    "randint",
    "hann_window",
    "rand",
    "full_like",
    "ones_like",
    "rand_like",
    "randperm",
    "arange",
    "frombuffer",
    "normal",
    "empty_strided",
    "empty_like",
    "scalar_tensor",
    "tril_indices",
    "bartlett_window",
    "ones",
    "sparse_coo_tensor",
    "randn",
    "kaiser_window",
    "tensor",
    "triu_indices",
    "as_tensor",
    "zeros",
    "randint_like",
    "full",
    "eye",
    "empty",
    "blackman_window",
    "zeros_like",
    "range",
    "sparse_csr_tensor",
    "randn_like",
    "from_file",
    "linspace",
    "hamming_window",
    "empty_quantized",
    "autocast",
    "load",
]
torch_tensor_fn_white_list = [
    "new_empty",
    "new_empty_strided",
    "new_full",
    "new_ones",
    "new_tensor",
    "new_zeros",
    "to",
]
torch_module_fn_white_list = ["to", "to_empty"]
torch_cuda_fn_white_list = [
    "get_device_properties",
    "get_device_name",
    "get_device_capability",
    "list_gpu_processes",
    "set_device",
    "synchronize",
    "mem_get_info",
    "memory_stats",
    "memory_summary",
    "memory_allocated",
    "max_memory_allocated",
    "reset_max_memory_allocated",
    "memory_reserved",
    "max_memory_reserved",
    "reset_max_memory_cached",
    "reset_peak_memory_stats",
    "current_stream",
    "default_stream",
]
torch_profiler_fn_white_list = ["profile"]
torch_distributed_fn_white_list = ["__init__"]
device_kwargs_list = ["device", "device_type", "map_location"]


def wrapper_cuda(fn):
    @wraps(fn)
    def decorated(*args, **kwargs):
        replace_int = fn.__name__ in ["to", "to_empty"]
        if args:
            args_new = list(args)
            args = replace_cuda_to_vacc_in_list(args_new, replace_int)
        if kwargs:
            for device_arg in device_kwargs_list:
                device = kwargs.get(device_arg, None)
                if device is not None:
                    replace_cuda_to_vacc_in_kwargs(kwargs, device_arg, device)
            device_ids = kwargs.get("device_ids", None)
            if type(device_ids) == list:
                device_ids = replace_cuda_to_vacc_in_list(device_ids, replace_int)
        return fn(*args, **kwargs)

    return decorated


def replace_cuda_to_vacc_in_kwargs(kwargs, device_arg, device):
    if type(device) == str and "cuda" in device:
        kwargs[device_arg] = device.replace("cuda", "vacc")
    elif type(device) == torch.device and "cuda" in device.type:
        device_info = (
            "vacc:{}".format(device.index) if device.index is not None else "vacc"
        )
        kwargs[device_arg] = torch.device(device_info)
    elif type(device) == int:
        kwargs[device_arg] = f"vacc:{device}"
    elif type(device) == dict:
        kwargs[device_arg] = replace_cuda_to_vacc_in_dict(device)


def replace_cuda_to_vacc_in_list(args_list, replace_int):
    for idx, arg in enumerate(args_list):
        if isinstance(arg, str) and "cuda" in arg:
            args_list[idx] = arg.replace("cuda", "vacc")
        elif isinstance(arg, torch.device) and "cuda" in arg.type:
            device_info = (
                "vacc:{}".format(arg.index) if arg.index is not None else "vacc"
            )
            args_list[idx] = torch.device(device_info)
        elif replace_int and not isinstance(arg, bool) and isinstance(arg, int):
            args_list[idx] = f"vacc:{arg}"
        elif isinstance(arg, dict):
            args_list[idx] = replace_cuda_to_vacc_in_dict(arg)
    return args_list


def replace_cuda_to_vacc_in_dict(device_dict):
    new_dict = {}
    for key, value in device_dict.items():
        if isinstance(key, str):
            key = key.replace("cuda", "vacc")
        if isinstance(value, str):
            value = value.replace("cuda", "vacc")
        new_dict[key] = value
    return new_dict


def device_wrapper(enter_fn, white_list):
    for fn_name in white_list:
        fn = getattr(enter_fn, fn_name, None)
        if fn:
            setattr(enter_fn, fn_name, wrapper_cuda(fn))
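
# Illustrative example (not part of the original commit): after
# device_wrapper(torch, torch_fn_white_list), CUDA-targeted factory calls are
# transparently redirected, e.g.
#
#   t = torch.zeros(4, device="cuda:0")   # actually allocated on "vacc:0"
#   m = model.to(torch.device("cuda"))    # moved to "vacc"
#
# Bare integer arguments are rewritten only for Tensor.to / Module.to_empty
# (replace_int above), presumably because an int positionally means a device
# only for those functions.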

def wrapper_vccl(fn):
    @wraps(fn)
    def decorated(*args, **kwargs):
        if args:
            args_new = list(args)
            for idx, arg in enumerate(args_new):
                if type(arg) == str and "nccl" in arg:
                    args_new[idx] = arg.replace("nccl", "vccl")
            args = args_new
        if kwargs:
            if type(kwargs.get("backend", None)) == str:
                kwargs["backend"] = "vccl"
        return fn(*args, **kwargs)

    return decorated


def wrapper_data_loader(fn):
    @wraps(fn)
    def decorated(*args, **kwargs):
        if kwargs:
            pin_memory = kwargs.get("pin_memory", False)
            pin_memory_device = kwargs.get("pin_memory_device", None)
            if pin_memory and not pin_memory_device:
                kwargs["pin_memory_device"] = "vacc"
            if (
                pin_memory
                and type(pin_memory_device) == str
                and "cuda" in pin_memory_device
            ):
                kwargs["pin_memory_device"] = pin_memory_device.replace("cuda", "vacc")
        return fn(*args, **kwargs)

    return decorated


def wrapper_get_available_device_type(fn):
    @wraps(fn)
    def decorated(*args, **kwargs):
        try:
            if torch.vacc.is_available():
                return "vacc"
        except Exception:
            msg = "vacc device is not available."
            warnings.warn(msg, RuntimeWarning)
        return fn(*args, **kwargs)

    return decorated


'''
def wrapper_profiler(fn):
    @wraps(fn)
    def decorated(*args, **kwargs):
        if kwargs:
            if (
                "experimental_config" in kwargs.keys()
                and type(kwargs.get("experimental_config"))
                != torch_vacc.profiler._ExperimentalConfig
            ):
                logger.warning(
                    "The experimental_config parameter of torch.profiler.profile has been removed by the tool "
                    "because it can only be used with cuda; please manually modify the code "
                    "and use the experimental_config parameter adapted to vacc."
                )
                del kwargs["experimental_config"]
        return fn(*args, **kwargs)

    return decorated


def wrapper_compile(fn):
    @wraps(fn)
    def decorated(*args, **kwargs):
        vacc_backend = torchair.get_vacc_backend()
        if kwargs:
            backend = kwargs.get("backend", None)
            if (
                not backend
                or not isinstance(backend, functools.partial)
                or not isinstance(backend.func, type(vacc_backend.func))
            ):
                kwargs["backend"] = vacc_backend
            else:
                kwargs["backend"] = vacc_backend
        return fn(*args, **kwargs)

    return decorated
'''


def jit_script(obj, optimize=None, _frames_up=0, _rcb=None, example_inputs=None):
    msg = "torch.jit.script will be disabled by transfer_to_vacc, which currently does not support it."
    warnings.warn(msg, RuntimeWarning)
    return obj


def patch_cuda():
    patchs = [
        ["cuda", torch_vacc.vacc],
        ["cuda.amp", torch_vacc.vacc.amp],
        ["cuda.random", torch_vacc.vacc.random],
        ["cuda.amp.autocast_mode", torch_vacc.vacc.amp.autocast_mode],
        ["cuda.amp.common", torch_vacc.vacc.amp.common],
        ["cuda.amp.grad_scaler", torch_vacc.vacc.amp.grad_scaler],
    ]
    torch_vacc._apply_patches(patchs)


'''
def patch_profiler():
    patchs = [
        ["profiler.profile", torch_vacc.profiler.profile],
        ["profiler.schedule", torch_vacc.profiler.schedule],
        [
            "profiler.tensorboard_trace_handler",
            torch_vacc.profiler.tensorboard_trace_handler,
        ],
        ["profiler.ProfilerAction", torch_vacc.profiler.ProfilerAction],
        ["profiler.ProfilerActivity.CUDA", torch_vacc.profiler.ProfilerActivity.VACC],
        ["profiler.ProfilerActivity.CPU", torch_vacc.profiler.ProfilerActivity.CPU],
    ]
    torch_vacc._apply_patches(patchs)
'''


def warning_fn(msg, rank0=True):
    is_distributed = (
        torch.distributed.is_available()
        and torch.distributed.is_initialized()
        and torch.distributed.get_world_size() > 1
    )
    env_rank = os.getenv("RANK", None)

    if rank0 and is_distributed:
        if torch.distributed.get_rank() == 0:
            warnings.warn(msg, ImportWarning)
    elif rank0 and env_rank:
        if env_rank == "0":
            warnings.warn(msg, ImportWarning)
    else:
        warnings.warn(msg, ImportWarning)


def init():
    warning_fn(
        """
*************************************************************************************************************
torch.Tensor.cuda and torch.nn.Module.cuda are now replaced with torch.Tensor.vacc and torch.nn.Module.vacc.
torch.cuda.DoubleTensor is replaced with torch.vacc.FloatTensor because the double type is not supported yet.
The backend in torch.distributed.init_process_group is now set to vccl.
torch.cuda.* and torch.cuda.amp.* are replaced with torch.vacc.* and torch.vacc.amp.*.
The device parameters have been replaced with vacc in the functions below:
{}
If you notice that any function you use is not included in the list above, feel free to contact the torch-vacc development team.
*************************************************************************************************************
""".format(
            ", ".join(
                ["torch." + i for i in torch_fn_white_list]
                + ["torch.Tensor." + i for i in torch_tensor_fn_white_list]
                + ["torch.nn.Module." + i for i in torch_module_fn_white_list]
            )
        )
    )

    # torch.cuda.*
    patch_cuda()
    device_wrapper(torch.cuda, torch_cuda_fn_white_list)

    # torch.profiler.*
    # TODO(qingsong): profiler not implemented yet
    # patch_profiler()
    # device_wrapper(torch.profiler, torch_profiler_fn_white_list)

    # torch.*
    device_wrapper(torch, torch_fn_white_list)

    # torch.Tensor.*
    device_wrapper(torch.Tensor, torch_tensor_fn_white_list)
    torch.Tensor.cuda = torch.Tensor.vacc
    torch.Tensor.is_cuda = torch.Tensor.is_vacc

    for dtype_tensor in [
        "ByteTensor",
        "CharTensor",
        "DoubleTensor",
        "FloatTensor",
        "IntTensor",
        "LongTensor",
        "ShortTensor",
        "HalfTensor",
        "BoolTensor",
    ]:
        setattr(
            torch.cuda,
            dtype_tensor,
            getattr(torch.vacc, dtype_tensor),
        )
    # TODO(qingsong): do we need this? should we add LongTensor=IntTensor?
    torch.cuda.DoubleTensor = torch.vacc.FloatTensor

    # torch.nn.Module.*
    device_wrapper(torch.nn.Module, torch_module_fn_white_list)
    torch.nn.Module.cuda = torch.nn.Module.vacc

    # torch.distributed.init_process_group
    torch.distributed.init_process_group = wrapper_vccl(
        torch.distributed.init_process_group
    )
    torch.distributed.is_nccl_available = torch.distributed.is_vccl_available

    # torch.nn.parallel.DistributedDataParallel
    device_wrapper(
        torch.nn.parallel.DistributedDataParallel, torch_distributed_fn_white_list
    )
    # torch.utils.data.DataLoader
    torch.utils.data.DataLoader.__init__ = wrapper_data_loader(
        torch.utils.data.DataLoader.__init__
    )

    torch.jit.script = jit_script
    torch._utils._get_available_device_type = wrapper_get_available_device_type(
        torch._utils._get_available_device_type
    )


'''
if IS_TORCHAIR_INSTALLED:
    torch.compile = wrapper_compile(torch.compile)
'''


init()
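
Because the module calls init() at import time, adopting it in an existing CUDA script is a one-line change. A minimal sketch (illustrative, not part of the commit; assumes a VACC device is available):

    import torch
    import torch_vacc
    import torch_vacc.contrib.transfer_to_vacc  # patches torch.cuda.* on import

    x = torch.randn(8, device="cuda")  # transparently allocated on "vacc"
    torch.cuda.synchronize()           # forwarded to the vacc backend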
0
torch_vacc/fused_ops/__init__.py
Normal file
BIN
torch_vacc/fused_ops/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/fused_ops/__pycache__/rms_norm.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/fused_ops/__pycache__/rope_emb.cpython-312.pyc
Normal file
Binary file not shown.
42
torch_vacc/fused_ops/rms_norm.py
Normal file
@@ -0,0 +1,42 @@
import torch
import torch_vacc
from torch_vacc._vacc_libs import _torch_vacc


class FusedRMSNormFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6):
        output, rsigma, var = torch.ops.vacc.rms_norm_forward(input, weight, eps)
        ctx.save_for_backward(input, weight, rsigma, var)
        ctx.eps = eps
        return output

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor):
        input, weight, rsigma, var = ctx.saved_tensors
        grad_input, grad_weight = _torch_vacc.rms_norm_backward(
            grad_output, input, weight, rsigma, var, ctx.eps
        )
        return grad_input, grad_weight, None


def rms_norm(input: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6):
    return FusedRMSNormFunction.apply(input, weight, eps)


class FusedRMSNorm(torch.nn.Module):
    def __init__(self, hidden_size, eps: float = 1e-6):
        super(FusedRMSNorm, self).__init__()
        self.eps = eps

        self.weight = torch.nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)

        output = FusedRMSNormFunction.apply(hidden_states, self.weight, self.eps)

        output = output.to(dtype)
        return output
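
For reference, RMSNorm scales the input by the reciprocal root-mean-square over the hidden dimension. A sketch comparing the fused module against a plain PyTorch implementation (illustrative, not part of the commit; assumes a VACC device and the registered vacc rms_norm kernels):

    import torch
    from torch_vacc.fused_ops.rms_norm import FusedRMSNorm

    def rms_norm_ref(x, w, eps=1e-6):
        # reference: x * rsqrt(mean(x^2) + eps) * w
        var = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(var + eps) * w

    x = torch.randn(2, 16, 64, device="vacc")
    m = FusedRMSNorm(64).to("vacc")
    torch.testing.assert_close(m(x), rms_norm_ref(x, m.weight), rtol=1e-3, atol=1e-3)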
32
torch_vacc/fused_ops/rope_emb.py
Normal file
@@ -0,0 +1,32 @@
import torch
import torch_vacc
from torch_vacc._vacc_libs import _torch_vacc


class FusedRopeEmbFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, q: torch.Tensor, k: torch.Tensor, offset: int):
        qemb, kemb = _torch_vacc.rope_forward(q, k, offset)
        ctx.offset = offset

        return qemb, kemb

    @staticmethod
    def backward(ctx, q_out_grad: torch.Tensor, k_out_grad: torch.Tensor):
        grad_input, grad_rope = _torch_vacc.rope_backward(
            q_out_grad, k_out_grad, ctx.offset
        )
        return grad_input, grad_rope, None


def rope_emb(q: torch.Tensor, k: torch.Tensor, offset: int):
    # return FusedRopeEmbFunction.apply(q, k, offset)
    return torch_vacc.vacc.custom_ops.RotaryPosEmbedding(q=q, k=k, offset=offset)


class RopeEmb(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, q: torch.Tensor, k: torch.Tensor, offset: int):
        return rope_emb(q, k, offset)
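
Note that rope_emb currently routes through the custom_ops kernel rather than FusedRopeEmbFunction, so gradients flow only if the custom op defines them. A usage sketch (illustrative; the 4-D layout is an assumption, check the kernel's expected shape):

    import torch
    from torch_vacc.fused_ops.rope_emb import rope_emb

    q = torch.randn(1, 32, 128, 128, device="vacc")  # assumed (batch, heads, seq, head_dim)
    k = torch.randn(1, 32, 128, 128, device="vacc")
    q_rot, k_rot = rope_emb(q, k, offset=0)  # rotary embedding applied from position 0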
62
torch_vacc/testing/__init__.py
Normal file
@@ -0,0 +1,62 @@
from contextlib import contextmanager
import os
import sys
import torch
from torch.testing import make_tensor
from functools import partial, wraps
import torch.testing._internal.common_device_type as cdt
from torch.testing._internal.common_device_type import (
    DeviceTypeTestBase,
    dtypes,
    instantiate_device_type_tests,
    onlyOn,
    onlyPRIVATEUSE1,
    ops,
)

if sys.version_info > (3, 8):
    from torch.testing._internal.common_distributed import (
        MultiProcessTestCase,
        init_multigpu_helper,
        skip_if_lt_x_gpu,
        get_timeout,
        # skip_if_rocm,
        with_dist_debug_levels,
    )
else:
    from torch.testing._internal.common_distributed import (
        MultiProcessTestCase,
        init_multigpu_helper,
        skip_if_lt_x_gpu,
        get_timeout,
        skip_if_rocm,
        with_dist_debug_levels,
    )

from torch.testing._internal.common_utils import (
    TestCase,
    load_tests,
    parametrize,
    run_tests,
    subtest,
    retry_on_connect_failures,
    instantiate_parametrized_tests,
)

onlyVacc = onlyPRIVATEUSE1


class VaccTestBase(DeviceTypeTestBase):
    device_type = "vacc"


if VaccTestBase not in cdt.device_type_test_bases:
    cdt.device_type_test_bases.append(VaccTestBase)


@contextmanager
def freeze_rng_state():
    rng_state = torch.get_rng_state()
    yield
    torch.set_rng_state(rng_state)
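
With VaccTestBase registered, PyTorch's device-type test machinery can stamp out vacc variants of generic tests. A minimal sketch (illustrative, not part of the commit):

    import torch
    from torch_vacc.testing import TestCase, instantiate_device_type_tests, run_tests

    class TestAdd(TestCase):
        def test_add(self, device):
            a = torch.ones(3, device=device)  # device == "vacc" via VaccTestBase
            self.assertEqual((a + a).sum().item(), 6.0)

    instantiate_device_type_tests(TestAdd, globals(), only_for="vacc")

    if __name__ == "__main__":
        run_tests()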
BIN
torch_vacc/testing/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/testing/__pycache__/summarize_report.cpython-312.pyc
Normal file
Binary file not shown.
103
torch_vacc/testing/summarize_report.py
Normal file
@@ -0,0 +1,103 @@
"""
Tool to summarize unit test XML reports. It summarizes:
* the number of tests, plus failure/error/skipped counts
* the 10 slowest tests

Usage:
    python -m torch_vacc.testing.summarize_report --report report.xml
"""

import argparse
from dataclasses import dataclass
from xml.etree import ElementTree as ET
import sys

from torch_vacc import set_global_log_level


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--report", type=str)
    return parser.parse_args()


def summarize_testsuites(suites):
    # Each value starts out as the callable used to convert that attribute's
    # string value, and is replaced below by the converted sum over all suites.
    summary = {
        "errors": int,
        "failures": int,
        "skipped": int,
        "skips": int,
        "tests": int,
        "time": float,
    }

    attribs = [s.attrib for s in suites]
    for key in summary:
        summary[key] = sum(summary[key](a[key]) for a in attribs if key in a)
    assert not (summary["skipped"] and summary["skips"])
    if summary["skips"]:
        summary["skipped"] = summary["skips"]
    return summary
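
# Worked micro-example (illustrative): with two <testsuite> elements carrying
# tests="3" time="1.5" and tests="2" time="0.5", the loop above evaluates
#   summary["tests"] = int("3") + int("2") = 5
#   summary["time"]  = float("1.5") + float("0.5") = 2.0
# Reports emit either "skipped" or "skips" depending on the producer, which is
# why the two counters are folded together afterwards.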

def format_summary(summary):
    template = "Ran {tests} tests in {time:.3f}s (errors={errors}, failures={failures}, skipped={skipped})"
    msg = template.format(**summary)
    if summary["errors"] > 0 or summary["failures"] > 0:
        msg = "FAILED. " + msg
    return msg


@dataclass
class TestCaseInfo:
    test_class_name: str
    test_name: str
    time: float
    timestamp: str
    success: bool

    def __lt__(self, other):
        return self.time < other.time


def sort_cases_by_time(suites):
    test_cases = [
        TestCaseInfo(
            s.attrib["classname"],
            s.attrib["name"],
            float(s.attrib["time"]),
            s.attrib["timestamp"],
            s.attrib.get("failure") is None,
        )
        for s in suites
    ]
    test_cases.sort(reverse=True)
    return test_cases


def read_report(fpath):
    with open(fpath) as report:
        try:
            report = ET.parse(report)
        except ET.ParseError:
            print(f"{sys.argv[0]}: Cannot parse file {fpath}", file=sys.stderr)
            return
    root = report.getroot()
    suites = [root] if root.tag == "testsuite" else root.findall("testsuite")
    summary = summarize_testsuites(suites)
    summary_msg = format_summary(summary)
    print(summary_msg)

    for suite in suites:
        cases = sort_cases_by_time(suite.findall("testcase"))
        for case in cases[:10]:
            print(case)


def main():
    set_global_log_level("ERROR")
    args = parse_args()
    read_report(args.report)


if __name__ == "__main__":
    main()
184
torch_vacc/vacc/__init__.py
Normal file
@@ -0,0 +1,184 @@
from __future__ import annotations
from typing import Tuple

import torch

from ._device import (
    current_device,
    device,
    device_count,
    get_device_capability,
    get_device_name,
    get_device_properties,
    is_available,
    is_bf16_supported,
    set_device,
    synchronize,
)
from .amp import (
    get_amp_supported_dtype,
    get_autocast_dtype,
    is_autocast_enabled,
    set_autocast_dtype,
    set_autocast_enabled,
)
from .lazy_initialize import _is_in_bad_fork, _lazy_call, _lazy_init
from .memory import (  # caching_allocator_alloc,; caching_allocator_delete,
    empty_cache,
    get_allocator_backend,
    max_memory_allocated,
    max_memory_cached,
    max_memory_reserved,
    mem_get_info,
    memory_allocated,
    memory_cached,
    memory_reserved,
    memory_snapshot,
    memory_stats,
    memory_stats_as_nested_dict,
    memory_summary,
    reset_accumulated_memory_stats,
    reset_max_memory_allocated,
    reset_max_memory_cached,
    reset_peak_memory_stats,
    set_per_process_memory_fraction,
)
from .streams import Event, Stream, current_stream, default_stream, set_stream, stream


def init():
    r"""Initialize PyTorch's VACC state. You may need to call
    this explicitly if you are interacting with PyTorch via
    its C API, as Python bindings for VACC functionality will not
    be available until this initialization takes place. Ordinary users
    should not need this, as all of PyTorch's VACC methods
    automatically initialize VACC state on demand.

    Does nothing if the VACC state is already initialized.
    """
    _lazy_init()


# default_generators is empty until _lazy_init() is called
default_generators: Tuple[torch._C.Generator] = ()  # type: ignore[assignment]

from .custom_ops import *
from .custom_qwen3_ops import *
from .random import *  # noqa: F403

__all__ = [
    "device",
    "is_available",
    "is_bf16_supported",
    "current_device",
    "set_device",
    "device_count",
    "get_device_properties",
    "get_device_name",
    "get_device_capability",
    "synchronize",
    "amp",
    "get_amp_supported_dtype",
    "is_autocast_enabled",
    "set_autocast_enabled",
    "get_autocast_dtype",
    "set_autocast_dtype",
    "_is_in_bad_fork",
    "_lazy_call",
    "get_rng_state",
    "get_rng_state_all",
    "set_rng_state",
    "set_rng_state_all",
    "manual_seed",
    "manual_seed_all",
    "seed",
    "seed_all",
    "initial_seed",
    "set_stream",
    "current_stream",
    "default_generators",
    "default_stream",
    "stream",
    "Stream",
    "Event",
    "mem_get_info",
    "set_per_process_memory_fraction",
    "empty_cache",
    "memory_stats",
    "memory_stats_as_nested_dict",
    "reset_accumulated_memory_stats",
    "reset_peak_memory_stats",
    "reset_max_memory_allocated",
    "reset_max_memory_cached",
    "memory_allocated",
    "max_memory_allocated",
    "memory_reserved",
    "max_memory_reserved",
    "memory_cached",
    "max_memory_cached",
    "memory_snapshot",
    "memory_summary",
    "get_allocator_backend",
    "rms_norm",
    "RotaryPosEmbedding",
    "scaled_dot_product_attention",
    "scaled_dot_product_attention_cp_forward",
    "scaled_dot_product_attention_cp_backward",
    "swiglu",
    "paged_attention",
    "reshape_and_cache_attention",
    "concat_and_cache_attention",
    "w8a8_block_fp8_matmul",
    "moe_expert_token_group_reassign",
    "fused_mlp_mm_fp8",
    "fused_mlp_fp8",
    "fused_moe_preprocess",
    "fused_residual_rmsnorm",
    "parallel_embedding",
    "all_reduce",
    "all_gather",
    "broadcast",
    "fused_mlp_moe_with_rmsnorm",
    "fuse_moe_decode_v2_allreduce",
    "topk_topp",
    "fused_mla",
    "fused_mla_allreduce",
    "fused_mlp_with_rmsnorm",
    "fused_mlp_allreduce",
    "ds3_sampler",
    "sampler_v1",
    "rejection_sampler",
    "rejection_sampler_update_hidden_states",
    "rejection_sampler_v1",
    "fused_matmul_allgather",
    "fused_mla_v2",
    "fused_mla_allreduce_v2",
    "mla_matmul_scale",
    "mla_matmul",
    "fused_mla_prefill_stage0",
    "fused_mla_prefill_stage1",
    "fused_mla_prefill_stage0_allreduce",
    "fuse_moe_prefill_stage0",
    "fuse_mla_mlp_v2_allreduce_decode",
    "fuse_mla_moe_v2_allreduce_decode",
    "fuse_mla_mlp_v2_allreduce_decode_layers",
    "fuse_mla_moe_v2_allreduce_decode_layers",
    "fuse_mla_mlp_v2_allreduce_decode_layers_v2",
    "fuse_mla_moe_v2_allreduce_decode_layers_v2",
    "fuse_mlp_qwen_int4",
    "fuse_mlp_qwen_int4_reduce",
    "w4a8_block_int4_matmul",
    "fuse_atten_qwen3",
    "fuse_atten_qwen2",
    "qwen3_fuse_attention_moe_decode",
    "fuse_mtp_stage0",
    "fuse_mtp_allreduce",
    "roll_out",
    "fused_experts_int4_prefill",
    "fuse_bge_embedding_stage1",
    "l2_norm",
    "fuse_mlp_vision",
    "patch_merger_vision",
    "fuse_atten_vit",
    "apply_penalties",
]
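
Taken together, torch.vacc mirrors the torch.cuda surface. A short sketch of the device/stream/memory API re-exported above (illustrative, not part of the commit; assumes at least one device):

    import torch
    import torch_vacc

    print(torch.vacc.device_count(), torch.vacc.get_device_name(0))
    with torch.vacc.device(0):
        s = torch.vacc.Stream()
        with torch.vacc.stream(s):
            x = torch.ones(1024, device="vacc")
        torch.vacc.synchronize()
    print(torch.vacc.memory_allocated())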
BIN
torch_vacc/vacc/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/vacc/__pycache__/_device.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/vacc/__pycache__/custom_ops.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/vacc/__pycache__/custom_ops_cpu.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/vacc/__pycache__/custom_qwen3_ops.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/vacc/__pycache__/lazy_initialize.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/vacc/__pycache__/memory.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/vacc/__pycache__/random.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/vacc/__pycache__/streams.cpython-312.pyc
Normal file
Binary file not shown.
106
torch_vacc/vacc/_device.py
Normal file
@@ -0,0 +1,106 @@
# Device information
# replacing `torch.cuda.func` with `torch_vacc.vacc.func`.
# see https://pytorch.org/docs/stable/cuda.html

from typing import Any
import warnings

import torch
import torch_vacc
from torch._utils import _get_device_index
from torch_vacc._vacc_libs import _torch_vacc

from .lazy_initialize import _lazy_init

if hasattr(_torch_vacc, "_exchange_device"):
    _exchange_device = _torch_vacc._exchange_device
else:

    def _exchange_device(device: int) -> int:
        if device < 0:
            return -1
        prev_device = current_device()
        if device != prev_device:
            set_device(device)
        return prev_device


class device(object):
    """Context-manager that changes the selected device.

    Args:
        device (torch.device or int): device index to select. It's a no-op if
            this argument is a negative integer or ``None``.
    """

    def __init__(self, device: Any):
        self.idx = _get_device_index(device, optional=True)
        self.prev_idx = -1

    def __enter__(self):
        self.prev_idx = _exchange_device(self.idx)

    def __exit__(self, *args):
        _exchange_device(self.prev_idx)
        return False
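
# Illustrative usage (not part of the original commit):
#
#   with torch_vacc.vacc.device(1):          # select device 1 inside the block
#       t = torch.empty(8, device="vacc")    # allocated on vacc:1
#   # the previous device is restored on exit via _exchange_device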

def is_available() -> bool:
    r"""Returns whether vacc is available."""
    return device_count() > 0


def is_bf16_supported() -> bool:
    r"""Returns a bool indicating whether the current vacc device supports dtype bfloat16."""
    return True


def current_device() -> int:
    r"""Returns the index of the currently selected vacc device."""
    _lazy_init()
    return _torch_vacc._current_device()


def set_device(device: torch.device):
    device_index = _get_device_index(device, optional=True)
    if device_index >= 0:
        _torch_vacc._set_device(device_index)


def get_device_capability(device=None):
    r"""Query the major and minor compute capability of a device. VACC does not
    have a corresponding concept, so this is not supported.
    """
    _infos = (
        "torch.vacc.get_device_capability isn't implemented! Unlike CUDA's "
        "(major, minor) capability, please do the version check in other ways."
    )
    raise AssertionError(_infos)


def get_device_name(device_name=None):
    device_id = _get_device_index(device_name, optional=True)
    if device_id < 0 or device_id >= device_count():
        raise AssertionError("Invalid device id")
    _lazy_init()
    device_prop = _torch_vacc._vacc_getDeviceProperties(device_id)
    return device_prop.name


def get_device_properties(device_name=None):
    device_id = _get_device_index(device_name, optional=True)
    if device_id < 0 or device_id >= device_count():
        raise AssertionError("Invalid device id")
    _lazy_init()
    return _torch_vacc._vacc_getDeviceProperties(device_id)


def device_count():
    r"""Returns the number of available vacc devices."""
    return _torch_vacc._device_count()


def synchronize(device=None) -> None:
    """Waits for all operations in all streams on a VACC device to complete."""
    _lazy_init()
    with torch_vacc.vacc.device(device):
        return _torch_vacc._device_synchronize()


# Memory management (https://pytorch.org/docs/stable/cuda.html#memory-management)
26
torch_vacc/vacc/amp/__init__.py
Normal file
@@ -0,0 +1,26 @@
from typing import List
import torch
from torch_vacc._vacc_libs import _torch_vacc

from .grad_scaler import OptState, GradScaler
from .autocast_mode import autocast, custom_fwd, custom_bwd


def get_amp_supported_dtype() -> List[torch.dtype]:
    return [torch.float16, torch.bfloat16]


def is_autocast_enabled() -> bool:
    return _torch_vacc.is_autocast_enabled()


def set_autocast_enabled(enable: bool):
    _torch_vacc.set_autocast_enabled(enable)


def get_autocast_dtype() -> torch.dtype:
    return _torch_vacc.get_autocast_dtype()


def set_autocast_dtype(dtype: torch.dtype):
    return _torch_vacc.set_autocast_dtype(dtype)
BIN
torch_vacc/vacc/amp/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/vacc/amp/__pycache__/autocast_mode.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/vacc/amp/__pycache__/common.cpython-312.pyc
Normal file
Binary file not shown.
BIN
torch_vacc/vacc/amp/__pycache__/grad_scaler.cpython-312.pyc
Normal file
Binary file not shown.
144
torch_vacc/vacc/amp/autocast_mode.py
Normal file
@@ -0,0 +1,144 @@
import collections
import functools
from typing import Any

import torch

try:
    import numpy as np

    HAS_NUMPY = True
except ModuleNotFoundError:
    HAS_NUMPY = False
    np = None  # type: ignore[assignment]

__all__ = ["autocast", "custom_fwd", "custom_bwd"]


class autocast(torch.amp.autocast_mode.autocast):
    r"""See :class:`torch.autocast`.

    ``torch.vacc.amp.autocast(args...)`` is equivalent to ``torch.autocast("vacc", args...)``
    """

    def __init__(
        self,
        enabled: bool = True,
        dtype: torch.dtype = torch.float16,
        cache_enabled: bool = True,
    ):
        if torch._jit_internal.is_scripting():
            self._enabled = enabled
            self.device = "vacc"
            self.fast_dtype = dtype
            return
        super().__init__(
            "vacc", enabled=enabled, dtype=dtype, cache_enabled=cache_enabled
        )

    def __enter__(self):
        if torch._jit_internal.is_scripting():
            return self
        return super().__enter__()

    # TODO: discuss a unified TorchScript-friendly API for autocast
    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any):  # type: ignore[override]
        if torch._jit_internal.is_scripting():
            return
        return super().__exit__(exc_type, exc_val, exc_tb)

    def __call__(self, func):
        if torch._jit_internal.is_scripting():
            return func
        return super().__call__(func)
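
# Illustrative usage (not part of the original commit):
#
#   import torch_vacc
#   with torch_vacc.vacc.amp.autocast(dtype=torch.float16):
#       out = model(inp)                     # eligible ops run in float16 on vacc
#   loss = loss_fn(out.float(), target)      # compute the loss outside autocast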

# Casts Tensors and containers of Tensors. Special-cases passthroughs for strings and np.ndarrays, which
# may be falsely detected as "Iterables."
def _cast(value, dtype):
    if isinstance(value, torch.Tensor):
        is_eligible = (
            value.is_floating_point()
            and value.is_vacc
            and (value.dtype is not torch.float64)
        )
        return value.to(dtype) if is_eligible else value
    elif isinstance(value, (str, bytes)):
        return value
    elif HAS_NUMPY and isinstance(value, np.ndarray):
        return value
    elif isinstance(value, collections.abc.Mapping):
        return {_cast(k, dtype): _cast(v, dtype) for k, v in value.items()}
    elif isinstance(value, collections.abc.Iterable):
        iterable = (_cast(v, dtype) for v in value)
        if isinstance(value, (list, tuple)):
            return type(value)(iterable)
        else:
            return iterable
    else:
        return value


# custom_fwd is a decorator that may or may not be used with arguments, following
# https://github.com/dabeaz/python-cookbook/tree/master/src/9/defining_a_decorator_that_takes_an_optional_argument.
# this works:
#     @custom_fwd
#     def forward(...):
# this also works:
#     @custom_fwd(cast_inputs=torch.float)
#     def forward(...):
def custom_fwd(fwd=None, *, cast_inputs=None):
    """
    Create a helper decorator for ``forward`` methods of custom autograd functions.

    Autograd functions are subclasses of :class:`torch.autograd.Function`.
    See the :ref:`example page<amp-custom-examples>` for more detail.

    Args:
        cast_inputs (:class:`torch.dtype` or None, optional, default=None): If not ``None``,
            when ``forward`` runs in an autocast-enabled region, casts incoming
            floating-point VACC Tensors to the target dtype (non-floating-point Tensors are not affected),
            then executes ``forward`` with autocast disabled.
            If ``None``, ``forward``'s internal ops execute with the current autocast state.

    .. note::
        If the decorated ``forward`` is called outside an autocast-enabled region,
        :func:`custom_fwd<custom_fwd>` is a no-op and ``cast_inputs`` has no effect.
    """
    if fwd is None:
        return functools.partial(custom_fwd, cast_inputs=cast_inputs)

    @functools.wraps(fwd)
    def decorate_fwd(*args, **kwargs):
        args[0]._dtype = torch.get_autocast_gpu_dtype()
        if cast_inputs is None:
            args[0]._fwd_used_autocast = torch.is_autocast_enabled()
            return fwd(*args, **kwargs)
        else:
            autocast_context = torch.is_autocast_enabled()
            args[0]._fwd_used_autocast = False
            if autocast_context:
                with autocast(enabled=False):
                    return fwd(*_cast(args, cast_inputs), **_cast(kwargs, cast_inputs))
            else:
                return fwd(*args, **kwargs)

    return decorate_fwd


# Autograd ensures incoming gradients are the same type as forward outputs. Allowing a separate
# cast_inputs argument on custom_bwd is unnecessary and could cause errors if it doesn't match
# cast_inputs supplied to custom_fwd.
def custom_bwd(bwd):
    """Create a helper decorator for backward methods of custom autograd functions.

    Autograd functions are subclasses of :class:`torch.autograd.Function`.
    Ensures that ``backward`` executes with the same autocast state as ``forward``.
    See the :ref:`example page<amp-custom-examples>` for more detail.
    """

    @functools.wraps(bwd)
    def decorate_bwd(*args, **kwargs):
        with autocast(enabled=args[0]._fwd_used_autocast, dtype=args[0]._dtype):
            return bwd(*args, **kwargs)

    return decorate_bwd
7
torch_vacc/vacc/amp/common.py
Normal file
@@ -0,0 +1,7 @@
import torch

__all__ = ["amp_definitely_not_available"]


def amp_definitely_not_available():
    return not torch.vacc.is_available()
667
torch_vacc/vacc/amp/grad_scaler.py
Normal file
@@ -0,0 +1,667 @@
|
|||||||
|
import inspect
|
||||||
|
import warnings
|
||||||
|
from collections import abc, defaultdict
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any, cast, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
from .common import amp_definitely_not_available
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["OptState", "GradScaler"]
|
||||||
|
|
||||||
|
|
||||||
|
class _MultiDeviceReplicator:
|
||||||
|
"""
|
||||||
|
Lazily serves copies of a tensor to requested devices. Copies are cached per-device.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, master_tensor: torch.Tensor) -> None:
|
||||||
|
assert (
|
||||||
|
master_tensor.is_cuda
|
||||||
|
or master_tensor.device.type == "xla"
|
||||||
|
or master_tensor.device.type == "vacc"
|
||||||
|
)
|
||||||
|
self.master = master_tensor
|
||||||
|
self._per_device_tensors: Dict[torch.device, torch.Tensor] = {}
|
||||||
|
|
||||||
|
def get(self, device) -> torch.Tensor:
|
||||||
|
retval = self._per_device_tensors.get(device, None)
|
||||||
|
if retval is None:
|
||||||
|
retval = self.master.to(device=device, non_blocking=True, copy=True)
|
||||||
|
self._per_device_tensors[device] = retval
|
||||||
|
return retval
|
||||||
|
|
||||||
|
|
||||||
|
# Defines default_factory for GradScaler's _per_optimizer_states defaultdict,
|
||||||
|
# as well as associated "enum" values. Prefers defining these at top level because
|
||||||
|
# - Lambdas can't be pickled, so we don't want to supply a lambda as the factory.
|
||||||
|
# - Defining READY, UNSCALED, STEPPED and _refresh_per_optimizer_state within GradScaler
|
||||||
|
# causes a circular reference, which we'd rather avoid.
|
||||||
|
class OptState(Enum):
|
||||||
|
READY = 0
|
||||||
|
UNSCALED = 1
|
||||||
|
STEPPED = 2
|
||||||
|
|
||||||
|
|
||||||
|
def _refresh_per_optimizer_state():
|
||||||
|
return {"stage": OptState.READY, "found_inf_per_device": {}}
|
||||||
|
|
||||||
|
|
||||||
|
class GradScaler:
|
||||||
|
_scale: Optional[torch.Tensor]
|
||||||
|
_grows_tracker: Optional[torch.Tensor]
|
||||||
|
_per_optimizer_states: Dict[int, Dict[str, Any]]
|
||||||
|
"""
|
||||||
|
An instance ``scaler`` of :class:`GradScaler` helps perform the steps of gradient scaling
|
||||||
|
conveniently.
|
||||||
|
|
||||||
|
* ``scaler.scale(loss)`` multiplies a given loss by ``scaler``'s current scale factor.
|
||||||
|
* ``scaler.step(optimizer)`` safely unscales gradients and calls ``optimizer.step()``.
|
||||||
|
* ``scaler.update()`` updates ``scaler``'s scale factor.
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
# Creates a GradScaler once at the beginning of training.
|
||||||
|
scaler = GradScaler()
|
||||||
|
|
||||||
|
for epoch in epochs:
|
||||||
|
for input, target in data:
|
||||||
|
optimizer.zero_grad()
|
||||||
|
output = model(input)
|
||||||
|
loss = loss_fn(output, target)
|
||||||
|
|
||||||
|
# Scales loss. Calls backward() on scaled loss to create scaled gradients.
|
||||||
|
scaler.scale(loss).backward()
|
||||||
|
|
||||||
|
# scaler.step() first unscales gradients of the optimizer's params.
|
||||||
|
# If gradients don't contain infs/NaNs, optimizer.step() is then called,
|
||||||
|
# otherwise, optimizer.step() is skipped.
|
||||||
|
scaler.step(optimizer)
|
||||||
|
|
||||||
|
# Updates the scale for next iteration.
|
||||||
|
scaler.update()
|
||||||
|
|
||||||
|
See the :ref:`Automatic Mixed Precision examples<amp-examples>` for usage
|
||||||
|
(along with autocasting) in more complex cases like gradient clipping, gradient accumulation, gradient penalty,
|
||||||
|
and multiple losses/optimizers.
|
||||||
|
|
||||||
|
``scaler`` dynamically estimates the scale factor each iteration. To minimize gradient underflow,
|
||||||
|
a large scale factor should be used. However, ``float16`` values can "overflow" (become inf or NaN) if
|
||||||
|
the scale factor is too large. Therefore, the optimal scale factor is the largest factor that can be used
|
||||||
|
without incurring inf or NaN gradient values.
|
||||||
|
``scaler`` approximates the optimal scale factor over time by checking the gradients for infs and NaNs during every
|
||||||
|
``scaler.step(optimizer)`` (or optional separate ``scaler.unscale_(optimizer)``, see :meth:`unscale_`).
|
||||||
|
|
||||||
|
* If infs/NaNs are found, ``scaler.step(optimizer)`` skips the underlying ``optimizer.step()`` (so the params
|
||||||
|
themselves remain uncorrupted) and ``update()`` multiplies the scale by ``backoff_factor``.
|
||||||
|
|
||||||
|
* If no infs/NaNs are found, ``scaler.step(optimizer)`` runs the underlying ``optimizer.step()`` as usual.
|
||||||
|
If ``growth_interval`` unskipped iterations occur consecutively, ``update()`` multiplies the scale by
|
||||||
|
``growth_factor``.
|
||||||
|
|
||||||
|
The scale factor often causes infs/NaNs to appear in gradients for the first few iterations as its
|
||||||
|
value calibrates. ``scaler.step`` will skip the underlying ``optimizer.step()`` for these
|
||||||
|
iterations. After that, step skipping should occur rarely (once every few hundred or thousand iterations).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
init_scale (float, optional, default=2.**16): Initial scale factor.
|
||||||
|
growth_factor (float, optional, default=2.0): Factor by which the scale is multiplied during
|
||||||
|
:meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
|
||||||
|
backoff_factor (float, optional, default=0.5): Factor by which the scale is multiplied during
|
||||||
|
:meth:`update` if inf/NaN gradients occur in an iteration.
|
||||||
|
growth_interval (int, optional, default=2000): Number of consecutive iterations without inf/NaN gradients
|
||||||
|
that must occur for the scale to be multiplied by ``growth_factor``.
|
||||||
|
enabled (bool, optional): If ``False``, disables gradient scaling. :meth:`step` simply
|
||||||
|
invokes the underlying ``optimizer.step()``, and other methods become no-ops.
|
||||||
|
Default: ``True``
|
||||||
|
"""
|
||||||
|
|
||||||
|
    def __init__(
        self,
        init_scale=2.0**16,
        growth_factor=2.0,
        backoff_factor=0.5,
        growth_interval=2000,
        enabled=True,
    ):
        if enabled and amp_definitely_not_available():
            warnings.warn(
                "torch.vacc.amp.GradScaler is enabled, but VACC device is not available. Disabling."
            )
            self._enabled = False
        else:
            self._enabled = enabled

        if self._enabled:
            assert growth_factor > 1.0, "The growth factor must be > 1.0."
            assert backoff_factor < 1.0, "The backoff factor must be < 1.0."

            self._init_scale = init_scale
            # self._scale will be lazily initialized during the first call to scale()
            self._scale = None
            self._growth_factor = growth_factor
            self._backoff_factor = backoff_factor
            self._growth_interval = growth_interval
            self._init_growth_tracker = 0
            # self._growth_tracker will be lazily initialized during the first call to scale()
            self._growth_tracker = None
            self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)

    def _check_scale_growth_tracker(
        self, funcname
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        fix = "This may indicate your script did not use scaler.scale(loss or outputs) earlier in the iteration."
        assert self._scale is not None, (
            f"Attempted {funcname} but _scale is None. " + fix
        )
        assert self._growth_tracker is not None, (
            f"Attempted {funcname} but _growth_tracker is None. " + fix
        )
        return (self._scale, self._growth_tracker)

    def _lazy_init_scale_growth_tracker(self, dev):
        assert self._growth_tracker is None, "_growth_tracker initialized before _scale"
        self._scale = torch.full((), self._init_scale, dtype=torch.float32, device=dev)
        self._growth_tracker = torch.full(
            (), self._init_growth_tracker, dtype=torch.int32, device=dev
        )

    def scale(self, outputs):
        """
        Multiplies ('scales') a tensor or list of tensors by the scale factor.

        Returns scaled outputs. If this instance of :class:`GradScaler` is not enabled, outputs are returned
        unmodified.

        Args:
            outputs (Tensor or iterable of Tensors): Outputs to scale.
        """
        if not self._enabled:
            return outputs

        # Short-circuit for the common case.
        if isinstance(outputs, torch.Tensor):
            assert (
                outputs.is_cuda
                or outputs.device.type == "xla"
                or outputs.device.type == "vacc"
            )
            if self._scale is None:
                self._lazy_init_scale_growth_tracker(outputs.device)
            assert self._scale is not None
            return outputs * self._scale.to(device=outputs.device, non_blocking=True)

        # Invoke the more complex machinery only if we're treating multiple outputs.
        stash: List[_MultiDeviceReplicator] = []  # holds a reference that can be overwritten by apply_scale

        def apply_scale(val):
            if isinstance(val, torch.Tensor):
                assert (
                    val.is_cuda or val.device.type == "xla" or val.device.type == "vacc"
                )
                if len(stash) == 0:
                    if self._scale is None:
                        self._lazy_init_scale_growth_tracker(val.device)
                    assert self._scale is not None
                    stash.append(_MultiDeviceReplicator(self._scale))
                return val * stash[0].get(val.device)
            elif isinstance(val, abc.Iterable):
                iterable = map(apply_scale, val)
                if isinstance(val, (list, tuple)):
                    return type(val)(iterable)
                else:
                    return iterable
            else:
                raise ValueError("outputs must be a Tensor or an iterable of Tensors")

        return apply_scale(outputs)

    def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16):
        per_device_inv_scale = _MultiDeviceReplicator(inv_scale)
        per_device_found_inf = _MultiDeviceReplicator(found_inf)

        # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
        # There could be hundreds of grads, so we'd like to iterate through them just once.
        # However, we don't know their devices or dtypes in advance.

        # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
        # Google says mypy struggles with defaultdicts type annotations.
        per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))  # type: ignore[var-annotated]
        with torch.no_grad():
            for group in optimizer.param_groups:
                for param in group["params"]:
                    if param.grad is None:
                        continue
                    if (not allow_fp16) and param.grad.dtype == torch.float16:
                        raise ValueError("Attempting to unscale FP16 gradients.")
                    if param.grad.is_sparse:
                        # is_coalesced() == False means the sparse grad has values with duplicate indices.
                        # coalesce() deduplicates indices and adds all values that have the same index.
                        # For scaled fp16 values, there's a good chance coalescing will cause overflow,
                        # so we should check the coalesced _values().
                        if param.grad.dtype is torch.float16:
                            param.grad = param.grad.coalesce()
                        to_unscale = param.grad._values()
                    else:
                        to_unscale = param.grad

                    # TODO: is there a way to split by device and dtype without appending in the inner loop?
                    per_device_and_dtype_grads[to_unscale.device][
                        to_unscale.dtype
                    ].append(to_unscale)

            for device, per_dtype_grads in per_device_and_dtype_grads.items():
                for grads in per_dtype_grads.values():
                    torch._amp_foreach_non_finite_check_and_unscale_(
                        grads,
                        per_device_found_inf.get(device),
                        per_device_inv_scale.get(device),
                    )

        return per_device_found_inf._per_device_tensors

    def unscale_(self, optimizer):
        """
        Divides ("unscales") the optimizer's gradient tensors by the scale factor.

        :meth:`unscale_` is optional, serving cases where you need to
        :ref:`modify or inspect gradients<working-with-unscaled-gradients>`
        between the backward pass(es) and :meth:`step`.
        If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`.

        Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients::

            ...
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
            scaler.step(optimizer)
            scaler.update()

        Args:
            optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled.

        .. note::
            :meth:`unscale_` does not incur a CPU-GPU sync.

        .. warning::
            :meth:`unscale_` should only be called once per optimizer per :meth:`step` call,
            and only after all gradients for that optimizer's assigned parameters have been accumulated.
            Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError.

        .. warning::
            :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute.
        """
        if not self._enabled:
            return

        self._check_scale_growth_tracker("unscale_")

        optimizer_state = self._per_optimizer_states[id(optimizer)]

        if optimizer_state["stage"] is OptState.UNSCALED:
            raise RuntimeError(
                "unscale_() has already been called on this optimizer since the last update()."
            )
        elif optimizer_state["stage"] is OptState.STEPPED:
            raise RuntimeError("unscale_() is being called after step().")

        # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64.
        assert self._scale is not None
        inv_scale = self._scale.double().reciprocal().float()
        found_inf = torch.full((), 0.0, dtype=torch.float32, device=self._scale.device)

        optimizer_state["found_inf_per_device"] = self._unscale_grads_(
            optimizer, inv_scale, found_inf, False
        )
        optimizer_state["stage"] = OptState.UNSCALED

    def _maybe_opt_step(self, optimizer, optimizer_state, *args, **kwargs):
        retval = None
        if not sum(v.item() for v in optimizer_state["found_inf_per_device"].values()):
            retval = optimizer.step(*args, **kwargs)
        return retval

    def step(self, optimizer, *args, **kwargs):
        """
        :meth:`step` carries out the following two operations:

        1. Internally invokes ``unscale_(optimizer)`` (unless :meth:`unscale_` was explicitly called for ``optimizer``
           earlier in the iteration). As part of the :meth:`unscale_`, gradients are checked for infs/NaNs.
        2. If no inf/NaN gradients are found, invokes ``optimizer.step()`` using the unscaled
           gradients. Otherwise, ``optimizer.step()`` is skipped to avoid corrupting the params.

        ``*args`` and ``**kwargs`` are forwarded to ``optimizer.step()``.

        Returns the return value of ``optimizer.step(*args, **kwargs)``.

        Args:
            optimizer (torch.optim.Optimizer): Optimizer that applies the gradients.
            args: Any arguments.
            kwargs: Any keyword arguments.

        .. warning::
            Closure use is not currently supported.
        """
        if not self._enabled:
            return optimizer.step(*args, **kwargs)

        if "closure" in kwargs:
            raise RuntimeError(
                "Closure use is not currently supported if GradScaler is enabled."
            )

        self._check_scale_growth_tracker("step")

        optimizer_state = self._per_optimizer_states[id(optimizer)]

        if optimizer_state["stage"] is OptState.STEPPED:
            raise RuntimeError(
                "step() has already been called since the last update()."
            )

        retval = None

        if (
            hasattr(optimizer, "_step_supports_amp_scaling")
            and optimizer._step_supports_amp_scaling
        ):
            # This optimizer has customized scale-handling logic, so we can call optimizer.step() directly.
            # The contract with custom optimizers is that their step() should accept an additional,
            # optional grad_scaler kwarg. We append self to the kwargs so the custom optimizer has full information:
            # it can query its own state, invoke unscale_ on itself, etc.
            # The contract above is being deprecated to avoid introducing a `grad_scaler: GradScaler` argument
            # to `Optimizer.step`. The new behavior instead attaches two Tensor attributes, `grad_scale`
            # and `found_inf`, to the passed optimizer so that the optimizer can use them
            # to skip the parameter updates or unscale gradients before updating parameters in
            # the fused kernel, e.g. `FusedAdamMathFunctor`.
            # In this behavior, `GradScaler._check_inf_per_device` is called if the state is `OptState.READY`,
            # while the method is expected to be called on the user's side, i.e. by their optimizers.
            kwargs_ = kwargs
            has_grad_scaler_kwarg = (
                "grad_scaler" in inspect.signature(optimizer.step).parameters
            )
            if has_grad_scaler_kwarg:
                warnings.warn(
                    "GradScaler is going to stop passing itself as a keyword argument to the passed "
                    "optimizer. In the near future GradScaler will register `grad_scale: Tensor` and "
                    "`found_inf: Tensor` on the passed optimizer and let the optimizer use them directly.",
                    FutureWarning,
                )
                kwargs_.update({"grad_scaler": self})
            else:
                if optimizer_state["stage"] is OptState.READY:
                    self._check_inf_per_device(optimizer)
                scaler = self._get_scale_async()
                found_inf = cast(
                    torch.Tensor,
                    sum(
                        [
                            t.to(scaler.device, non_blocking=True)
                            for t in optimizer_state["found_inf_per_device"].values()
                        ]
                    ),
                )
                optimizer.grad_scale = (
                    None if optimizer_state["stage"] == OptState.UNSCALED else scaler
                )
                optimizer.found_inf = found_inf
            retval = optimizer.step(*args, **kwargs_)
            optimizer_state["stage"] = OptState.STEPPED
            if not has_grad_scaler_kwarg:
                del optimizer.grad_scale
                del optimizer.found_inf
            return retval

        if optimizer_state["stage"] is OptState.READY:
            self.unscale_(optimizer)

        assert (
            len(optimizer_state["found_inf_per_device"]) > 0
        ), "No inf checks were recorded for this optimizer."

        retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)

        optimizer_state["stage"] = OptState.STEPPED

        return retval

    def update(self, new_scale=None):
        """
        Updates the scale factor.

        If any optimizer steps were skipped, the scale is multiplied by ``backoff_factor``
        to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
        the scale is multiplied by ``growth_factor`` to increase it.

        Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
        used directly; it's used to fill GradScaler's internal scale tensor. So if
        ``new_scale`` was a tensor, later in-place changes to that tensor will not further
        affect the scale GradScaler uses internally.)

        Args:
            new_scale (float or :class:`torch.vacc.FloatTensor`, optional, default=None): New scale factor.

        .. warning::
            :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
            been invoked for all optimizers used this iteration.

        .. warning::
            For performance reasons, we do not check the scale factor value to avoid synchronizations,
            so the scale factor is not guaranteed to be above 1. If the scale falls below 1 and/or
            you are seeing NaNs in your gradients or loss, something is likely wrong. For example,
            bf16-pretrained models are often incompatible with AMP/fp16 due to differing dynamic ranges.
        """
        if not self._enabled:
            return

        _scale, _growth_tracker = self._check_scale_growth_tracker("update")

        if new_scale is not None:
            # Accept a new user-defined scale.
            if isinstance(new_scale, float):
                self._scale.fill_(new_scale)  # type: ignore[union-attr]
            else:
                reason = "new_scale should be a float or a 1-element torch.vacc.FloatTensor with requires_grad=False."
                # assert isinstance(new_scale, torch.vacc.FloatTensor), reason  # type: ignore[attr-defined]
                assert (
                    isinstance(new_scale, torch.Tensor)
                    and new_scale.dtype == torch.float32
                ), reason
                assert new_scale.numel() == 1, reason
                assert new_scale.requires_grad is False, reason
                self._scale.copy_(new_scale)  # type: ignore[union-attr]
        else:
            # Consume shared inf/nan data collected from optimizers to update the scale.
            # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous.
            found_infs = [
                found_inf.to(device=_scale.device, non_blocking=True)
                for state in self._per_optimizer_states.values()
                for found_inf in state["found_inf_per_device"].values()
            ]

            assert len(found_infs) > 0, "No inf checks were recorded prior to update."

            found_inf_combined = found_infs[0]
            if len(found_infs) > 1:
                for i in range(1, len(found_infs)):
                    found_inf_combined += found_infs[i]

            torch._amp_update_scale_(
                _scale,
                _growth_tracker,
                found_inf_combined,
                self._growth_factor,
                self._backoff_factor,
                self._growth_interval,
            )

        # To prepare for next iteration, clear the data collected from optimizers this iteration.
        self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)

    def _get_scale_async(self):
        return self._scale

    def get_scale(self):
        """
        Returns a Python float containing the current scale, or 1.0 if scaling is disabled.

        .. warning::
            :meth:`get_scale` incurs a CPU-GPU sync.
        """
        if self._enabled:
            return (
                self._init_scale
                if self._scale is None
                else self._get_scale_async().item()
            )
        else:
            return 1.0

    def get_growth_factor(self):
        r"""
        Returns a Python float containing the scale growth factor.
        """
        return self._growth_factor

    def set_growth_factor(self, new_factor):
        r"""
        Args:
            new_factor (float): Value to use as the new scale growth factor.
        """
        self._growth_factor = new_factor

    def get_backoff_factor(self):
        r"""
        Returns a Python float containing the scale backoff factor.
        """
        return self._backoff_factor

    def set_backoff_factor(self, new_factor):
        r"""
        Args:
            new_factor (float): Value to use as the new scale backoff factor.
        """
        self._backoff_factor = new_factor

    def get_growth_interval(self):
        r"""
        Returns a Python int containing the growth interval.
        """
        return self._growth_interval

    def set_growth_interval(self, new_interval):
        r"""
        Args:
            new_interval (int): Value to use as the new growth interval.
        """
        self._growth_interval = new_interval

    def _get_growth_tracker(self):
        if self._enabled:
            return (
                self._init_growth_tracker
                if self._growth_tracker is None
                else self._growth_tracker.item()
            )
        else:
            return 0

    def is_enabled(self):
        r"""
        Returns a bool indicating whether this instance is enabled.
        """
        return self._enabled

    def state_dict(self):
        r"""
        Returns the state of the scaler as a :class:`dict`. It contains five entries:

        * ``"scale"`` - a Python float containing the current scale
        * ``"growth_factor"`` - a Python float containing the current growth factor
        * ``"backoff_factor"`` - a Python float containing the current backoff factor
        * ``"growth_interval"`` - a Python int containing the current growth interval
        * ``"_growth_tracker"`` - a Python int containing the number of recent consecutive unskipped steps.

        If this instance is not enabled, returns an empty dict.

        .. note::
            If you wish to checkpoint the scaler's state after a particular iteration, :meth:`state_dict`
            should be called after :meth:`update`.
        """
        return (
            {
                "scale": self.get_scale(),
                "growth_factor": self._growth_factor,
                "backoff_factor": self._backoff_factor,
                "growth_interval": self._growth_interval,
                "_growth_tracker": self._get_growth_tracker(),
            }
            if self._enabled
            else {}
        )

    def load_state_dict(self, state_dict):
        r"""
        Loads the scaler state. If this instance is disabled, :meth:`load_state_dict` is a no-op.

        Args:
            state_dict(dict): scaler state. Should be an object returned from a call to :meth:`state_dict`.
        """
        if not self._enabled:
            return

        if len(state_dict) == 0:
            raise RuntimeError(
                "The source state dict is empty, possibly because it was saved "
                "from a disabled instance of GradScaler."
            )

        self._init_scale = state_dict["scale"]
        if self._scale is not None:
            self._scale.fill_(state_dict["scale"])
        self._growth_factor = state_dict["growth_factor"]
        self._backoff_factor = state_dict["backoff_factor"]
        self._growth_interval = state_dict["growth_interval"]
        self._init_growth_tracker = state_dict["_growth_tracker"]
        if self._growth_tracker is not None:
            self._growth_tracker.fill_(state_dict["_growth_tracker"])

    def __getstate__(self):
        state = self.__dict__.copy()
        if self._enabled:
            assert len(self._per_optimizer_states) == 0, (
                "A GradScaler instance may only be pickled at the beginning "
                "of an iteration, or at the end after scaler.update()."
            )
            # Pickling _scale and _growth_tracker Tensors directly triggers
            # "warnings.warn("pickle support for Storage will be removed in 1.5...",
            # so instead, we set the unpickled instance up to reinitialize them lazily.
            state["_init_scale"] = self.get_scale()
            state["_init_growth_tracker"] = self._get_growth_tracker()
            state["_scale"] = None
            state["_growth_tracker"] = None
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)

    def _check_inf_per_device(self, optimizer):
        _scale, _ = self._check_scale_growth_tracker("_check_inf_per_device")

        dummy_inv_scale = torch.full((), 1.0, dtype=torch.float32, device=_scale.device)
        found_inf = torch.full((), 0.0, dtype=torch.float32, device=_scale.device)

        self._per_optimizer_states[id(optimizer)][
            "found_inf_per_device"
        ] = self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)

        return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]

    def _found_inf_per_device(self, optimizer):
        return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
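
# Illustrative sketch (not part of the original module): checkpointing the scaler
# alongside model and optimizer state. `model`, `optimizer`, and the path are
# placeholders; `state_dict()`/`load_state_dict()` are the methods defined above.
# Per the docstrings, save after `update()` so the captured scale reflects the
# finished iteration.
def _example_checkpoint_scaler(model, optimizer, scaler, path="ckpt.pt"):
    # Save: the scaler state is a plain dict of floats/ints, safe to torch.save().
    torch.save(
        {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scaler": scaler.state_dict(),
        },
        path,
    )

    # Restore: load_state_dict() refreshes both the lazily-created scale tensor
    # (if it already exists) and the init values used on the first scale() call.
    ckpt = torch.load(path)
    model.load_state_dict(ckpt["model"])
    optimizer.load_state_dict(ckpt["optimizer"])
    scaler.load_state_dict(ckpt["scaler"])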

2819  torch_vacc/vacc/custom_ops.py  Normal file
File diff suppressed because it is too large
306  torch_vacc/vacc/custom_ops_cpu.py  Normal file
@@ -0,0 +1,306 @@
from typing import Tuple, Union, Optional, List

import torch
from torch.nn import functional as F


def split_last_two_dims_into_blocks(x, h, w):
    # Split the trailing (H, W) dims of `x` into non-overlapping (h, w) blocks,
    # producing shape leading_dims + (H // h, W // w, h, w).
    leading_dims = x.shape[:-2]
    H, W = x.shape[-2:]
    assert (
        H % h == 0 and W % w == 0
    ), "The last two dimensions must be divisible by block size."
    x_reshaped = x.view(-1, 1, H, W)

    unfolded = F.unfold(x_reshaped, kernel_size=(h, w), stride=(h, w))
    unfolded = unfolded.view(-1, 1, h, w, H // h, W // w)
    unfolded = unfolded.permute(0, 1, 4, 5, 2, 3)
    final_shape = leading_dims + (H // h, W // w, h, w)
    # reshape (not view): the permute above makes the tensor non-contiguous.
    result = unfolded.reshape(final_shape)

    return result


def merge_blocks_to_original_layout(x, h, w):
    # Inverse of split_last_two_dims_into_blocks: fold the trailing
    # (H // h, W // w, h, w) block dims back into a contiguous (H, W) layout.
    leading_dims = x.shape[:-4]
    H_div_h, W_div_w, h, w = x.shape[-4:]
    H = H_div_h * h
    W = W_div_w * w

    x_reshaped = x.view(-1, 1, H_div_h, W_div_w, h, w)
    x_reshaped = x_reshaped.permute(0, 1, 4, 5, 2, 3)
    # reshape (not view): the permute above makes the tensor non-contiguous.
    x_reshaped = x_reshaped.reshape(-1, h * w, H_div_h * W_div_w)
    folded = F.fold(x_reshaped, output_size=(H, W), kernel_size=(h, w), stride=(h, w))

    final_shape = leading_dims + (H, W)
    result = folded.view(final_shape)

    return result

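
# A minimal round-trip sketch (not part of the original module) showing what the
# two helpers above do: a 4x4 matrix split into 2x2 blocks and merged back.
def _example_block_roundtrip():
    x = torch.arange(16, dtype=torch.float32).reshape(4, 4)
    blocks = split_last_two_dims_into_blocks(x, 2, 2)
    # blocks.shape == (2, 2, 2, 2): blocks[i][j] is the (i, j)-th 2x2 tile,
    # e.g. blocks[0][0] == [[0., 1.], [4., 5.]].
    restored = merge_blocks_to_original_layout(blocks, 2, 2)
    assert torch.equal(restored, x)
    return blocks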

def w8a8_block_fp8_matmul(
    input: torch.Tensor,
    weight: torch.Tensor,
    input_scale: Optional[torch.Tensor],
    weight_scale: Optional[torch.Tensor],
    block_size: List[int],
    is_linear_weight: bool = False,
    output_opt: Optional[torch.Tensor] = None,
    **kwargs
):
    # CPU reference path: dequantize the block-quantized weight with its
    # per-block scales, then run a plain matmul in the input dtype.
    b0, b1 = block_size
    dim0, dim1 = weight.shape
    dim0pad, dim1pad = 0, 0

    # Pad the weight so both dims are divisible by the block size.
    if dim0 % b0 != 0:
        dim0pad = b0 - dim0 % b0
    if dim1 % b1 != 0:
        dim1pad = b1 - dim1 % b1

    dim0_origin, dim1_origin = dim0, dim1
    dim0 += dim0pad
    dim1 += dim1pad

    bs0, bs1 = dim0 // b0, dim1 // b1
    weight_dequant = torch.nn.functional.pad(weight, (0, dim1pad, 0, dim0pad), value=0)
    # Regroup into (bs0, bs1, b0*b1) blocks and multiply each block by its scale.
    weight_dequant = weight_dequant.cpu().view(bs0, b0, bs1, b1).permute(
        0, 2, 1, 3
    ).reshape(bs0, bs1, -1).float().to(input.device) * weight_scale.unsqueeze(-1)
    weight_dequant = (
        weight_dequant.reshape(bs0, bs1, b0, b1)
        .permute(0, 2, 1, 3)
        .reshape(dim0, dim1)
        .to(input.dtype)
    )
    # Drop the padding before the matmul.
    weight_dequant = weight_dequant[:dim0_origin, :dim1_origin]
    output = torch.matmul(
        input, weight_dequant.T if is_linear_weight else weight_dequant
    )
    if output_opt is not None:
        output = output_opt.copy_(output)
    return output


def w8a8_block_fp8_linear(
    input: torch.Tensor,
    weight: torch.Tensor,
    input_scale: Optional[torch.Tensor],
    weight_scale: Optional[torch.Tensor],
    block_size: List[int],
    **kwargs
):
    assert input_scale is None, "w8a8_block_fp8_matmul only supports quantized weights for now"
    return w8a8_block_fp8_matmul(
        input, weight, None, weight_scale, block_size, is_linear_weight=True
    )

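
# A small numeric sketch (not part of the original module): a 4x4 "quantized"
# weight with 2x2 blocks and per-block scales, pushed through the dequantize-
# then-matmul reference path above. Values are illustrative; on device the
# weight would be FP8, but the CPU path only relies on the scale layout.
def _example_w8a8_block_linear():
    inp = torch.randn(3, 4)      # (tokens, in_features)
    weight_q = torch.ones(4, 4)  # (out_features, in_features)
    # One scale per 2x2 block: block (i, j) is multiplied by weight_scale[i, j].
    weight_scale = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
    out = w8a8_block_fp8_linear(inp, weight_q, None, weight_scale, block_size=[2, 2])
    assert out.shape == (3, 4)
    return out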

def fused_experts(
    hidden_states: torch.Tensor,
    w13_weight: torch.Tensor,
    w2_weight: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    use_fp8_w8a8: bool = True,
    w13_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    a13_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
    block_shape: Optional[List[int]] = None,
    decode_with_batch: bool = False,
) -> torch.Tensor:
    batch_seq_all, hidden_dims = hidden_states.shape
    intermediate_size = w2_weight.shape[-1]
    num_experts = w13_weight.shape[0]
    w13_weight = w13_weight.contiguous()
    w2_weight = w2_weight.contiguous()
    w13_scale = w13_scale.contiguous()
    w2_scale = w2_scale.contiguous()

    final_hidden_states = torch.zeros_like(hidden_states)

    w1_scale = w13_scale

    _, bs0_w13, bs1_w13 = w1_scale.shape
    _, bs0_w2, bs1_w2 = w2_scale.shape

    sel_experts = topk_ids.shape[1]
    if hidden_states.shape[0] == 1:
        # Single-token decode path: visit only the selected experts.
        for slot in range(sel_experts):
            expert_idx = topk_ids[0][slot]
            expert_w1 = w13_weight[expert_idx].contiguous()
            expert_w2 = w2_weight[expert_idx].contiguous()
            ws1 = w1_scale[expert_idx].unsqueeze(2).contiguous()
            ws2 = w2_scale[expert_idx].unsqueeze(2).contiguous()

            dim0, dim1 = expert_w1.shape
            b0, b1 = dim0 // bs0_w13, dim1 // bs1_w13
            # assert (bs0, bs1, 1) == ws1.shape, f"bs0, bs1, 1 is {bs0},{bs1}, 1, <==> {ws1.shape}"
            expert_w1 = (
                expert_w1
                .view(bs0_w13, b0, bs1_w13, b1)
                .permute(0, 2, 1, 3)
                .reshape(bs0_w13, bs1_w13, -1)
                .float()
                .to(hidden_states.device)
                * ws1
            )
            expert_w1 = (
                expert_w1.reshape(bs0_w13, bs1_w13, b0, b1)
                .permute(0, 2, 1, 3)
                .reshape(dim0, dim1)
                .to(hidden_states.dtype)
            )

            dim0, dim1 = expert_w2.shape
            b0, b1 = dim0 // bs0_w2, dim1 // bs1_w2
            # assert (bs0, bs1, 1) == ws2.shape
            expert_w2 = (
                expert_w2
                .view(bs0_w2, b0, bs1_w2, b1)
                .permute(0, 2, 1, 3)
                .reshape(bs0_w2, bs1_w2, -1)
                .float()
                .to(hidden_states.device)
                * ws2
            )
            expert_w2 = (
                expert_w2.reshape(bs0_w2, bs1_w2, b0, b1)
                .permute(0, 2, 1, 3)
                .reshape(dim0, dim1)
                .to(hidden_states.dtype)
            )

            expert_weights = topk_weights[0][slot].to(hidden_states.dtype)

            x = hidden_states
            x = F.linear(x, expert_w1)
            gate = F.silu(x[:, :intermediate_size])
            x = x[:, intermediate_size:] * gate
            x = F.linear(x, expert_w2)

            current_hidden_states = x * expert_weights
            current_hidden_states = current_hidden_states.to(x.dtype)
            final_hidden_states += current_hidden_states
    else:
        # Batch path: visit every expert and gather the tokens routed to it.
        for expert_idx in range(num_experts):
            # topk_ids [tokens, experts] => sample: [10, 8]
            # expert_mask [tokens, experts] => sample: [10, 8]
            expert_mask = topk_ids == expert_idx

            idx = torch.where(expert_mask)[0]
            if idx.numel() == 0:
                continue

            expert_w1 = w13_weight[expert_idx].contiguous()
            expert_w2 = w2_weight[expert_idx].contiguous()
            ws1 = w1_scale[expert_idx].unsqueeze(2).contiguous()
            ws2 = w2_scale[expert_idx].unsqueeze(2).contiguous()

            dim0, dim1 = expert_w1.shape
            b0, b1 = dim0 // bs0_w13, dim1 // bs1_w13
            # assert (bs0, bs1, 1) == ws1.shape, f"bs0, bs1, 1 is {bs0},{bs1}, 1, <==> {ws1.shape}"
            expert_w1 = (
                expert_w1
                .view(bs0_w13, b0, bs1_w13, b1)
                .permute(0, 2, 1, 3)
                .reshape(bs0_w13, bs1_w13, -1)
                .float()
                .to(hidden_states.device)
                * ws1
            )
            expert_w1 = (
                expert_w1.reshape(bs0_w13, bs1_w13, b0, b1)
                .permute(0, 2, 1, 3)
                .reshape(dim0, dim1)
                .to(hidden_states.dtype)
            )

            dim0, dim1 = expert_w2.shape
            b0, b1 = dim0 // bs0_w2, dim1 // bs1_w2
            # assert (bs0, bs1, 1) == ws2.shape
            expert_w2 = (
                expert_w2
                .view(bs0_w2, b0, bs1_w2, b1)
                .permute(0, 2, 1, 3)
                .reshape(bs0_w2, bs1_w2, -1)
                .float()
                .to(hidden_states.device)
                * ws2
            )
            expert_w2 = (
                expert_w2.reshape(bs0_w2, bs1_w2, b0, b1)
                .permute(0, 2, 1, 3)
                .reshape(dim0, dim1)
                .to(hidden_states.dtype)
            )

            # [seq, experts]
            expert_weights = (
                topk_weights.masked_select(expert_mask)
                .unsqueeze(1)
                .to(hidden_states.dtype)
            )

            x = hidden_states[idx]
            x = F.linear(x, expert_w1)
            gate = F.silu(x[:, :intermediate_size])
            x = x[:, intermediate_size:] * gate
            x = F.linear(x, expert_w2)

            current_hidden_states = x * expert_weights
            current_hidden_states = current_hidden_states.to(x.dtype)
            # final_hidden_states[idx] += current_hidden_states
            final_hidden_states.index_add_(0, idx, current_hidden_states)

    final_hidden_states = final_hidden_states.reshape(batch_seq_all, hidden_dims)
    return final_hidden_states

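
# A minimal sketch (not part of the original module) of the routing pattern the
# batch path above uses: a boolean expert mask selects the rows routed to one
# expert, and index_add_ scatters the expert output back, so a token routed to
# the same expert twice accumulates both contributions.
def _example_moe_routing():
    topk_ids = torch.tensor([[0, 1], [1, 1], [0, 2]])  # 3 tokens, top-2 experts
    outputs = torch.zeros(3, 4)
    for expert_idx in range(3):
        mask = topk_ids == expert_idx  # (tokens, k) boolean mask
        idx = torch.where(mask)[0]     # token rows routed to this expert
        if idx.numel() == 0:
            continue
        expert_out = torch.ones(idx.numel(), 4) * expert_idx  # stand-in compute
        outputs.index_add_(0, idx, expert_out)
    # Token 1 picked expert 1 twice, so it accumulated 1 + 1 = 2 in each column.
    assert torch.equal(outputs[1], torch.full((4,), 2.0))
    return outputs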

def fused_mlp_mm_fp8(
    hidden_states: torch.Tensor,
    w13_weight: torch.Tensor,
    w2_weight: torch.Tensor,
    use_fp8_w8a8: bool = True,
    w13_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    a13_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
    block_shape_w13: Optional[List[int]] = None,
    block_shape_w2: Optional[List[int]] = None,
):
    def fp8_to_fp16(inp, scale, block_size, trans_type):
        # Dequantize a block-quantized tensor: cast, split into blocks,
        # multiply each block by its scale, and merge back.
        inp_t = inp.to(trans_type)
        inp_t = split_last_two_dims_into_blocks(inp_t, block_size[0], block_size[1])
        assert scale.size(0) == inp_t.size(-4)
        assert scale.size(1) == inp_t.size(-3)
        inp_t = inp_t * scale.unsqueeze(-1).unsqueeze(-1)
        inp_t = merge_blocks_to_original_layout(inp_t, block_size[0], block_size[1])
        return inp_t.to(trans_type)

    w13_weight = w13_weight.contiguous()
    w2_weight = w2_weight.contiguous()
    w13_scale = w13_scale.contiguous()
    w2_scale = w2_scale.contiguous()
    w13_fp = fp8_to_fp16(w13_weight, w13_scale, block_shape_w13, hidden_states.dtype)
    w2_fp = fp8_to_fp16(w2_weight, w2_scale, block_shape_w2, hidden_states.dtype)
    # Gated MLP: x @ W13 -> split into (gate, up) -> SiLU(gate) * up -> @ W2.
    out = hidden_states @ w13_fp
    out = torch.chunk(out, 2, dim=-1)
    out = F.silu(out[0]) * out[1]
    out = out @ w2_fp

    return out

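
# A tiny runnable sketch (not part of the original module): a gated-MLP pass
# through fused_mlp_mm_fp8 with fp32 stand-ins for the FP8 weights. Shapes:
# hidden (tokens, d), w13 (d, 2*inter) and w2 (inter, d), each with one scale
# per 2x2 block of the weight.
def _example_fused_mlp():
    hidden = torch.randn(2, 4)
    w13 = torch.randn(4, 8)
    w2 = torch.randn(4, 4)
    w13_scale = torch.ones(2, 4)  # (4/2, 8/2) blocks
    w2_scale = torch.ones(2, 2)   # (4/2, 4/2) blocks
    out = fused_mlp_mm_fp8(
        hidden, w13, w2,
        w13_scale=w13_scale, w2_scale=w2_scale,
        block_shape_w13=[2, 2], block_shape_w2=[2, 2],
    )
    assert out.shape == (2, 4)
    return out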

def mla_matmul_scale(input: torch.Tensor, weight: torch.Tensor, scale: float):
    output = torch.matmul(input, weight)
    output = output * scale
    output = output.to(input.dtype)
    return output


def mla_matmul(input: torch.Tensor, weight: torch.Tensor):
    output = torch.matmul(input, weight)
    output = output.to(input.dtype)
    return output

146  torch_vacc/vacc/custom_qwen3_ops.py  Normal file
@@ -0,0 +1,146 @@
from typing import List, Optional, Tuple, Union

import torch
from torch import Generator
from torch_vacc._vacc_libs import _torch_vacc


def fuse_moe_prefill_stage0_qwen(
    hidden_states,
    rms_residual,
    rms_weight,
    gate_weight,
    rms_hidden_state_opt: Optional[torch.Tensor] = None,
    zero_moe_hidden_state_opt: Optional[torch.Tensor] = None,
    topk_ids_opt: Optional[torch.Tensor] = None,
    topk_weight_opt: Optional[torch.Tensor] = None,
):
    return _torch_vacc.fuse_moe_prefill_stage0_qwen(
        hidden_states,
        rms_residual,
        rms_weight,
        gate_weight,
        rms_hidden_state_opt,
        zero_moe_hidden_state_opt,
        topk_ids_opt,
        topk_weight_opt,
    )


def fuse_moe_decode_qwen(
    hidden_states,
    rms_residual,
    rms_weight,
    moe_weight_13,
    moe_weight_2,
    moe_weight_13_dequat,
    moe_weight_2_dequant,
    gate_weight,
    block_size_13,
    block_size_2,
    world_size: int,
    rank: int,
    group_id: int,
    dev_info: Optional[List[int]] = None,
    output: Optional[torch.Tensor] = None,
):
    # Guard against both the default None and an explicitly empty list;
    # the original bare `len(dev_info)` check would raise a TypeError on None.
    if dev_info is None or len(dev_info) == 0:
        dev_info = [i | (i << 16) for i in range(world_size)]
    return _torch_vacc.fuse_moe_decode_qwen(
        hidden_states,
        rms_residual,
        rms_weight,
        moe_weight_13,
        moe_weight_2,
        moe_weight_13_dequat,
        moe_weight_2_dequant,
        gate_weight,
        block_size_13,
        block_size_2,
        world_size,
        rank,
        group_id,
        dev_info,
        output,
    )

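
# A quick sketch (not part of the original module) of the default dev_info
# packing above: each 32-bit entry carries the same index in its low and high
# 16 bits (`i | (i << 16)`). What the runtime reads from each half is not
# documented here, so treat the field meanings as an assumption.
def _example_default_dev_info(world_size=4):
    dev_info = [i | (i << 16) for i in range(world_size)]
    lows = [v & 0xFFFF for v in dev_info]           # low 16 bits -> i
    highs = [(v >> 16) & 0xFFFF for v in dev_info]  # high 16 bits -> i
    assert lows == highs == list(range(world_size))
    return dev_info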

def rot_pos_emb_qwenvl(grid_thw: List[List[int]],
                       hidden_size: int,
                       head_num: int,
                       spatial_merge_size: int,
                       dtype: torch.dtype,
                       device: Union[int, str, torch.device] = "vacc"):
    # assert out_tensor.device.type == "vacc", f"please target vacc device, now is {out_tensor.device}"
    if isinstance(device, str):
        device = torch.device(device)
    elif isinstance(device, int):
        device = torch.device("vacc", device)

    # Flatten [[t, h, w], ...] into a single [t0, h0, w0, t1, h1, w1, ...] list.
    thws = []
    for i in grid_thw:
        thws.extend(i)
    return _torch_vacc.rot_pos_emb_qwenvl(thws,
                                          hidden_size,
                                          head_num,
                                          spatial_merge_size,
                                          dtype,
                                          device)


def fast_pos_embed_interpolate_qwenvl(weight: torch.Tensor,
                                      grid_thw: List[List[int]],
                                      num_grid_per_side: int,
                                      spatial_merge_size: int,
                                      hidden_dim: int):
    # Flatten [[t, h, w], ...] the same way as rot_pos_emb_qwenvl above.
    thws = []
    for i in grid_thw:
        thws.extend(i)
    return _torch_vacc.fast_pos_embed_interpolate_qwenvl(weight,
                                                         thws,
                                                         num_grid_per_side,
                                                         spatial_merge_size,
                                                         hidden_dim)


# The image preprocessing op is the same for qwen2_vl and qwen3_vl.
def qwen2vl_img_preprocess(
    image: "torch.Tensor",
    do_resize: bool,
    min_pixels: int,
    max_pixels: int,
    do_rescale: bool,
    rescale_factor: float,
    do_normalize: bool,
    resized_height: int,
    resized_width: int,
    interpolation: int,  # Optional["F.InterpolationMode"]
    patch_size: int,
    temporal_patch_size: int,
    merge_size: int,
    image_mean0: float,
    image_mean1: float,
    image_mean2: float,
    image_std0: float,
    image_std1: float,
    image_std2: float,
    # batch_size: int = 1,
    # grid_t: int = 1,
    # channel: int = 3,
    # output: Optional[torch.Tensor] = None
):
    assert image.device.type == "vacc", f"please target vacc device, now is {image.device}"
    return _torch_vacc.qwen2vl_img_preprocess(
        image,
        do_resize,
        min_pixels,
        max_pixels,
        do_rescale,
        rescale_factor,
        do_normalize,
        resized_height,
        resized_width,
        interpolation,
        patch_size,
        temporal_patch_size,
        merge_size,
        image_mean0, image_mean1, image_mean2,
        image_std0, image_std1, image_std2
    )

107  torch_vacc/vacc/lazy_initialize.py  Normal file
@@ -0,0 +1,107 @@
import threading
import traceback
from typing import List

from .._vacc_libs import _torch_vacc

_initialized = False
_tls = threading.local()
_initialization_lock = threading.Lock()
_queued_calls = []

_is_in_bad_fork = getattr(_torch_vacc, "_vacc_in_bad_fork", lambda: False)


def is_initialized():
    r"""Returns whether PyTorch's VACC state has been initialized."""
    return _initialized and not _is_in_bad_fork()


class _LazySeedTracker:
    # Since seeding is memory-less, only track the latest seed.
    # Note: `manual_seed_all` followed by `manual_seed` overwrites
    # the seed on the current device. We track the order of the **latest**
    # calls between these two APIs.
    def __init__(self):
        self.manual_seed_all_cb = None
        self.manual_seed_cb = None
        self.call_order = []

    def queue_seed_all(self, cb, traceback):
        self.manual_seed_all_cb = (cb, traceback)
        # update seed_all to be latest
        self.call_order = [self.manual_seed_cb, self.manual_seed_all_cb]

    def queue_seed(self, cb, traceback):
        self.manual_seed_cb = (cb, traceback)
        # update seed to be latest
        self.call_order = [self.manual_seed_all_cb, self.manual_seed_cb]

    def get_calls(self) -> List:
        return self.call_order


_lazy_seed_tracker = _LazySeedTracker()


def _lazy_call(callable, **kwargs):
    if is_initialized():
        callable()
    else:
        # TODO(torch_deploy): this accesses linecache, which attempts to read the
        # file system to get traceback info. Patch linecache or do something
        # else here if this ends up being important.
        global _lazy_seed_tracker
        if kwargs.get("seed_all", False):
            _lazy_seed_tracker.queue_seed_all(callable, traceback.format_stack())
        elif kwargs.get("seed", False):
            _lazy_seed_tracker.queue_seed(callable, traceback.format_stack())
        else:
            # Don't store the actual traceback to avoid memory cycle
            _queued_calls.append((callable, traceback.format_stack()))


class DeferredVaccCallError(Exception):
    pass


def _lazy_init():
    """Initialize VACC device state."""

    global _initialized, _queued_calls
    if _initialized or hasattr(_tls, "is_initializing"):
        return
    with _initialization_lock:
        if _initialized:
            return

        # It is important to prevent other threads from entering _lazy_init
        # immediately, while we are still guaranteed to have the GIL, because some
        # of the C calls we make below will release the GIL
        if _is_in_bad_fork():
            raise RuntimeError(
                "Cannot re-initialize VACC in forked subprocess. To use VACC with "
                "multiprocessing, you must use the 'spawn' start method"
            )

        _torch_vacc._vacc_init()

        _tls.is_initializing = True

        for calls in _lazy_seed_tracker.get_calls():
            if calls:
                _queued_calls.append(calls)

        try:
            for queued_call, orig_traceback in _queued_calls:
                try:
                    queued_call()
                except Exception as e:
                    msg = (
                        f"VACC call failed lazily at initialization with error: {str(e)}\n\n"
                        f"VACC call was originally invoked at:\n\n{''.join(orig_traceback)}"
                    )
                    raise DeferredVaccCallError(msg) from e
        finally:
            delattr(_tls, "is_initializing")
        _initialized = True
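
# A minimal sketch (not part of the original module) of the deferral contract
# above: callables queued via _lazy_call before the device is initialized run
# exactly once inside _lazy_init, in queue order. Assumes the VACC runtime has
# not been initialized yet and is actually available when _lazy_init fires.
def _example_lazy_call_ordering():
    fired = []
    _lazy_call(lambda: fired.append("a"))
    _lazy_call(lambda: fired.append("b"))
    # Nothing runs yet; both callables (with their capture-site tracebacks)
    # sit in _queued_calls until something triggers _lazy_init().
    assert fired == []
    _lazy_init()  # requires a working VACC runtime; raises without one
    assert fired == ["a", "b"]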

535  torch_vacc/vacc/memory.py  Normal file
@@ -0,0 +1,535 @@
import collections
import contextlib
import warnings

from typing import Tuple

import torch
from torch._utils import _get_device_index

import torch_vacc

from torch_vacc._vacc_libs import _torch_vacc
from .lazy_initialize import is_initialized, _lazy_init

__all__ = [
    "mem_get_info",
    # "caching_allocator_alloc",
    # "caching_allocator_delete",
    "set_per_process_memory_fraction",
    "empty_cache",
    "memory_stats",
    "memory_stats_as_nested_dict",
    "reset_accumulated_memory_stats",
    "reset_peak_memory_stats",
    "reset_max_memory_allocated",
    "reset_max_memory_cached",
    "memory_allocated",
    "max_memory_allocated",
    "memory_reserved",
    "max_memory_reserved",
    "memory_cached",
    "max_memory_cached",
    "memory_snapshot",
    "memory_summary",
    "get_allocator_backend",
]


@contextlib.contextmanager
def _free_mutex():
    _torch_vacc._vacc_lock_mutex()
    try:
        yield
    finally:
        _torch_vacc._vacc_unlock_mutex()


# def caching_allocator_alloc(size, device=None, stream=None):
#     r"""Performs a memory allocation using the VACC memory allocator.

#     Memory is allocated for a given device and a stream; this
#     function is intended to be used for interoperability with other
#     frameworks. Allocated memory is released through
#     :func:`~torch_vacc.vacc.caching_allocator_delete`.

#     Arguments:
#         size (int): number of bytes to be allocated.
#         device (torch.device or int, optional): selected device. If it is
#             ``None`` the default VACC device is used.
#         stream (torch_vacc.vacc.Stream or int, optional): selected stream. If ``None``,
#             the default stream for the selected device is used.
#     """
#     if device is None:
#         device = torch_vacc.vacc.current_device()
#     device = _get_device_index(device)
#     if stream is None:
#         stream = torch_vacc.vacc.current_stream(device)
#     if isinstance(stream, torch_vacc.vacc.streams.Stream):
#         stream = stream.vacc_stream
#     if not isinstance(stream, int):
#         raise TypeError(
#             "Invalid type for stream argument, must be "
#             "`torch_vacc.vacc.Stream` or `int` representing a pointer "
#             "to an existing stream"
#         )
#     with torch_vacc.vacc.device(device):
#         return _torch_vacc._vacc_vaccCachingAllocator_raw_alloc(size, stream)


# def caching_allocator_delete(mem_ptr):
#     r"""Deletes memory allocated using the VACC memory allocator.

#     Memory allocated with :func:`~torch_vacc.vacc.caching_allocator_alloc`
#     is freed here. The associated device and stream are tracked inside
#     the allocator.

#     Arguments:
#         mem_ptr (int): memory address to be freed by the allocator.
#     """
#     _torch_vacc._vacc_vaccCachingAllocator_raw_delete(mem_ptr)


def set_per_process_memory_fraction(fraction, device=None) -> None:
    r"""Set memory fraction for a process.
    The fraction is used to limit the caching allocator's allocations on a VACC device.
    The allowed value equals the total visible memory multiplied by the fraction.
    Trying to allocate more than the allowed value in a process will raise an out of
    memory error in the allocator.
    Arguments:
        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
        device (torch.device or int, optional): selected device. If it is
            ``None`` the default VACC device is used.
    .. note::
        In general, the total available free memory is less than the total capacity.
    """
    _lazy_init()
    if device is None:
        device = torch_vacc.vacc.current_device()
    device = _get_device_index(device)
    if not isinstance(fraction, float):
        raise TypeError("Invalid type for fraction argument, must be `float`")
    if fraction < 0 or fraction > 1:
        raise ValueError(
            "Invalid fraction value: {}. Allowed range: 0~1".format(fraction)
        )

    _torch_vacc._vacc_setMemoryFraction(fraction, device)

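
# A short usage sketch (not part of the original module): cap this process at
# half of device 0's memory before building any large tensors. 0.5 is an
# arbitrary illustrative value.
#
#   import torch_vacc
#   torch_vacc.vacc.set_per_process_memory_fraction(0.5, 0)
#   # Allocations beyond total_memory * 0.5 now raise an out-of-memory error.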

def empty_cache():
    r"""Releases all unoccupied cached memory currently held by the caching
    allocator so that it can be used by other VACC applications and becomes visible in
    `nvidia-smi`.

    .. note::
        :func:`~torch_vacc.vacc.empty_cache` doesn't increase the amount of VACC
        memory available for PyTorch. However, it may help reduce fragmentation
        of VACC memory in certain cases.
    """
    if is_initialized():
        _torch_vacc._vacc_emptyCache()


def memory_stats(device=None):
    """Returns a dictionary of VACC memory allocator statistics for a
    given device.
    The return value of this function is a dictionary of statistics, each of
    which is a non-negative integer.
    Core statistics:
    - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      number of allocation requests received by the memory allocator.
    - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of allocated memory.
    - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      number of reserved segments from ``vaccMalloc()``.
    - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of reserved memory.
    - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      number of active memory blocks.
    - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of active memory.
    - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      number of inactive, non-releasable memory blocks.
    - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of inactive, non-releasable memory.
    For these core statistics, values are broken down as follows.
    Pool type:
    - ``all``: combined statistics across all memory pools.
    - ``large_pool``: statistics for the large allocation pool
      (as of October 2019, for size >= 1MB allocations).
    - ``small_pool``: statistics for the small allocation pool
      (as of October 2019, for size < 1MB allocations).
    Metric type:
    - ``current``: current value of this metric.
    - ``peak``: maximum value of this metric.
    - ``allocated``: historical total increase in this metric.
    - ``freed``: historical total decrease in this metric.
    In addition to the core statistics, we also provide some simple event
    counters:
    - ``"num_alloc_retries"``: number of failed ``vaccMalloc`` calls that
      result in a cache flush and retry.
    - ``"num_ooms"``: number of out-of-memory errors thrown.
    The caching allocator can be configured via ENV to not split blocks larger than a
    defined size (see the Memory Management section of the CUDA semantics documentation).
    This helps avoid memory fragmentation but may have a performance
    penalty. Additional outputs to assist with tuning and evaluating impact:
    - ``"max_split_size"``: blocks above this size will not be split.
    - ``"oversize_allocations.{current,peak,allocated,freed}"``:
      number of over-size allocation requests received by the memory allocator.
    - ``"oversize_segments.{current,peak,allocated,freed}"``:
      number of over-size reserved segments from ``vaccMalloc()``.
    Arguments:
        device (torch.device or int, optional): selected device. Returns
            statistics for the current device, given by :func:`~torch_vacc.vacc.current_device`,
            if :attr:`device` is ``None`` (default).
    """
    result = []

    def _recurse_add_to_result(prefix, obj):
        # Flatten the nested stats dict into dotted "a.b.c" keys.
        if isinstance(obj, dict):
            if len(prefix) > 0:
                prefix += "."
            for k, v in obj.items():
                _recurse_add_to_result(prefix + k, v)
        else:
            result.append((prefix, obj))

    stats = memory_stats_as_nested_dict(device=device)
    _recurse_add_to_result("", stats)
    result.sort()

    return collections.OrderedDict(result)

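
# A small sketch (not part of the original module) of reading the flattened
# stats above; key names follow the docstring's "<stat>.<pool>.<metric>" scheme.
#
#   stats = torch_vacc.vacc.memory_stats()          # flattened OrderedDict
#   print(stats["allocated_bytes.all.current"])     # bytes currently allocated
#   print(stats["allocated_bytes.all.peak"])        # high-water mark
#   nested = torch_vacc.vacc.memory_stats_as_nested_dict()
#   assert nested["allocated_bytes"]["all"]["peak"] == stats["allocated_bytes.all.peak"]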

def memory_stats_as_nested_dict(device=None):
    r"""Returns the result of :func:`~torch_vacc.vacc.memory_stats` as a nested dictionary."""
    device = _get_device_index(device, optional=True)
    return _torch_vacc._vacc_memoryStats(device)


def reset_accumulated_memory_stats(device=None):
    r"""Resets the "accumulated" (historical) stats tracked by the VACC memory allocator.

    See :func:`~torch_vacc.vacc.memory_stats` for details. Accumulated stats correspond to
    the `"allocated"` and `"freed"` keys in each individual stat dict, as well as
    `"num_alloc_retries"` and `"num_ooms"`.

    Arguments:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch_vacc.vacc.current_device`,
            if :attr:`device` is ``None`` (default).
    """
    device = _get_device_index(device, optional=True)
    return _torch_vacc._vacc_resetAccumulatedMemoryStats(device)


def reset_peak_memory_stats(device=None):
    r"""Resets the "peak" stats tracked by the VACC memory allocator.

    See :func:`~torch_vacc.vacc.memory_stats` for details. Peak stats correspond to the
    `"peak"` key in each individual stat dict.

    Arguments:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch_vacc.vacc.current_device`,
            if :attr:`device` is ``None`` (default).
    """
    device = _get_device_index(device, optional=True)
    return _torch_vacc._vacc_resetPeakMemoryStats(device)


def reset_max_memory_allocated(device=None):
    r"""Resets the starting point in tracking maximum VACC memory occupied by
    tensors for a given device.

    See :func:`~torch_vacc.vacc.max_memory_allocated` for details.

    Arguments:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch_vacc.vacc.current_device`,
            if :attr:`device` is ``None`` (default).

    .. warning::
        This function now calls :func:`~torch_vacc.vacc.reset_peak_memory_stats`, which resets
        /all/ peak memory stats.
    """
    # warnings.warn(
    #     "torch_vacc.vacc.reset_max_memory_allocated now calls torch_vacc.vacc.reset_peak_memory_stats, "
    #     "which resets /all/ peak memory stats.",
    #     DeprecationWarning,
    # )
    return reset_peak_memory_stats(device=device)


def reset_max_memory_cached(device=None):
    r"""Resets the starting point in tracking maximum VACC memory managed by the
    caching allocator for a given device.

    See :func:`~torch_vacc.vacc.max_memory_cached` for details.

    Arguments:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch_vacc.vacc.current_device`,
            if :attr:`device` is ``None`` (default).

    .. warning::
        This function now calls :func:`~torch_vacc.vacc.reset_peak_memory_stats`, which resets
        /all/ peak memory stats.
    """
    # warnings.warn(
    #     "torch_vacc.vacc.reset_max_memory_cached now calls torch_vacc.vacc.reset_peak_memory_stats, "
    #     "which resets /all/ peak memory stats.",
    #     DeprecationWarning,
    # )
    return reset_peak_memory_stats(device=device)

def memory_allocated(device=None):
|
||||||
|
r"""Returns the current VACC memory occupied by tensors in bytes for a given
|
||||||
|
device.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
device (torch.device or int, optional): selected device. Returns
|
||||||
|
statistic for the current device, given by :func:`~torch_vacc.vacc.current_device`,
|
||||||
|
if :attr:`device` is ``None`` (default).
|
||||||
|
"""
|
||||||
|
return memory_stats(device=device)["allocated_bytes.all.current"]


def max_memory_allocated(device=None):
    r"""Returns the maximum VACC memory occupied by tensors in bytes for a given
    device.

    By default, this returns the peak allocated memory since the beginning of
    this program. :func:`~torch_vacc.vacc.reset_peak_memory_stats` can be used to
    reset the starting point in tracking this metric. For example, these two
    functions can measure the peak allocated memory usage of each iteration in a
    training loop.

    Arguments:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch_vacc.vacc.current_device`,
            if :attr:`device` is ``None`` (default).
    """
    return memory_stats(device=device)["allocated_bytes.all.peak"]
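

# Illustrative sketch (not part of the original commit): the per-iteration
# peak measurement described in the docstring above. `model` and `batch` are
# assumed to be supplied by the caller and to live on a VACC device.
def _example_iteration_peak(model, batch):
    reset_peak_memory_stats()
    model(batch)                   # one forward pass (or a full train step)
    return max_memory_allocated()  # peak bytes allocated during this step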


def memory_reserved(device=None):
    r"""Returns the current VACC memory managed by the caching allocator in bytes
    for a given device.

    Arguments:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch_vacc.vacc.current_device`,
            if :attr:`device` is ``None`` (default).
    """
    return memory_stats(device=device)["reserved_bytes.all.current"]


def max_memory_reserved(device=None):
    r"""Returns the maximum VACC memory managed by the caching allocator in bytes
    for a given device.

    By default, this returns the peak cached memory since the beginning of this
    program. :func:`~torch_vacc.vacc.reset_peak_memory_stats` can be used to reset
    the starting point in tracking this metric. For example, these two functions
    can measure the peak cached memory amount of each iteration in a training
    loop.

    Arguments:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch_vacc.vacc.current_device`,
            if :attr:`device` is ``None`` (default).
    """
    return memory_stats(device=device)["reserved_bytes.all.peak"]


def memory_cached(device=None):
    r"""Deprecated; see :func:`~torch_vacc.vacc.memory_reserved`."""
    # warnings.warn(
    #     "torch_vacc.vacc.memory_cached has been renamed to torch_vacc.vacc.memory_reserved",
    #     DeprecationWarning,
    # )
    return memory_reserved(device=device)


def max_memory_cached(device=None):
    r"""Deprecated; see :func:`~torch_vacc.vacc.max_memory_reserved`."""
    # warnings.warn(
    #     "torch_vacc.vacc.max_memory_cached has been renamed to torch_vacc.vacc.max_memory_reserved",
    #     DeprecationWarning,
    # )
    return max_memory_reserved(device=device)


def memory_snapshot():
    r"""Returns a snapshot of the VACC memory allocator state across all devices.

    Interpreting the output of this function requires familiarity with the
    memory allocator internals.
    """
    return _torch_vacc._vacc_memorySnapshot()


def _format_size(sz, pref_sz):
    # Pick a unit based on `pref_sz` (the value that drives the scale) and
    # scale `sz` to it; switches to the next unit once pref_sz reaches 768 KB.
    prefixes = ["B ", "KB", "MB", "GB", "TB", "PB"]
    prefix = prefixes[0]
    for new_prefix in prefixes[1:]:
        if pref_sz < 768 * 1024:
            break
        prefix = new_prefix
        sz //= 1024
        pref_sz /= 1024
    return "{:7d} {}".format(sz, prefix)


def _format_count(cnt, pref_cnt):
    # Same idea as _format_size, but with decimal K/M prefixes for counts.
    prefixes = [" ", "K", "M"]
    prefix = prefixes[0]
    for new_prefix in prefixes[1:]:
        if pref_cnt < 750 * 1000:
            break
        prefix = new_prefix
        cnt //= 1000
        pref_cnt /= 1000
    return "{:7d} {} ".format(cnt, prefix)
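

# For example (illustrative note, not in the original commit): 123456789 bytes
# crosses the 768 KB switch-over once, so it is rendered in KB; _format_count
# behaves the same way with decimal K/M prefixes.
#
#     _format_size(123456789, 123456789)  ->  ' 120563 KB'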


def create_metrics_to_display():
    metrics_to_display = [
        ("allocated_bytes", "Allocated memory", _format_size),
        ("active_bytes", "Active memory", _format_size),
        ("reserved_bytes", "VACC reserved memory", _format_size),
        ("inactive_split_bytes", "Non-releasable memory", _format_size),
        ("allocation", "Allocations", _format_count),
        ("active", "Active allocs", _format_count),
        ("segment", "VACC reserved segments", _format_count),
        ("inactive_split", "Non-releasable allocs", _format_count),
    ]

    lines = []
    lines.append("=" * 75)
    lines.append(" {_:16} PyTorch VACC memory summary, device ID {device:<18d} ")
    lines.append("-" * 75)
    lines.append(
        " {_:9} VACC OOMs: {num_ooms:<13d} | {_:6} vaccMalloc retries: {num_alloc_retries:<9d} "
    )
    lines.append("=" * 75)
    lines.append(
        " Metric                | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  "
    )
    return metrics_to_display, lines


def memory_summary(device=None, abbreviated=False):
    r"""Returns a human-readable printout of the current memory allocator
    statistics for a given device.

    This can be useful to display periodically during training, or when
    handling out-of-memory exceptions.

    Arguments:
        device (torch.device or int, optional): selected device. Returns
            printout for the current device, given by :func:`~torch_vacc.vacc.current_device`,
            if :attr:`device` is ``None`` (default).
        abbreviated (bool, optional): whether to return an abbreviated summary
            (default: False).
    """
    device = _get_device_index(device, optional=True)
    stats = memory_stats(device=device)
    metrics_to_display, lines = create_metrics_to_display()

    for metric_key, metric_name, formatter in metrics_to_display:
        lines.append("-" * 75)
        submetrics = [("all", metric_name)]
        if not abbreviated:
            submetrics.append(("large_pool", " from large pool"))
            submetrics.append(("small_pool", " from small pool"))

        current_prefval, peak_prefval, allocated_prefval, freed_prefval = (
            None,
            None,
            None,
            None,
        )

        for submetric_key, submetric_name in submetrics:
            prefix = metric_key + "." + submetric_key + "."

            current = stats[prefix + "current"]
            peak = stats[prefix + "peak"]
            allocated = stats[prefix + "allocated"]
            freed = stats[prefix + "freed"]

            if current_prefval is None:
                current_prefval = current
                peak_prefval = peak
                allocated_prefval = allocated
                freed_prefval = freed

            lines.append(
                " {:<21} | {} | {} | {} | {} ".format(
                    submetric_name,
                    formatter(current, current_prefval),
                    formatter(peak, peak_prefval),
                    formatter(allocated, allocated_prefval),
                    formatter(freed, freed_prefval),
                ),
            )

    metrics_to_display = [
        ("oversize_allocations", "Oversize allocations", _format_count),
        ("oversize_segments", "Oversize VACC segments", _format_count),
    ]

    for metric_key, metric_name, formatter in metrics_to_display:
        lines.append("-" * 75)

        prefix = metric_key + "."

        current = stats[prefix + "current"]
        peak = stats[prefix + "peak"]
        allocated = stats[prefix + "allocated"]
        freed = stats[prefix + "freed"]

        lines.append(
            " {:<21} | {} | {} | {} | {} ".format(
                metric_name,
                formatter(current, current),
                formatter(peak, peak),
                formatter(allocated, allocated),
                formatter(freed, freed),
            ),
        )

    lines.append("=" * 75)

    fmt_dict = {"_": "", "device": device}
    for k, v in stats.items():
        fmt_dict[k.replace(".", "-")] = v
    return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n"
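

# Illustrative sketch (not part of the original commit): dumping the summary
# when an out-of-memory error is raised, as suggested by the docstring above.
# The exception type is an assumption; match whatever your build raises.
def _example_oom_report(step_fn):
    try:
        step_fn()
    except RuntimeError:  # assumed to cover VACC OOM errors
        print(memory_summary(abbreviated=True))
        raise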


def mem_get_info(device=None) -> Tuple[int, int]:
    r"""Returns the global free and total VACC memory for a given
    device using vaccrtMemGetInfo.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch_vacc.vacc.current_device`,
            if :attr:`device` is ``None`` (default).
    """
    _lazy_init()
    if device is None:
        device = torch_vacc.vacc.current_device()
    device = _get_device_index(device)
    return _torch_vacc._vacc_getDeviceMemories(device)
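

# Illustrative sketch (not part of the original commit): per the docstring the
# first element is free bytes and the second total bytes, so utilization
# falls out directly.
def _example_memory_utilization(device=None):
    free, total = mem_get_info(device)
    return 1.0 - free / total  # fraction of device memory currently in use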


def get_allocator_backend() -> str:
    r"""Returns a string describing the active allocator backend as set by
    ``PYTORCH_VACC_ALLOC_CONF``. Currently available backends are
    ``native`` (PyTorch's native caching allocator).
    """
    return _torch_vacc._vacc_getAllocatorBackend()
179
torch_vacc/vacc/random.py
Normal file
@@ -0,0 +1,179 @@
from typing import Union, List, Iterable

import torch
from torch import Tensor

from . import _lazy_call, _lazy_init, current_device, device_count


__all__ = [
    "get_rng_state",
    "get_rng_state_all",
    "set_rng_state",
    "set_rng_state_all",
    "manual_seed",
    "manual_seed_all",
    "seed",
    "seed_all",
    "initial_seed",
]

# Random Number Generator related functions (https://pytorch.org/docs/stable/cuda.html#random-number-generator)


def get_rng_state(device: Union[int, str, torch.device] = "vacc") -> Tensor:
    r"""Returns the random number generator state of the specified GPU as a ByteTensor.

    Args:
        device (torch.device or int, optional): The device to return the RNG state of.
            Default: ``'vacc'`` (i.e., ``torch.device('vacc')``, the current VACC device).

    .. warning::
        This function eagerly initializes VACC.
    """
    _lazy_init()
    if isinstance(device, str):
        device = torch.device(device)
    elif isinstance(device, int):
        device = torch.device("vacc", device)
    idx = device.index
    if idx is None:
        idx = current_device()
    default_generator = torch.vacc.default_generators[idx]
    return default_generator.get_state()


def get_rng_state_all() -> List[Tensor]:
    r"""Returns a list of ByteTensor representing the random number states of all devices."""

    results = []
    for i in range(device_count()):
        results.append(get_rng_state(i))
    return results


def set_rng_state(
    new_state: Tensor, device: Union[int, str, torch.device] = "vacc"
) -> None:
    r"""Sets the random number generator state of the specified GPU.

    Args:
        new_state (torch.ByteTensor): The desired state
        device (torch.device or int, optional): The device to set the RNG state.
            Default: ``'vacc'`` (i.e., ``torch.device('vacc')``, the current VACC device).
    """
    with torch._C._DisableFuncTorch():
        new_state_copy = new_state.clone(memory_format=torch.contiguous_format)
    if isinstance(device, str):
        device = torch.device(device)
    elif isinstance(device, int):
        device = torch.device("vacc", device)

    def cb():
        idx = device.index
        if idx is None:
            idx = current_device()
        default_generator = torch.vacc.default_generators[idx]
        default_generator.set_state(new_state_copy)

    _lazy_call(cb)


def set_rng_state_all(new_states: Iterable[Tensor]) -> None:
    r"""Sets the random number generator state of all devices.

    Args:
        new_states (Iterable of torch.ByteTensor): The desired state for each device"""
    for i, state in enumerate(new_states):
        set_rng_state(state, i)
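

# Illustrative sketch (not part of the original commit): round-tripping RNG
# state through a checkpoint dict, the usual way these two helpers are paired.
def _example_rng_checkpoint():
    ckpt = {"vacc_rng_state_all": get_rng_state_all()}  # save
    set_rng_state_all(ckpt["vacc_rng_state_all"])       # restore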


def manual_seed(seed: int) -> None:
    r"""Sets the seed for generating random numbers for the current GPU.
    It's safe to call this function if VACC is not available; in that
    case, it is silently ignored.

    Args:
        seed (int): The desired seed.

    .. warning::
        If you are working with a multi-GPU model, this function is insufficient
        to get determinism. To seed all GPUs, use :func:`manual_seed_all`.
    """
    seed = int(seed)

    def cb():
        idx = current_device()
        default_generator = torch.vacc.default_generators[idx]
        default_generator.manual_seed(seed)

    _lazy_call(cb, seed=True)


def manual_seed_all(seed: int) -> None:
    r"""Sets the seed for generating random numbers on all GPUs.
    It's safe to call this function if VACC is not available; in that
    case, it is silently ignored.

    Args:
        seed (int): The desired seed.
    """
    seed = int(seed)

    def cb():
        for i in range(device_count()):
            default_generator = torch.vacc.default_generators[i]
            default_generator.manual_seed(seed)

    _lazy_call(cb, seed_all=True)


def seed() -> None:
    r"""Sets the seed for generating random numbers to a random number for the current GPU.
    It's safe to call this function if VACC is not available; in that
    case, it is silently ignored.

    .. warning::
        If you are working with a multi-GPU model, this function will only initialize
        the seed on one GPU. To initialize all GPUs, use :func:`seed_all`.
    """

    def cb():
        idx = current_device()
        default_generator = torch.vacc.default_generators[idx]
        default_generator.seed()

    _lazy_call(cb)


def seed_all() -> None:
    r"""Sets the seed for generating random numbers to a random number on all GPUs.
    It's safe to call this function if VACC is not available; in that
    case, it is silently ignored.
    """

    def cb():
        # Seed device 0 randomly, then reuse that seed on every other device so
        # all devices end up sharing one randomly chosen seed.
        random_seed = 0
        seeded = False
        for i in range(device_count()):
            default_generator = torch.vacc.default_generators[i]
            if not seeded:
                default_generator.seed()
                random_seed = default_generator.initial_seed()
                seeded = True
            else:
                default_generator.manual_seed(random_seed)

    _lazy_call(cb)


def initial_seed() -> int:
    r"""Returns the current random seed of the current GPU.

    .. warning::
        This function eagerly initializes VACC.
    """
    _lazy_init()
    idx = current_device()
    default_generator = torch.vacc.default_generators[idx]
    return default_generator.initial_seed()
327
torch_vacc/vacc/streams.py
Normal file
@@ -0,0 +1,327 @@
import ctypes
from typing import Any, Optional

import torch
from packaging import version
from torch._utils import _get_device_index

try:
    from torch._streambase import _StreamBase, _EventBase
except ImportError:
    # torch <= 2.1
    _StreamBase = _EventBase = object

import torch_vacc

from torch_vacc._vacc_libs import _torch_vacc
from ._device import device
from .lazy_initialize import _lazy_init


# remove torch version arch-suffix (i.e. +cpu)
torch_version = torch.__version__.split('+')[0]


class _StreamCommon:
    """Wrapper around a VACC stream.

    A VACC stream is a linear sequence of execution that belongs to a specific
    device, independent from other streams.

    Args:
        device(torch.device or int, optional): a device on which to allocate
            the stream. If :attr:`device` is ``None`` (default) or a negative
            integer, this will use the current device.
        priority(int, optional): priority of the stream. Can be either
            -1 (high priority) or 0 (low priority). By default, streams have
            priority 0.
    """

    def __new__(cls, device=None, priority=0, **kwargs):
        if device is None or ("stream_id" in kwargs and "device_index" in kwargs):
            return super(Stream, cls).__new__(cls, priority=priority, **kwargs)
        else:
            with torch_vacc.vacc.device(device):
                return super(Stream, cls).__new__(cls, priority=priority, **kwargs)

    def wait_event(self, event):
        event.wait(self)

    def record_event(self, event=None):
        """Records an event.

        Args:
            event (torch_vacc.Event, optional): event to record. If not given, a new one
                will be allocated.

        Returns:
            Recorded event.
        """
        if event is None:
            event = Event()
        event.record(self)
        return event

    def wait_stream(self, stream):
        """Synchronizes with another stream.

        All future work submitted to this stream will wait until all kernels
        submitted to a given stream at the time of call complete.

        Args:
            stream (Stream): a stream to synchronize.
        """
        self.wait_event(stream.record_event())

    def query(self):
        return super().query()

    def synchronize(self):
        super().synchronize()

    @property
    def _as_parameter_(self):
        return ctypes.c_void_p(self.vacc_stream)

    def __eq__(self, o):
        if isinstance(o, Stream):
            return super().__eq__(o)
        return False

    def __hash__(self):
        return hash((self.vacc_stream, self.device))

    def __repr__(self):
        return f"torch_vacc.vacc.Stream device={self.device} vacc_stream={self.vacc_stream:#x}"


if version.parse(torch_version) <= version.parse("2.1"):
    # torch <= 2.1
    class Stream(_torch_vacc._VACCStreamBase, _StreamCommon):
        pass
elif version.parse(torch_version) < version.parse("2.6"):
    # torch < 2.6
    class Stream(_torch_vacc._VACCStreamBase, _StreamBase, _StreamCommon):
        pass
else:
    # torch >= 2.6
    class Stream(_torch_vacc._VACCStreamBase, _StreamCommon):
        pass
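

# Illustrative sketch (not part of the original commit): wait_stream is just
# record_event + wait_event, so the same ordering can be expressed manually.
def _example_manual_ordering(producer: "Stream", consumer: "Stream"):
    ev = producer.record_event()  # marks the current tail of `producer`
    consumer.wait_event(ev)       # work queued on `consumer` now waits for it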


class _EventCommon:
    """Wrapper around a VACC event.

    VACC events are synchronization markers that can be used to monitor the
    device's progress, to accurately measure timing, and to synchronize VACC
    streams.

    The underlying VACC events are lazily initialized when the event is first
    recorded or exported to another process. After creation, only streams on the
    same device may record the event. However, streams on any device can wait on
    the event.

    Args:
        enable_timing (bool, optional): indicates if the event should measure time
            (default: ``False``)
        blocking (bool, optional): if ``True``, :meth:`wait` will be blocking (default: ``False``)
    """

    def __new__(cls, enable_timing=False, blocking=False):
        return super(Event, cls).__new__(
            cls,
            calc_time=enable_timing,
            blocking=blocking,
        )

    def record(self, stream=None):
        """Records the event in a given stream.

        Uses ``torch_vacc.vacc.current_stream()`` if no stream is specified. The
        stream's device must match the event's device."""
        if stream is None:
            stream = torch_vacc.vacc.current_stream()
        super().record(stream)

    def wait(self, stream=None):
        """Makes all future work submitted to the given stream wait for this
        event.

        Uses ``torch_vacc.vacc.current_stream()`` if no stream is specified.

        .. note:: This is a wrapper around ``vaccrtStreamWaitEvent()``
        """
        if stream is None:
            stream = torch_vacc.vacc.current_stream()
        super().wait(stream)

    def query(self):
        """Checks if all work currently captured by event has completed.

        Returns:
            A boolean indicating if all work currently captured by event has
            completed.
        """
        return super().query()

    def elapsed_time(self, end_event):
        """Returns the time elapsed in milliseconds after the event was
        recorded and before the end_event was recorded.
        """
        return super().elapsed_time(end_event)

    def synchronize(self):
        r"""Waits for the event to complete.

        Waits until the completion of all work currently captured in this event.
        This prevents the CPU thread from proceeding until the event completes.

        .. note:: This is a wrapper around ``vaccEventSynchronize()``.
        """
        super().synchronize()

    @property
    def _as_parameter_(self):
        return ctypes.c_void_p(self.vacc_event)

    def __repr__(self):
        if self.vacc_event:
            return f"<torch_vacc.vacc.Event {self._as_parameter_.value:#x}>"
        else:
            return "<torch_vacc.vacc.Event uninitialized>"


if version.parse(torch_version) <= version.parse("2.1"):
    # torch <= 2.1
    class Event(_torch_vacc._VACCEventBase, _EventCommon):
        pass
elif version.parse(torch_version) < version.parse("2.6"):
    # torch < 2.6
    class Event(_torch_vacc._VACCEventBase, _EventBase, _EventCommon):
        pass
else:
    # torch >= 2.6
    class Event(_torch_vacc._VACCEventBase, _EventCommon):
        pass
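

# Illustrative sketch (not part of the original commit): timing a region of
# device work with a pair of timing-enabled events, mirroring the CUDA idiom.
def _example_time_region(work_fn):
    start, end = Event(enable_timing=True), Event(enable_timing=True)
    start.record()
    work_fn()           # kernels queued on the current stream
    end.record()
    end.synchronize()   # make sure `end` has actually completed
    return start.elapsed_time(end)  # milliseconds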


class StreamContext:
    r"""Context-manager that selects a given stream.

    All VACC kernels queued within its context will be enqueued on a selected
    stream.

    Args:
        stream (stream): selected stream. This manager is a no-op if it's
            ``None``.
    .. note:: Streams are per-device.
    """
    cur_stream: Optional["torch_vacc.vacc.Stream"]

    def __init__(self, stream: Optional["torch_vacc.vacc.Stream"]):
        self.stream = stream
        self.idx = _get_device_index(None, True)
        if not torch.jit.is_scripting():
            if self.idx is None:
                self.idx = -1

        self.src_prev_stream = (
            None
            if not torch.jit.is_scripting()
            else torch_vacc.vacc.default_stream(None)
        )
        self.dst_prev_stream = (
            None
            if not torch.jit.is_scripting()
            else torch_vacc.vacc.default_stream(None)
        )

    def __enter__(self):
        # Local cur_stream variable for type refinement
        cur_stream = self.stream
        # Return if stream is None or VACC device not available
        if cur_stream is None or self.idx == -1:
            return
        self.src_prev_stream = torch_vacc.vacc.current_stream(None)

        # If the stream is not on the current device, then
        # set the current stream on the device
        if self.src_prev_stream.device != cur_stream.device:
            with device(cur_stream.device):
                self.dst_prev_stream = torch_vacc.vacc.current_stream(cur_stream.device)
        torch_vacc.vacc.set_stream(cur_stream)

    def __exit__(self, type: Any, value: Any, traceback: Any):
        # Local cur_stream variable for type refinement
        cur_stream = self.stream
        # If stream is None or no VACC device available, return
        if cur_stream is None or self.idx == -1:
            return

        # Reset the stream on the original device
        # and destination device
        if self.src_prev_stream.device != cur_stream.device:  # type: ignore[union-attr]
            torch_vacc.vacc.set_stream(self.dst_prev_stream)  # type: ignore[arg-type]
        torch_vacc.vacc.set_stream(self.src_prev_stream)  # type: ignore[arg-type]


def stream(stream: Optional["torch_vacc.vacc.Stream"]) -> StreamContext:
    r"""Wrapper around the Context-manager StreamContext that
    selects a given stream.

    Arguments:
        stream (Stream): selected stream. This manager is a no-op if it's
            ``None``.
    """
    return StreamContext(stream)
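

# Illustrative sketch (not part of the original commit): queueing work on a
# side stream via the context manager, then re-joining the default stream.
def _example_side_stream(work_fn):
    side = Stream()
    side.wait_stream(torch_vacc.vacc.current_stream())  # order after pending work
    with stream(side):
        work_fn()  # kernels issued here land on `side`
    torch_vacc.vacc.current_stream().wait_stream(side)  # re-join before reuse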


def set_stream(stream: Stream):
    r"""Sets the current stream. This is a wrapper API to set the stream.
    Usage of this function is discouraged in favor of the ``stream``
    context manager.

    Args:
        stream (Stream): selected stream. This function is a no-op
            if this argument is ``None``.
    """
    if stream is None:
        return
    _torch_vacc._vacc_setStream(
        stream_id=stream.stream_id,
        device_index=stream.device_index,
        device_type=stream.device_type,
    )


def current_stream(device=None) -> Stream:
    r"""Returns the currently selected :class:`Stream` for a given device.

    Args:
        device (torch.device or int, optional): selected device. Returns
            the currently selected :class:`Stream` for the current device, given
            by :func:`~torch_vacc.vacc.current_device`, if :attr:`device` is ``None``
            (default).
    """
    _lazy_init()
    streamdata = _torch_vacc._vacc_getCurrentStream(
        _get_device_index(device, optional=True)
    )
    return Stream(
        stream_id=streamdata[0], device_index=streamdata[1], device_type=streamdata[2]
    )


def default_stream(device=None) -> Stream:
    r"""Returns the default :class:`Stream` for a given device.

    Args:
        device (torch.device or int, optional): selected device. Returns
            the default :class:`Stream` for the current device, given by
            :func:`~torch_vacc.vacc.current_device`, if :attr:`device` is ``None``
            (default).
    """
    _lazy_init()
    streamdata = _torch_vacc._vacc_getDefaultStream(
        _get_device_index(device, optional=True)
    )

    return Stream(
        stream_id=streamdata[0], device_index=streamdata[1], device_type=streamdata[2]
    )
2
torch_vacc/version.py
Normal file
@@ -0,0 +1,2 @@
__all__ = ['__version__']
__version__ = '1.3.3.777'
269
torch_vacc/vslog.cfg
Normal file
@@ -0,0 +1,269 @@
hot_update: true

- channel: 0
  sync: sync
  priority: error
  category: 0
  category_extend: 0
  -device: 0
    disable: false
    out_type: file
    priority: error
    category: 0
    category_extend: 0
    path: "./log/"
    file: "$PNAME-$YEAR_$MON_$DAY_$HOUR_$MIN_$SEC_$PID"
    rollback: 5
    limit_size: 50 m #only support M byte
  -device: 1
    disable: false
    out_type: screen
    category: 0
    category_extend: 0
- channel: 1
  sync: sync
  priority: error
  category: 0
  category_extend: 0
  -device: 0
    disable: false
    out_type: file
    priority: error
    category: 0
    category_extend: 0
    path: "./log/"
    file: "vacm-$YEAR_$MON_$DAY_$HOUR_$MIN_$SEC_$PID"
    rollback: 5
    limit_size: 50 m #only support M byte
  -device: 1
    disable: false
    out_type: screen
    category: 0
    category_extend: 0
- channel: 2
  sync: sync
  priority: error
  category: 0
  category_extend: 0
  -device: 0
    disable: false
    out_type: file
    priority: error
    category: 0
    category_extend: 0
    path: "./log/"
    file: "vace-$YEAR_$MON_$DAY_$HOUR_$MIN_$SEC_$PID"
    rollback: 5
    limit_size: 50 m #only support M byte
  -device: 1
    disable: false
    out_type: screen
    category: 0
    category_extend: 0
- channel: 3
  sync: sync
  priority: error
  category: 0
  category_extend: 0
  -device: 0
    disable: false
    out_type: file
    priority: error
    category: 0
    category_extend: 0
    path: "./log/"
    file: "vacl-$YEAR_$MON_$DAY_$HOUR_$MIN_$SEC_$PID"
    rollback: 5
    limit_size: 50 m #only support M byte
  -device: 1
    disable: false
    out_type: screen
    category: 0
    category_extend: 0
- channel: 4
  sync: sync
  priority: error
  category: 0
  category_extend: 0
  -device: 0
    disable: false
    out_type: file
    priority: error
    category: 0
    category_extend: 0
    path: "./log/"
    file: "vame-$YEAR_$MON_$DAY_$HOUR_$MIN_$SEC_$PID"
    rollback: 5
    limit_size: 50 m #only support M byte
  -device: 1
    disable: false
    out_type: screen
    category: 0
    category_extend: 0
- channel: 5
  sync: sync
  priority: error
  category: 0
  category_extend: 0
  -device: 0
    disable: false
    out_type: file
    priority: error
    category: 0
    category_extend: 0
    path: "./log/"
    file: "vaml-$YEAR_$MON_$DAY_$HOUR_$MIN_$SEC_$PID"
    rollback: 5
    limit_size: 50 m #only support M byte
  -device: 1
    disable: false
    out_type: screen
    category: 0
    category_extend: 0
- channel: 6
  sync: sync
  priority: error
  category: 0
  category_extend: 0
  append_cr: true
  -device: 0
    disable: false
    out_type: file
    priority: error
    category: 0
    category_extend: 0
    path: "./log/"
    file: "rt-$YEAR_$MON_$DAY_$HOUR_$MIN_$SEC_$PID"
    rollback: 5
    limit_size: 50 m #only support M byte
  -device: 1
    disable: true
    out_type: screen
    category: 0
    category_extend: 0
- channel: 7
  sync: sync
  priority: error
  category: 0
  category_extend: 0
  -device: 0
    disable: false
    out_type: file
    priority: error
    category: 0
    category_extend: 0
    path: "./log/"
    file: "nn-$YEAR_$MON_$DAY_$HOUR_$MIN_$SEC_$PID"
    rollback: 5
    limit_size: 50 m #only support M byte
  -device: 1
    disable: true
    out_type: screen
    category: 0
    category_extend: 0
- channel: 8
  sync: sync
  priority: error
  category: 0
  category_extend: 0
  -device: 0
    disable: false
    out_type: file
    priority: error
    category: 0
    category_extend: 0
    path: "./log/"
    file: "tm-$YEAR_$MON_$DAY_$HOUR_$MIN_$SEC_$PID"
    rollback: 5
    limit_size: 50 m #only support M byte
  -device: 1
    disable: false
    out_type: screen
    category: 0
    category_extend: 0
- channel: 9
  sync: sync
  priority: error
  category: 0
  category_extend: 0
  append_cr: true
  no_prefix: true
  -device: 0
    disable: false
    out_type: file
    priority: error
    category: 0
    category_extend: 0
    path: "./log/"
    file: "md-$YEAR_$MON_$DAY_$HOUR_$MIN_$SEC_$PID"
    rollback: 5
    limit_size: 50 m #only support M byte
  -device: 1
    disable: true
    out_type: screen
    category: 0
    category_extend: 0
- channel: 10
  sync: sync
  priority: error
  category: 0
  category_extend: 0
  append_cr: false
  no_prefix: true
  -device: 0
    disable: false
    out_type: file
    priority: error
    category: 0
    category_extend: 0
    path: "./log/"
    file: "rs-$YEAR_$MON_$DAY_$HOUR_$MIN_$SEC_$PID"
    rollback: 5
    limit_size: 50 m #only support M byte
  -device: 1
    disable: false
    out_type: screen
    category: 0
    category_extend: 0
- channel: 11
  sync: sync
  priority: error
  category: 0
  category_extend: 0
  append_cr: false
  no_prefix: true
  -device: 0
    disable: false
    out_type: file
    priority: error
    category: 0
    category_extend: 0
    path: "./log/"
    file: "vaapi-$YEAR_$MON_$DAY_$HOUR_$MIN_$SEC_$PID"
    rollback: 5
    limit_size: 50 m #only support M byte
  -device: 1
    disable: false
    out_type: screen
    category: 0
    category_extend: 0
- channel: 12
  sync: sync
  priority: error
  category: 0
  category_extend: 0
  -device: 0
    disable: false
    out_type: file
    priority: error
    category: 0
    category_extend: 0
    path: "./log/"
    file: "vccl-$YEAR_$MON_$DAY_$HOUR_$MIN_$SEC_$PID"
    rollback: 5
    limit_size: 50 m #only support M byte
  -device: 1
    disable: true
    out_type: screen
    category: 0
    category_extend: 0
31
vacc_tools/__init__.py
Normal file
@@ -0,0 +1,31 @@
from functools import partial
from datetime import datetime
from typing import Union, Tuple

import torch
import torch.distributed

_module_time = {}


def print_module_time(
    model: torch.nn.Module, module: Union[Tuple[torch.nn.Module], torch.nn.Module]
):
    def now_as_us():
        return int(datetime.now().timestamp() * 1e6)  # in us

    def _pre_forward(suffix, m, inputs):
        name = f"{type(m).__name__}.{suffix}"
        _module_time[name] = now_as_us()

    def _post_forward(suffix, m, inputs, outputs):
        name = f"{type(m).__name__}.{suffix}"
        start_time = _module_time.pop(name)
        print(f"{name}: {now_as_us() - start_time} us")

    for name, m in model.named_modules():
        if isinstance(m, module):
            m.register_forward_pre_hook(partial(_pre_forward, "forward"))
            m.register_forward_hook(partial(_post_forward, "forward"))
            m.register_full_backward_pre_hook(partial(_pre_forward, "backward"))
            m.register_full_backward_hook(partial(_post_forward, "backward"))
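

# Illustrative sketch (not part of the original commit): instrumenting every
# Linear layer of a model; each forward/backward prints its wall-clock time.
def _example_print_linear_times():
    model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 1))
    print_module_time(model, torch.nn.Linear)
    out = model(torch.randn(4, 8))
    out.sum().backward()  # triggers the backward hooks as well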
BIN
vacc_tools/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vacc_tools/__pycache__/generate_trace.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vacc_tools/__pycache__/memory_analyzer.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
vacc_tools/__pycache__/trace_logger.cpython-312.pyc
Normal file
Binary file not shown.
214
vacc_tools/generate_trace.py
Normal file
@@ -0,0 +1,214 @@
"""Generating tracing json files from log files.

Usage:
    python -m vacc_tools.generate_trace --log-dir <directory of log files> --out-file-prefix <prefix of output file>
"""

import argparse
import json
import os
import re
import numpy as np
import tabulate
from glob import glob
from collections import defaultdict
from multiprocessing import Pool


def run_stats_on_traces(timelines):
    op_cat_list = ["ODSP", "DLC", "VCCL", "CPU", "CPU_OP"]
    op_stats = {op: {} for op in op_cat_list}
    for line in timelines:
        if '"E"' not in line:  # optim 3, skip everything if not `"E"`
            continue

        # optim 2: using `[:-2]` instead of replace()
        line = line[:-2]  # remove ',\n'
        try:
            values = json.loads(line)
        except json.decoder.JSONDecodeError:
            # some logs may not end properly; just skip them
            continue

        if values["ph"] == "E" and values["cat"] in op_cat_list:
            cat = values["cat"]
            if values["name"] not in op_stats[cat]:
                op_stats[cat][values["name"]] = []
            if "dur" in values["args"]:
                # optim 1: using `[:-2]` instead of replace()
                op_stats[cat][values["name"]].append(
                    int(values["args"]["dur"][:-2])  # strip `us`
                )
            elif "values(us)" in values["args"]:
                op_stats[cat][values["name"]].append(values["args"]["values(us)"])
    op_tables = {}
    for cat, stats in op_stats.items():
        # optim 4: using list comprehension instead of for loop
        table = []
        for name, dur in stats.items():
            dur = np.array(dur)
            t = [
                name,
                np.min(dur),
                np.max(dur),
                np.sum(dur),
                np.mean(dur),
                np.percentile(dur, 90),
                len(dur),
            ]
            table.append(t)

        table = sorted(table, key=lambda x: x[-1], reverse=True)
        op_tables[cat] = tabulate.tabulate(
            table,
            headers=["op", "min", "max", "sum", "avg", "p90", "count"],
            tablefmt="plain",
        )

        if cat in ["VCCL", "ODSP", "DLC"]:
            op_tables["VACC-ALL"] = op_tables.get("VACC-ALL", []) + [
                t + [cat] for t in table
            ]

    total = sum([x[3] for x in op_tables["VACC-ALL"]])
    op_tables["VACC-ALL"] = [t + [t[3] / total * 100] for t in op_tables["VACC-ALL"]]

    op_tables["VACC-ALL"] = tabulate.tabulate(
        sorted(op_tables["VACC-ALL"], key=lambda x: x[-1], reverse=True),
        headers=["op", "min", "max", "sum", "avg", "p90", "count", "cat", "percent(%)"],
        tablefmt="plain",
    )

    return op_tables
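

# Illustrative sketch (not part of the original commit): the function consumes
# Chrome-trace JSON lines that still carry their trailing ",\n", i.e. the
# payload written after the LOG_TRACE: markers by the runtime.
def _example_stats():
    timelines = [
        '{"name": "matmul", "cat": "DLC", "ph": "E", "args": {"dur": "42us"}},\n',
        '{"name": "matmul", "cat": "DLC", "ph": "E", "args": {"dur": "58us"}},\n',
    ]
    print(run_stats_on_traces(timelines)["DLC"])  # min/max/sum/avg/p90/count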


def get_rank_info(files):
    # using pattern rank-<rank> in file name to get rank
    for fpath in files:
        rank = re.findall(r"rank-(\d+)", fpath)
        if rank:
            return int(rank[0])
    return 0


def extract_traces(arg):
    files, target_file_path, group_name, trace_token = arg

    entries = [
        (0, "scheduler"),
        (1, "megatron"),
        (2, "deepspeed"),
        (3, "nn.Module"),
        (10, "vacc-odsp"),
        (11, "vacc-dlc"),
        (12, "vacc-vccl"),
        (13, "vacc-cpu"),
        (14, "vacc-fallback"),
        (15, "vacc-ddr"),
        (20, "lib-vccl"),
    ]

    with open(target_file_path, "w", encoding="utf-8") as trace_file:
        trace_file.write("[")
        for tid, thread_name in entries:
            line = f'{{"cat":"__metadata","pid":{group_name},"tid":{tid},"ts":0,"ph":"M","name":"thread_name","args":{{"name":"{thread_name}"}}}},\n'
            trace_file.write(line)

        timelines = []
        for fpath in files:
            with open(fpath, "r", encoding="utf-8") as file:
                # timelines += [line.split(trace_token)[1] for line in file if trace_token in line]
                for line in file:
                    if trace_token in line:
                        # found the trace token: keep the content that follows it
                        timelines.append(line.split(trace_token)[1])
                        try:
                            json.loads(timelines[-1][:-2])  # remove ',\n'
                        except json.decoder.JSONDecodeError:
                            # some logs may not end properly; just skip them.
                            # chrome://tracing stops reading the remaining lines
                            # if it hits an error, so bad lines must be removed.
                            timelines.pop()

        for line in timelines[:-1]:
            trace_file.write(line)
        # fixing JSON format error by removing last comma in a list
        trace_file.write(timelines[-1].replace(",\n", "\n"))
        trace_file.write("]")

    op_stats = run_stats_on_traces(timelines)
    with open(
        target_file_path.replace(".json", ".txt"), "w", encoding="utf-8"
    ) as op_stats_file:
        for cat, tables in op_stats.items():
            op_stats_file.write(f"{cat}".center(80, "-") + "\n")
            op_stats_file.write(tables + "\n\n")


def merge_schedule(out_file_prefix):
    scheduler_data = []
    for file in glob(f"{out_file_prefix}*.json"):
        if file.endswith("schedule.json"):
            continue
        assert "rank" in file
        rank = file.split("rank_")[-1].split("_")[0]
        pid = None
        with open(file, "r", encoding="utf-8") as f:
            for line in f:
                # set every schedule entry's pid to 0 and its tid to the rank id
                if '"tid":0,' in line and "__metadata" not in line:
                    if pid is None:
                        pid = line.split('"pid":')[1].split(",")[0]

                    line = line.replace(f'"pid":{pid}', '"pid":0')
                    line = line.replace('"tid":0,', f'"tid":{rank},')
                    scheduler_data.append(line)

    out_file = f"{out_file_prefix}schedule.json"
    with open(out_file, "w", encoding="utf-8") as f:
        f.write("[\n")
        f.writelines(scheduler_data[:-1])
        f.write(scheduler_data[-1].replace(",\n", "\n"))
        f.write("]\n")


def scan_and_generate_trace(args, trace_token):
    grouped_files = defaultdict(list)
    for root, dirs, files in os.walk(args.log_dir):
        for filename in files:
            fpath = os.path.join(root, filename)
            file_size = os.path.getsize(fpath)
            if file_size != 0:
                group_name = filename.rsplit("_", 1)[1].split(".")[0]
                grouped_files[group_name].append(fpath)
    pool_args = []
    for group_name, files in grouped_files.items():
        rank = get_rank_info(files)
        out_file = f"{args.out_file_prefix}rank_{rank}_{group_name}.json"
        pool_args.append((files, out_file, group_name, trace_token))

    with Pool(len(grouped_files)) as p:
        p.map(extract_traces, pool_args)

    if args.merge_schedule:
        merge_schedule(args.out_file_prefix)


if __name__ == "__main__":
    TRACE_TOKEN = "LOG_TRACE:"

    current_file_path = os.path.abspath(__file__)
    parent_directory = os.path.dirname(os.path.dirname(current_file_path))
    find_directory = os.path.join(parent_directory, "log")

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--log-dir", default=find_directory, type=str, help="directory of log files"
    )
    parser.add_argument("--out-file-prefix", default="timeline_", type=str)
    parser.add_argument("--merge-schedule", action="store_true")

    args = parser.parse_args()

    scan_and_generate_trace(args, TRACE_TOKEN)
    print("Scan and trace generation done!")
151
vacc_tools/memory_analyzer.py
Normal file
@@ -0,0 +1,151 @@
from contextlib import contextmanager
from dataclasses import fields
from typing import Dict, Tuple, List, Optional
import torch

NUM_BYTES_IN_MB = 1024**2
NUM_BYTES_IN_GB = 1024**3


class MemoryAnalyzer:
    def __init__(
        self, model: torch.nn.Module, optimizer: Optional[torch.optim.Optimizer] = None
    ):
        """This memory usage analyzer will be mostly accurate only if you initialize
        it at the beginning and insert `get_memory_usage_in_gb` at the end of your
        forward pass.

        NOTE: It will have a negative impact if not properly used, as it stores
        activations of every nn.Module's forward function and relies on the user to
        reset it every time the forward pass ends.

        Limitations:
        1. does not work with customized operators
        2. does not work with functional operators
        3. it approximates activation as nn.Module.forward's output (if it's
        inside the graph and requires gradients), so it may not be exactly accurate.
        """
        self.model = model
        self.optimizer = optimizer

        self.activ_addrs = set()
        self.activ_memory = 0

    @staticmethod
    def _is_activation(x):
        return torch.is_tensor(x) and x.requires_grad and x.device != "cpu"

    def _get_weight_grads_addrs(self):
        weights = set([p.untyped_storage().data_ptr() for p in self.model.parameters()])
        grads = set(
            [
                p.grad.untyped_storage().data_ptr()
                for p in self.model.parameters()
                if p.grad is not None
            ]
        )
        return weights.union(grads)

    def pack_hook(self):
        def _pack_hook(x):
            if self._is_activation(x):
                weight_grads = self._get_weight_grads_addrs()
                # NOTE: storage is more accurate than using x.nelement() * x.element_size()
                data_ptr = x.untyped_storage().data_ptr()
                if data_ptr not in weight_grads and data_ptr not in self.activ_addrs:
                    self.activ_addrs.add(data_ptr)
                    self.activ_memory += x.untyped_storage().size()
            return x

        return _pack_hook

    def unpack_hook(self):
        def _unpack_hook(x):
            if self._is_activation(x):
                weight_grads = self._get_weight_grads_addrs()
                data_ptr = x.untyped_storage().data_ptr()
                if data_ptr not in weight_grads and data_ptr in self.activ_addrs:
                    self.activ_addrs.remove(data_ptr)
                    self.activ_memory -= x.untyped_storage().size()
            return x

        return _unpack_hook

    @contextmanager
    def record_activation(self):
        with torch.autograd.graph.saved_tensors_hooks(
            self.pack_hook(), self.unpack_hook()
        ):
            yield

    @staticmethod
    def get_weight_memory(model: torch.nn.Module):
        weights = [
            p.nelement() * p.element_size()
            for p in model.parameters()
            if p.device != "cpu"
        ]
        return sum(weights)

    @staticmethod
    def get_gradient_memory(model: torch.nn.Module):
        grads = [
            p.grad.nelement() * p.grad.element_size()
            for p in model.parameters()
            if p.grad is not None and p.grad.device != "cpu"
        ]
        return sum(grads)

    def _sum_activation_memory(self):
        return self.activ_memory

    def get_optimizer_state_memory(self):
        if isinstance(self.optimizer, torch.optim.AdamW):
            params = sum(
                [
                    p.nelement() * p.element_size()
                    for pg in self.optimizer.param_groups
                    for p in pg["params"]
                    if torch.is_tensor(p) and p.device != "cpu"
                ]
            )
            for state in self.optimizer.state.values():
                params += sum(
                    [
                        v.nelement() * v.element_size()
                        for k, v in state.items()
                        if torch.is_tensor(v) and v.device != "cpu"
                    ]
                )
            return params
        return 0

    def _get_memory_usage(self) -> Tuple[int, int, int, int]:
        return (
            self.get_weight_memory(self.model),
            self.get_gradient_memory(self.model),
            self._sum_activation_memory(),
            self.get_optimizer_state_memory(),
        )

    def get_memory_usage_in_gb(self) -> str:
        w, g, a, opt = self._get_memory_usage()

        return (
            f"Total: {(w + g + a + opt) / NUM_BYTES_IN_GB:.3f} GB, "
            f"weight: {w / NUM_BYTES_IN_GB:.3f} GB, "
            f"gradient: {g / NUM_BYTES_IN_GB:.3f} GB, "
            f"activation: {a / NUM_BYTES_IN_GB:.3f} GB, "
            f"optimizer states: {opt / NUM_BYTES_IN_GB:.3f} GB"
        )

    def get_memory_usage_in_mb(self) -> str:
        w, g, a, opt = self._get_memory_usage()

        return (
            f"Total: {(w + g + a + opt) / NUM_BYTES_IN_MB:.2f} MB, "
            f"weight: {w / NUM_BYTES_IN_MB:.2f} MB, "
            f"gradient: {g / NUM_BYTES_IN_MB:.2f} MB, "
            f"activation: {a / NUM_BYTES_IN_MB:.2f} MB, "
            f"optimizer states: {opt / NUM_BYTES_IN_MB:.2f} MB"
        )
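

# Illustrative sketch (not part of the original commit): wrap each forward pass
# in record_activation and read the breakdown afterwards. Assumes the model and
# inputs live on the accelerator (CPU tensors are ignored by _is_activation).
def _example_memory_report(model, optimizer, batch):
    analyzer = MemoryAnalyzer(model, optimizer)
    with analyzer.record_activation():
        loss = model(batch).sum()
    print(analyzer.get_memory_usage_in_mb())
    loss.backward()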
65
vacc_tools/parse_vacc_log_for_tracing.py
Normal file
@@ -0,0 +1,65 @@
import argparse
|
||||||
|
import os
|
||||||
|
from collections import defaultdict
|
||||||
|
from multiprocessing import Pool
|
||||||
|
|
||||||
|
|
||||||
|
log_tag = "LOG_TRACE:"
|
||||||
|
tid_names = [
|
||||||
|
(0, "module"),
|
||||||
|
(1, "megatron"),
|
||||||
|
(2, "deepspeed"),
|
||||||
|
(10, "vacc-odsp"),
|
||||||
|
(11, "vacc-dlc"),
|
||||||
|
(12, "vacc-vccl"),
|
||||||
|
(13, "vacc-cpu"),
|
||||||
|
(14, "vacc-cpu_fallback"),
|
||||||
|
(15, "vacc-ddr"),
|
||||||
|
(20, "lib-vccl"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_files_of_process(args):
|
||||||
|
pid, in_files = args
|
||||||
|
out_file = "trace_" + pid + ".json"
|
||||||
|
with open(out_file, "w", encoding="utf-8") as new_file:
|
||||||
|
metadata_lines = [
|
||||||
|
f'{{"name": "thread_name","ph": "M","pid": {pid},"tid": {tid},"args": {{"name": "{name}"}}}},'
|
||||||
|
for tid, name in tid_names
|
||||||
|
]
|
||||||
|
new_file.write("[\n")
|
||||||
|
new_file.write("\n".join(metadata_lines))
|
||||||
|
new_file.write("\n")
|
||||||
|
for file_path in in_files:
|
||||||
|
with open(file_path, "r", encoding="utf-8") as file:
|
||||||
|
for line in file:
|
||||||
|
if log_tag in line:
|
||||||
|
new_line = line.split(log_tag, 1)[1].strip()
|
||||||
|
new_file.write(new_line + "\n")
|
||||||
|
new_file.write("]")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_directory(directory):
|
||||||
|
pro_files = defaultdict(list)
|
||||||
|
for dirpath, dirnames, filenames in os.walk(directory):
|
||||||
|
for filename in filenames:
|
||||||
|
file_path = os.path.join(dirpath, filename)
|
||||||
|
if filename.startswith("vacc") and os.path.getsize(file_path) != 0:
|
||||||
|
pid = filename.rsplit("_", 1)[1].split(".")[0]
|
||||||
|
pro_files[pid].append(file_path)
|
||||||
|
|
||||||
|
args = []
|
||||||
|
for pid, in_files in pro_files.items():
|
||||||
|
args.append((pid, in_files))
|
||||||
|
|
||||||
|
with Pool() as p:
|
||||||
|
p.map(parse_files_of_process, args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="parse vacc log files and generate trace files"
|
||||||
|
)
|
||||||
|
parser.add_argument("directory", type=str, help="log directory to parse")
|
||||||
|
args = parser.parse_args()
|
||||||
|
parse_directory(args.directory)
|
||||||
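A minimal sketch of driving the parser programmatically instead of via the CLI; the file name under `log/` is illustrative and assumes the `trace_logger` naming scheme below:

```python
# Assumes non-empty log files such as log/vacc-odsp-rank-0_12345.txt containing
# LOG_TRACE:-prefixed JSON lines; the output is one trace_<pid>.json per process,
# loadable in chrome://tracing or Perfetto.
from parse_vacc_log_for_tracing import parse_directory

parse_directory("log")
```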
329
vacc_tools/trace_logger.py
Normal file
@@ -0,0 +1,329 @@
"""
This module provides mechanisms for tracing the execution of torch modules and
functions, and for writing the trace to a JSON file.

Set the environment variable `LOG_TRAIN_SCHEDULE=1` to enable tracing.
Otherwise, no tracing is applied.

Inside your module, create your tracer functions by using `get_trace_api`.
You will get four functions:
* `@trace_time(name)`: decorator to trace the execution of a function.
```python
@trace_time("my_func")
def my_func(x):
    ...
```
* `@trace_autograd_function()`: decorator to trace the execution of forward
and backward of a user-defined `torch.autograd.Function` operator.
```python
@trace_autograd_function()
class MyAutogradFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ...
    @staticmethod
    def backward(ctx, grad_output):
        ...
```
* `register_module_trace()`: function to register tracing on a model
(`nn.Module`). It applies traces recursively by enumerating all `nn.Module`s
and registering tracers on their forward and backward functions. Applying it
only to the top-level `nn.Module` is recommended.
```python
model = Model()
register_module_trace(model)
```
* `register_optimizer_trace()`: function to trace an optimizer's `step` (and
its `reduce_gradients`, if present).
"""

import os
import json
from contextlib import contextmanager
from dataclasses import dataclass, asdict
from datetime import datetime
from functools import partial

import torch
import torch.distributed


MODULE_TID = {"megatron": 1, "deepspeed": 2, "nn.Module": 3, "ram": 100}

# pylint: disable=missing-docstring


@dataclass
class TraceEntry:
    name: str
    cat: str
    pid: int
    tid: int
    ts: int
    ph: str
    args: dict = None

    def to_json_str(self):
        d = asdict(self)
        if self.args is None:
            d.pop("args")
        return json.dumps(d, separators=(",", ": "))


class LogFiles:
    def __init__(self) -> None:
        self.loggers = {}

    def get(self, file_prefix, rank, pid):
        os.makedirs("log", exist_ok=True)
        fpath = f"log/{file_prefix}-rank-{rank}_{pid}.txt"
        if fpath not in self.loggers:
            self.loggers[fpath] = open(fpath, "w")
        return self.loggers[fpath]

    def close(self):
        for f in self.loggers.values():
            f.close()

    def __del__(self):
        self.close()


def trace_logger_enabled() -> bool:
    return (
        "LOG_TRAIN_SCHEDULE" in os.environ and os.environ["LOG_TRAIN_SCHEDULE"] == "1"
    )


class TraceLogger:
    _log_files = LogFiles()

    def __init__(self, category, tid=None, file_prefix=None) -> None:
        self.enabled = trace_logger_enabled()

        if self.enabled:
            self.pid = os.getpid()
            self.logger = None
            self.cat = category
            self._traces = {}
            self.global_rank = 0
            if tid is None:
                self.tid = MODULE_TID.get(category, 1000)
            else:
                self.tid = tid
            self.file_prefix = file_prefix if file_prefix is not None else self.cat

        self.registered_modules = []

    def _create_logger(self) -> None:
        # delay creating the logger file until the first log call,
        # since torch.distributed may not be ready yet
        if torch.distributed.is_initialized():
            self.global_rank = torch.distributed.get_rank()
        self.logger = TraceLogger._log_files.get(
            self.file_prefix, self.global_rank, self.pid
        )

    def begin_trace(self, name, memory=False) -> None:
        if not self.enabled:
            return

        if self.logger is None:
            self._create_logger()
        assert self.logger is not None

        name = f"{name}"  # convert to str to ensure it is JSON serializable

        start_time = int(datetime.now().timestamp() * 1e6)  # in us
        trace = TraceEntry(name, self.cat, self.pid, self.tid, start_time, "B")

        mem_trace = self._get_memory(start_time) if memory else None

        if name not in self._traces:
            self._traces[name] = [(trace, mem_trace)]
        else:  # in case the call to the function is nested
            self._traces[name].append((trace, mem_trace))

    def end_trace(self, name, flush=False, memory=False) -> None:
        if not self.enabled:
            return

        name = f"{name}"  # convert to str to ensure it is JSON serializable

        assert self.logger is not None, "begin_trace should be called before end_trace"
        assert name in self._traces, "begin_trace should be called before end_trace"

        start_trace, start_mem = self._traces[name].pop()
        if start_mem is not None:
            self.logger.write(f"LOG_TRACE:{start_mem.to_json_str()},\n")
        self.logger.write(f"LOG_TRACE:{start_trace.to_json_str()},\n")

        end_time = int(datetime.now().timestamp() * 1e6)  # in us
        args = {"value(us)": end_time - start_trace.ts}
        trace = TraceEntry(name, self.cat, self.pid, self.tid, end_time, "E", args)
        self.logger.write(f"LOG_TRACE:{trace.to_json_str()},\n")

        if memory:
            mem_trace = self._get_memory(end_time)
            self.logger.write(f"LOG_TRACE:{mem_trace.to_json_str()},\n")

        if flush:
            self.flush()

    def flush(self) -> None:
        if self.logger is not None:
            self.logger.flush()

    def _get_memory(self, timestamp):
        args = {"value": torch.vacc.memory_allocated(self.global_rank)}
        mem_trace = TraceEntry(
            "memory", "memory", self.pid, MODULE_TID["ram"], timestamp, "C", args
        )
        return mem_trace


@contextmanager
def _trace_time(name, logger_inst, memory=False, flush=False):
    if not logger_inst.enabled:
        yield
        return

    logger_inst.begin_trace(name, memory=memory)
    yield
    logger_inst.end_trace(name, flush=flush, memory=memory)


SKIPED_MODULES = []


def _register_module_trace(
    module: torch.nn.Module, logger_inst, flush: bool = True, forward_only=False
):
    if not logger_inst.enabled:
        return

    if not isinstance(module, torch.nn.Module):
        return

    def _register(m):
        module_name = f"{type(m).__name__}"
        if module_name == "WrapName":
            module_name = f"{type(m.forward_func.__self__).__name__}"

        if module_name in SKIPED_MODULES:
            return

        forward_name = module_name + ".forward"

        m.register_forward_pre_hook(
            lambda m, inp: logger_inst.begin_trace(forward_name, memory=True)
        )
        m.register_forward_hook(
            lambda m, inp, out: logger_inst.end_trace(forward_name, memory=True)
        )

        if not forward_only:
            backward_name = module_name + ".backward"

            m.register_full_backward_pre_hook(
                lambda m, grad_out: logger_inst.begin_trace(backward_name, memory=True)
            )
            m.register_full_backward_hook(
                lambda m, grad_in, grad_out: logger_inst.end_trace(
                    backward_name, memory=True, flush=flush
                )
            )

    for m in module.modules():
        if m in logger_inst.registered_modules:
            print(
                f"module `{m}` already registered, skip applying trace on same module multiple times."
            )
            continue
        _register(m)
        # remember the module so repeated registration is skipped
        logger_inst.registered_modules.append(m)


def _trace_autograd_function(logger_inst):
    def decorator(cls):
        if not issubclass(cls, torch.autograd.Function):
            return cls

        def _apply(name, method):
            def wrapper(*args, **kwargs):
                with _trace_time(name, logger_inst=logger_inst, memory=True):
                    result = method(*args, **kwargs)
                return result

            return wrapper

        for attr in ["forward", "backward"]:
            setattr(cls, attr, _apply(cls.__name__ + "." + attr, getattr(cls, attr)))
        return cls

    return decorator


def _register_optimizer_trace(
    optimizer: torch.optim.Optimizer, logger_inst, flush: bool = True
):
    if not logger_inst.enabled:
        return

    trace_name = f"{type(optimizer).__name__}.step"

    if isinstance(optimizer, torch.optim.Optimizer):
        optimizer.register_step_pre_hook(
            lambda m, *args, **kwargs: logger_inst.begin_trace(trace_name, memory=True)
        )
        optimizer.register_step_post_hook(
            lambda m, *args, **kwargs: logger_inst.end_trace(
                trace_name, memory=True, flush=flush
            )
        )
    elif hasattr(optimizer, "step") and callable(optimizer.step):
        # a customized optimizer does not have step hooks

        original_step = optimizer.step

        def traced_step(*args, **kwargs):
            logger_inst.begin_trace(trace_name, memory=True)
            result = original_step(*args, **kwargs)
            logger_inst.end_trace(trace_name, memory=True, flush=flush)
            return result

        # Replace the step method with the new function
        optimizer.step = traced_step
    else:
        # unknown optimizer or wrong instance passed to this function.
        pass

    if hasattr(optimizer, "reduce_gradients") and callable(optimizer.reduce_gradients):
        trace_name = f"{type(optimizer).__name__}.reduce_gradients"
        original_reduce = optimizer.reduce_gradients

        def traced_reduce(*args, **kwargs):
            logger_inst.begin_trace(trace_name, memory=True)
            result = original_reduce(*args, **kwargs)
            logger_inst.end_trace(trace_name, memory=True, flush=flush)
            return result

        # Replace the reduce_gradients method with the new function
        optimizer.reduce_gradients = traced_reduce


def get_trace_api(name="nn.Module"):
    """generate module execution trace APIs for a given module name

    Args:
        name (str): module name

    Returns:
        tuple: (trace_time, register_module_trace, trace_autograd_function,
        register_optimizer_trace)
        Usage of these functions is described in the docstring of this module
    """
    _trace_logger = TraceLogger(name)

    return (
        partial(_trace_time, logger_inst=_trace_logger),
        partial(_register_module_trace, logger_inst=_trace_logger),
        partial(_trace_autograd_function, logger_inst=_trace_logger),
        partial(_register_optimizer_trace, logger_inst=_trace_logger),
    )
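To tie the pieces together, a minimal sketch of wiring the tracer into a training step; the model and optimizer here are stand-ins, and the memory counters used by the module hooks assume the torch_vacc backend is loaded:

```python
import os

os.environ["LOG_TRAIN_SCHEDULE"] = "1"  # must be set before tracers are created

import torch
from trace_logger import get_trace_api

trace_time, register_module_trace, trace_autograd_function, \
    register_optimizer_trace = get_trace_api("nn.Module")

model = torch.nn.Linear(4, 4)  # stand-in for a real model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

register_module_trace(model)         # hooks every submodule's forward/backward
register_optimizer_trace(optimizer)  # hooks optimizer.step via step hooks

with trace_time("train_step"):       # also usable as @trace_time("train_step")
    loss = model(torch.randn(2, 4)).sum()
    loss.backward()
    optimizer.step()
```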
BIN
vllm/_C.abi3.so
Normal file
Binary file not shown.
102
vllm/__init__.py
Normal file
@@ -0,0 +1,102 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""

# version.py should be an independent library, and we always import the
# version library first. This assumption is critical for some customizations.
from .version import __version__, __version_tuple__  # isort:skip

import typing

# The environment variable overrides should be imported before any other
# modules to ensure that the environment variables are set before any
# other modules are imported.
import vllm.env_override  # noqa: F401

MODULE_ATTRS = {
    "bc_linter_skip": "._bc_linter:bc_linter_skip",
    "bc_linter_include": "._bc_linter:bc_linter_include",
    "AsyncEngineArgs": ".engine.arg_utils:AsyncEngineArgs",
    "EngineArgs": ".engine.arg_utils:EngineArgs",
    "AsyncLLMEngine": ".engine.async_llm_engine:AsyncLLMEngine",
    "LLMEngine": ".engine.llm_engine:LLMEngine",
    "LLM": ".entrypoints.llm:LLM",
    "initialize_ray_cluster": ".executor.ray_utils:initialize_ray_cluster",
    "PromptType": ".inputs:PromptType",
    "TextPrompt": ".inputs:TextPrompt",
    "TokensPrompt": ".inputs:TokensPrompt",
    "ModelRegistry": ".model_executor.models:ModelRegistry",
    "SamplingParams": ".sampling_params:SamplingParams",
    "PoolingParams": ".pooling_params:PoolingParams",
    "ClassificationOutput": ".outputs:ClassificationOutput",
    "ClassificationRequestOutput": ".outputs:ClassificationRequestOutput",
    "CompletionOutput": ".outputs:CompletionOutput",
    "EmbeddingOutput": ".outputs:EmbeddingOutput",
    "EmbeddingRequestOutput": ".outputs:EmbeddingRequestOutput",
    "PoolingOutput": ".outputs:PoolingOutput",
    "PoolingRequestOutput": ".outputs:PoolingRequestOutput",
    "RequestOutput": ".outputs:RequestOutput",
    "ScoringOutput": ".outputs:ScoringOutput",
    "ScoringRequestOutput": ".outputs:ScoringRequestOutput",
}

if typing.TYPE_CHECKING:
    from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.engine.llm_engine import LLMEngine
    from vllm.entrypoints.llm import LLM
    from vllm.executor.ray_utils import initialize_ray_cluster
    from vllm.inputs import PromptType, TextPrompt, TokensPrompt
    from vllm.model_executor.models import ModelRegistry
    from vllm.outputs import (ClassificationOutput,
                              ClassificationRequestOutput, CompletionOutput,
                              EmbeddingOutput, EmbeddingRequestOutput,
                              PoolingOutput, PoolingRequestOutput,
                              RequestOutput, ScoringOutput,
                              ScoringRequestOutput)
    from vllm.pooling_params import PoolingParams
    from vllm.sampling_params import SamplingParams

    from ._bc_linter import bc_linter_include, bc_linter_skip
else:

    def __getattr__(name: str) -> typing.Any:
        from importlib import import_module

        if name in MODULE_ATTRS:
            module_name, attr_name = MODULE_ATTRS[name].split(":")
            module = import_module(module_name, __package__)
            return getattr(module, attr_name)
        else:
            raise AttributeError(
                f'module {__package__} has no attribute {name}')


__all__ = [
    "__version__",
    "bc_linter_skip",
    "bc_linter_include",
    "__version_tuple__",
    "LLM",
    "ModelRegistry",
    "PromptType",
    "TextPrompt",
    "TokensPrompt",
    "SamplingParams",
    "RequestOutput",
    "CompletionOutput",
    "PoolingOutput",
    "PoolingRequestOutput",
    "EmbeddingOutput",
    "EmbeddingRequestOutput",
    "ClassificationOutput",
    "ClassificationRequestOutput",
    "ScoringOutput",
    "ScoringRequestOutput",
    "LLMEngine",
    "EngineArgs",
    "AsyncLLMEngine",
    "AsyncEngineArgs",
    "initialize_ray_cluster",
    "PoolingParams",
]
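For reference, a short sketch of what the `MODULE_ATTRS` indirection buys: heavy engine modules are imported only when the corresponding attribute is first touched.

```python
import vllm

print(vllm.__version__)  # resolved eagerly from .version

# First access triggers __getattr__, which runs
# import_module(".sampling_params", "vllm") and returns the class.
params = vllm.SamplingParams(temperature=0.0)
```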
BIN
vllm/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/_bc_linter.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/_custom_ops.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/_version.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/beam_search.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/connections.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/env_override.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/envs.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/logger.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/logits_process.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/logprobs.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/outputs.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/pooling_params.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/sampling_params.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/scalar_type.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/sequence.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/tasks.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/test_utils.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/tracing.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/__pycache__/version.cpython-312.pyc
Normal file
Binary file not shown.
59
vllm/_bc_linter.py
Normal file
@@ -0,0 +1,59 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# vllm/_bc_linter.py
from __future__ import annotations

from typing import Any, Callable, TypeVar, overload

T = TypeVar("T")


@overload
def bc_linter_skip(obj: T) -> T:
    ...


@overload
def bc_linter_skip(*, reason: str | None = ...) -> Callable[[T], T]:
    ...


def bc_linter_skip(obj: Any = None, *, reason: str | None = None):
    """
    No-op decorator to mark symbols/files for BC-linter suppression.

    Usage:
        @bc_linter_skip
        def legacy_api(...): ...
    """

    def _wrap(x: T) -> T:
        return x

    return _wrap if obj is None else obj


@overload
def bc_linter_include(obj: T) -> T:
    ...


@overload
def bc_linter_include(*, reason: str | None = ...) -> Callable[[T], T]:
    ...


def bc_linter_include(obj: Any = None, *, reason: str | None = None):
    """
    Usage:
        @bc_linter_include
        def public_api(...): ...
    """

    def _wrap(x: T) -> T:
        return x

    return _wrap if obj is None else obj


__all__ = ["bc_linter_skip", "bc_linter_include"]
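Both call forms the overloads above permit, side by side; the decorated function names are illustrative:

```python
from vllm._bc_linter import bc_linter_skip


@bc_linter_skip  # bare form: obj is the decorated function itself
def legacy_api():
    ...


@bc_linter_skip(reason="scheduled for removal")  # keyword form returns _wrap
def older_api():
    ...
```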
2044
vllm/_custom_ops.py
Normal file
File diff suppressed because it is too large
393
vllm/_ipex_ops.py
Normal file
@@ -0,0 +1,393 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional, Union

import torch

from vllm.logger import init_logger
from vllm.platforms import current_platform

logger = init_logger(__name__)

try:
    import intel_extension_for_pytorch as ipex
except ImportError as e:
    logger.debug("Import error msg: %s", e.msg)


class ipex_ops:

    @staticmethod
    def _reshape_activation_tensor(
            x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        num = x.size(0)
        d = x.size(1) // 2
        x = x.reshape(num, 2, d)
        x1, x2 = torch.chunk(x, chunks=2, dim=1)
        x1 = x1.reshape(num, d)
        x2 = x2.reshape(num, d)
        return x1, x2

    @staticmethod
    def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
        ipex.llm.functional.silu_and_mul(x, out)

    @staticmethod
    def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
        ipex.llm.functional.gelu_and_mul(x, out)

    @staticmethod
    def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
        ipex.llm.functional.gelu_and_mul(x, out)

    @staticmethod
    def gelu_fast(x: torch.Tensor) -> torch.Tensor:
        return torch.nn.functional.gelu(x)

    @staticmethod
    def gelu_new(x: torch.Tensor) -> torch.Tensor:
        return torch.nn.functional.gelu(x)

    @staticmethod
    def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
        ipex.llm.functional.gelu_quick(x, out)

    @staticmethod
    def paged_attention_v1(
        out: torch.Tensor,
        query: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        num_kv_heads: int,
        scale: float,
        block_tables: torch.Tensor,
        context_lens: torch.Tensor,
        block_size: int,
        max_context_len: int,
        alibi_slopes: Optional[torch.Tensor],
        kv_cache_dtype: str,
        k_scale: float,
        v_scale: float,
        tp_rank: int = 0,
        blocksparse_local_blocks: int = 0,
        blocksparse_vert_stride: int = 0,
        blocksparse_block_size: int = 64,
        blocksparse_head_sliding_step: int = 0,
    ) -> None:
        assert kv_cache_dtype == "auto"
        num_heads = out.size(1)
        num_queries_per_tokens = num_heads // num_kv_heads
        ipex.llm.modules.PagedAttention.single_query_kv_attention(
            out,
            query.contiguous(),
            key_cache.view_as(value_cache),
            value_cache,
            num_queries_per_tokens,
            scale,
            block_tables,
            context_lens,
            block_size,
            max_context_len,
            alibi_slopes,
        )

    @staticmethod
    def paged_attention_v2(
        out: torch.Tensor,
        exp_sum: torch.Tensor,
        max_logits: torch.Tensor,
        tmp_out: torch.Tensor,
        query: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        num_kv_heads: int,
        scale: float,
        block_tables: torch.Tensor,
        context_lens: torch.Tensor,
        block_size: int,
        max_context_len: int,
        alibi_slopes: Optional[torch.Tensor],
        kv_cache_dtype: str,
        k_scale: float,
        v_scale: float,
        tp_rank: int = 0,
        blocksparse_local_blocks: int = 0,
        blocksparse_vert_stride: int = 0,
        blocksparse_block_size: int = 64,
        blocksparse_head_sliding_step: int = 0,
    ) -> None:
        assert kv_cache_dtype == "auto"
        num_heads = out.size(1)
        num_queries_per_tokens = num_heads // num_kv_heads
        ipex.llm.modules.PagedAttention.single_query_kv_attention(
            out,
            query.contiguous(),
            key_cache.view_as(value_cache),
            value_cache,
            num_queries_per_tokens,
            scale,
            block_tables,
            context_lens,
            block_size,
            max_context_len,
            alibi_slopes,
        )

    @staticmethod
    def rotary_embedding(
        positions: torch.Tensor,  # [batch_size, seq_len]
        query: torch.Tensor,  # [batch_size, seq_len, num_heads*head_size]
        key: torch.Tensor,  # [batch_size, seq_len, num_kv_heads*head_size]
        head_size: int,
        cos_sin_cache: torch.Tensor,  # [cos_sin_dim, rot_dim]
        is_neox: bool,
    ) -> None:
        rot_dim = cos_sin_cache.size(1)
        ipex.llm.functional.rotary_embedding_batched(positions, query, key,
                                                     head_size, cos_sin_cache,
                                                     is_neox, rot_dim)

    @staticmethod
    def rms_norm(input: torch.Tensor, weight: torch.Tensor,
                 epsilon: float) -> torch.Tensor:
        return ipex.llm.functional.rms_norm(input, weight, epsilon)

    @staticmethod
    def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
                           weight: torch.Tensor, epsilon: float) -> None:
        tmp = ipex.llm.functional.add_rms_norm(residual, input, weight, None,
                                               epsilon, True)
        input.copy_(tmp)

    @staticmethod
    def varlen_attention(
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        out: torch.Tensor,
        seqlen_q: torch.Tensor,
        seqlen_k: torch.Tensor,
        alibi_slopes: Optional[torch.Tensor],
        max_seqlen_q: int,
        max_seqlen_k: int,
        pdropout: float,
        softmax_scale: float,
        zero_tensors: bool,
        is_causal: bool,
        return_softmax: bool,
        gen_: torch.Generator,
        window_size_left: float,
        window_size_right: float,
        logits_soft_cap: float,
    ) -> None:
        if ipex.__version__.endswith("cpu"):
            if logits_soft_cap != 0.0:
                raise ValueError("IPEX CPU does not support logits_soft_cap")
            assert alibi_slopes is None
            assert window_size_left < 0 and window_size_right < 0
            ipex.llm.functional.varlen_attention(query.contiguous(),
                                                 key.contiguous(),
                                                 value.contiguous(), out,
                                                 seqlen_q.int(),
                                                 seqlen_k.int(), max_seqlen_q,
                                                 max_seqlen_k, pdropout,
                                                 softmax_scale, zero_tensors,
                                                 is_causal, return_softmax,
                                                 gen_)
        else:  # XPU build
            ipex.llm.functional.varlen_attention(
                query.contiguous(), key.contiguous(), value.contiguous(), out,
                seqlen_q.int(), seqlen_k.int(), alibi_slopes, max_seqlen_q,
                max_seqlen_k, pdropout, softmax_scale, zero_tensors, is_causal,
                return_softmax, gen_, window_size_left, window_size_right,
                logits_soft_cap)

    @staticmethod
    def reshape_and_cache(
        key: torch.Tensor,
        value: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        slot_mapping: torch.Tensor,
        kv_cache_dtype: str,
        k_scale: float,
        v_scale: float,
    ) -> None:
        assert kv_cache_dtype == "auto"
        ipex.llm.modules.PagedAttention.reshape_and_cache(
            key, value, key_cache, value_cache, slot_mapping)

    @staticmethod
    def reshape_and_cache_flash(
        key: torch.Tensor,
        value: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        slot_mapping: torch.Tensor,
        kv_cache_dtype: str,
        k_scale: Optional[torch.Tensor] = None,
        v_scale: Optional[torch.Tensor] = None,
        k_scale_float: float = 1.0,
        v_scale_float: float = 1.0,
    ) -> None:
        ipex.llm.modules.PagedAttention.reshape_and_cache_flash(
            key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
            k_scale_float, v_scale_float)

    @staticmethod
    def flash_attn_varlen_func(
        out: torch.Tensor,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        cu_seqlens_q: torch.Tensor,
        seqused_k: torch.Tensor,  # not supported in the ipex kernel
        max_seqlen_q: int,
        max_seqlen_k: int,
        softmax_scale: float,
        causal: bool,
        block_table: torch.Tensor,
        alibi_slopes: Optional[torch.Tensor],
        window_size: Optional[list[int]] = None,
        softcap: Optional[float] = 0.0,
        cu_seqlens_k: Optional[torch.Tensor] = None,
        # The following parameters are not used in the ipex kernel currently;
        # we keep the API compatible with CUDA's.
        scheduler_metadata=None,
        fa_version: int = 2,
        q_descale=None,
        k_descale=None,
        v_descale=None,
        num_splits=0,
        s_aux: Optional[torch.Tensor] = None,
    ):
        if cu_seqlens_k is None:
            # cu_seqlens_k is not used in the ipex kernel.
            cu_seqlens_k = torch.cumsum(seqused_k, dim=0)
            cu_seqlens_k = torch.cat([
                torch.tensor([0], device=seqused_k.device, dtype=torch.int32),
                cu_seqlens_k
            ]).to(torch.int32)

        real_window_size: tuple[int, int]
        if window_size is None:
            real_window_size = (-1, -1)
        else:
            assert len(window_size) == 2
            real_window_size = (window_size[0], window_size[1])
        return ipex.llm.modules.PagedAttention.flash_attn_varlen_func(
            out,
            q.contiguous(),
            k,
            v,
            cu_seqlens_q,
            cu_seqlens_k,
            max_seqlen_q,
            max_seqlen_k,
            softmax_scale,
            causal,
            block_table,
            alibi_slopes,
            softcap=softcap,
            window_size_left=real_window_size[0],
            window_size_right=real_window_size[1],
            k_scale=1.0,
            v_scale=1.0,
        )

    @staticmethod
    def get_scheduler_metadata(
        batch_size,
        max_seqlen_q,
        max_seqlen_k,
        num_heads_q,
        num_heads_kv,
        headdim,
        cache_seqlens: torch.Tensor,
        qkv_dtype=torch.bfloat16,
        headdim_v=None,
        cu_seqlens_q: Optional[torch.Tensor] = None,
        cu_seqlens_k_new: Optional[torch.Tensor] = None,
        cache_leftpad: Optional[torch.Tensor] = None,
        page_size: Optional[int] = None,
        max_seqlen_k_new=0,
        causal=False,
        window_size=(-1, -1),  # -1 means infinite context window
        has_softcap=False,
        num_splits=0,  # Can be tuned for speed
        pack_gqa=None,  # Can be tuned for speed
        sm_margin=0,  # Can be tuned if some SMs are used for communication
    ) -> None:
        logger.warning_once(
            "get_scheduler_metadata is not implemented for ipex_ops, "
            "returning None.")
        return None

    @staticmethod
    def copy_blocks(key_caches: list[torch.Tensor],
                    value_caches: list[torch.Tensor],
                    block_mapping: torch.Tensor) -> None:
        torch.xpu.copy_blocks(  # type: ignore
            key_caches,
            value_caches,
            block_mapping,
        )

    @staticmethod
    def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
                    block_mapping: torch.Tensor) -> None:
        torch.xpu.swap_blocks(src, dst, block_mapping)  # type: ignore

    @staticmethod
    def scaled_fp8_quant(
        input: torch.Tensor,
        scale: Optional[torch.Tensor] = None,
        num_token_padding: Optional[int] = None,
        scale_ub: Optional[torch.Tensor] = None,
        use_per_token_if_dynamic: bool = False,
        output: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Quantize input tensor to FP8 and return quantized tensor and scale.

        This function is designed for both static and dynamic quantization:
        if you provide the scale, it will use static scaling, and if you omit
        it, the scale will be determined dynamically. Currently, the XPU
        platform only supports dynamic quantization. The function also allows
        optional padding of the output tensors for downstream kernels that
        will benefit from padding.

        Args:
            input: The input tensor to be quantized to FP8
            scale: Optional scaling factor for the FP8 quantization
            scale_ub: Optional upper bound for scaling factor in dynamic
                per token case
            num_token_padding: If specified, pad the first dimension
                of the output to at least this value.
            use_per_token_if_dynamic: Whether to do per_tensor or per_token
                in the dynamic quantization case.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
                scaling factor.
        """
        # This code assumes batch_dim and num_tokens are flattened
        assert (input.ndim == 2)
        shape: Union[tuple[int, int], torch.Size] = input.shape
        out_dtype: torch.dtype = current_platform.fp8_dtype()
        if num_token_padding:
            shape = (max(num_token_padding, input.shape[0]), shape[1])
        if output is None:
            output = torch.empty(shape, device=input.device, dtype=out_dtype)
        else:
            assert num_token_padding is None, \
                "padding not supported if output passed in"
            assert output.dtype == out_dtype
        assert scale is None, "only dynamic fp8 quantization supported on XPU"
        assert not use_per_token_if_dynamic, (
            "per token dynamic fp8 quantization not supported on XPU")
        scale = torch.zeros(1, device=input.device, dtype=torch.float32)
        torch.ops.torch_ipex.dynamic_scaled_fp8_quant(output, input, scale)

        return output, scale
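A small sketch of the dynamic FP8 path in `scaled_fp8_quant`; it assumes an XPU build of intel_extension_for_pytorch and an available `xpu` device, and the shapes are illustrative:

```python
import torch

from vllm._ipex_ops import ipex_ops

x = torch.randn(16, 4096, device="xpu")  # [num_tokens, hidden], already flattened
q, scale = ipex_ops.scaled_fp8_quant(x)  # dynamic per-tensor scale
approx = q.to(torch.float32) * scale     # rough dequantized round-trip
```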
34
vllm/_version.py
Normal file
@@ -0,0 +1,34 @@
# file generated by setuptools-scm
# don't change, don't track in version control

__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    VERSION_TUPLE = object
    COMMIT_ID = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

__version__ = version = '0.11.0'
__version_tuple__ = version_tuple = (0, 11, 0)

__commit_id__ = commit_id = None
0
vllm/assets/__init__.py
Normal file
BIN
vllm/assets/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/assets/__pycache__/base.cpython-312.pyc
Normal file
Binary file not shown.
45
vllm/assets/audio.py
Normal file
@@ -0,0 +1,45 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass
from pathlib import Path
from typing import Literal
from urllib.parse import urljoin

import numpy.typing as npt

from vllm.utils import PlaceholderModule

from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets

try:
    import librosa
except ImportError:
    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]

ASSET_DIR = "multimodal_asset"

AudioAssetName = Literal["winning_call", "mary_had_lamb"]


@dataclass(frozen=True)
class AudioAsset:
    name: AudioAssetName

    @property
    def filename(self) -> str:
        return f"{self.name}.ogg"

    @property
    def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
        audio_path = get_vllm_public_assets(filename=self.filename,
                                            s3_prefix=ASSET_DIR)
        return librosa.load(audio_path, sr=None)

    def get_local_path(self) -> Path:
        return get_vllm_public_assets(filename=self.filename,
                                      s3_prefix=ASSET_DIR)

    @property
    def url(self) -> str:
        return urljoin(VLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
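Typical use of the asset class above; the first access downloads the clip into the vLLM assets cache:

```python
from vllm.assets.audio import AudioAsset

asset = AudioAsset("winning_call")
audio, sr = asset.audio_and_sample_rate  # decoded with librosa at native rate
print(asset.url, sr, audio.shape)
```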
41
vllm/assets/base.py
Normal file
@@ -0,0 +1,41 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from functools import lru_cache
from pathlib import Path
from typing import Optional

import vllm.envs as envs
from vllm.connections import global_http_connection

VLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"


def get_cache_dir() -> Path:
    """Get the path to the cache for storing downloaded assets."""
    path = Path(envs.VLLM_ASSETS_CACHE)
    path.mkdir(parents=True, exist_ok=True)

    return path


@lru_cache
def get_vllm_public_assets(filename: str,
                           s3_prefix: Optional[str] = None) -> Path:
    """
    Download an asset file from ``s3://vllm-public-assets``
    and return the path to the downloaded file.
    """
    asset_directory = get_cache_dir() / "vllm_public_assets"
    asset_directory.mkdir(parents=True, exist_ok=True)

    asset_path = asset_directory / filename
    if not asset_path.exists():
        if s3_prefix is not None:
            filename = s3_prefix + "/" + filename
        global_http_connection.download_file(
            f"{VLLM_S3_BUCKET_URL}/{filename}",
            asset_path,
            timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT)

    return asset_path
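The helper can also be driven directly; this sketch fetches the same audio clip used above by its raw S3 key:

```python
from vllm.assets.base import get_vllm_public_assets

# Cached under $VLLM_ASSETS_CACHE/vllm_public_assets/ on first call.
path = get_vllm_public_assets("winning_call.ogg", s3_prefix="multimodal_asset")
print(path)
```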
50
vllm/assets/image.py
Normal file
@@ -0,0 +1,50 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass
from pathlib import Path
from typing import Literal

import torch
from PIL import Image

from .base import get_vllm_public_assets

VLM_IMAGES_DIR = "vision_model_images"

ImageAssetName = Literal["stop_sign", "cherry_blossom", "hato",
                         "2560px-Gfp-wisconsin-madison-the-nature-boardwalk",
                         "Grayscale_8bits_palette_sample_image",
                         "1280px-Venn_diagram_rgb", "RGBA_comp", "237-400x300",
                         "231-200x300", "27-500x500", "17-150x600",
                         "handelsblatt-preview", "paper-11"]


@dataclass(frozen=True)
class ImageAsset:
    name: ImageAssetName

    def get_path(self, ext: str) -> Path:
        """
        Return the local (downloaded) path for the given image.
        """
        return get_vllm_public_assets(filename=f"{self.name}.{ext}",
                                      s3_prefix=VLM_IMAGES_DIR)

    @property
    def pil_image(self, ext="jpg") -> Image.Image:

        image_path = self.get_path(ext)
        return Image.open(image_path)

    @property
    def image_embeds(self) -> torch.Tensor:
        """
        Image embeddings, only used for testing purposes with llava 1.5.
        """
        image_path = self.get_path('pt')
        return torch.load(image_path, map_location="cpu", weights_only=True)

    def read_bytes(self, ext: str) -> bytes:
        p = Path(self.get_path(ext))
        return p.read_bytes()
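And the image counterpart, with the same download-and-cache behavior:

```python
from vllm.assets.image import ImageAsset

asset = ImageAsset("stop_sign")
img = asset.pil_image          # PIL.Image, fetched as stop_sign.jpg
raw = asset.read_bytes("jpg")  # same file as raw bytes
```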
145
vllm/assets/video.py
Normal file
@@ -0,0 +1,145 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass
from functools import lru_cache
from typing import Any, ClassVar, Literal, Optional

import cv2
import numpy as np
import numpy.typing as npt
from huggingface_hub import hf_hub_download
from PIL import Image

from vllm.utils import PlaceholderModule

from .base import get_cache_dir

try:
    import librosa
except ImportError:
    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]


@lru_cache
def download_video_asset(filename: str) -> str:
    """
    Download and open a video from the huggingface
    repo: raushan-testing-hf/videos-test
    """
    video_directory = get_cache_dir() / "video-example-data"
    video_directory.mkdir(parents=True, exist_ok=True)

    video_path = video_directory / filename
    video_path_str = str(video_path)
    if not video_path.exists():
        video_path_str = hf_hub_download(
            repo_id="raushan-testing-hf/videos-test",
            filename=filename,
            repo_type="dataset",
            cache_dir=video_directory,
        )
    return video_path_str


def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        raise ValueError(f"Could not open video file {path}")

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frames = []

    num_frames = num_frames if num_frames > 0 else total_frames
    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    for idx in range(total_frames):
        ok = cap.grab()  # advance to the next frame without decoding
        if not ok:
            break
        if idx in frame_indices:  # only decode the frames we need
            ret, frame = cap.retrieve()
            if ret:
                # OpenCV uses BGR format, we need to convert it to RGB
                # for PIL and transformers compatibility
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    frames = np.stack(frames)
    if len(frames) < num_frames:
        raise ValueError(f"Could not read enough frames from video file {path}"
                         f" (expected {num_frames} frames, got {len(frames)})")
    return frames


def video_to_pil_images_list(path: str,
                             num_frames: int = -1) -> list[Image.Image]:
    frames = video_to_ndarrays(path, num_frames)
    return [Image.fromarray(frame) for frame in frames]


def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]:
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        raise ValueError(f"Could not open video file {path}")

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    duration = total_frames / fps if fps > 0 else 0

    if num_frames == -1 or num_frames > total_frames:
        num_frames = total_frames

    metadata = {
        "total_num_frames": num_frames,
        "fps": fps,
        "duration": duration,
        "video_backend": "opencv",
        "frames_indices": list(range(num_frames)),
        # extra field used to control the HF processor's video
        # sampling behavior
        "do_sample_frames": num_frames == total_frames,
    }
    return metadata


VideoAssetName = Literal["baby_reading"]


@dataclass(frozen=True)
class VideoAsset:
    name: VideoAssetName
    num_frames: int = -1

    _NAME_TO_FILE: ClassVar[dict[VideoAssetName, str]] = {
        "baby_reading": "sample_demo_1.mp4",
    }

    @property
    def filename(self) -> str:
        return self._NAME_TO_FILE[self.name]

    @property
    def video_path(self) -> str:
        return download_video_asset(self.filename)

    @property
    def pil_images(self) -> list[Image.Image]:
        ret = video_to_pil_images_list(self.video_path, self.num_frames)
        return ret

    @property
    def np_ndarrays(self) -> npt.NDArray:
        ret = video_to_ndarrays(self.video_path, self.num_frames)
        return ret

    @property
    def metadata(self) -> dict[str, Any]:
        ret = video_get_metadata(self.video_path, self.num_frames)
        return ret

    def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray:
        """
        Read audio data from the video asset, used in Qwen2.5-Omni examples.

        See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
        """
        return librosa.load(self.video_path, sr=sampling_rate)[0]
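A sketch of uniform frame sampling through the asset class; eight frames are picked evenly across the demo clip:

```python
from vllm.assets.video import VideoAsset

asset = VideoAsset("baby_reading", num_frames=8)
frames = asset.np_ndarrays  # (8, H, W, 3) RGB uint8 array
meta = asset.metadata       # fps, duration, sampled frame indices, ...
```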
15
vllm/attention/__init__.py
Normal file
@@ -0,0 +1,15 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm.attention.backends.abstract import (AttentionBackend,
                                              AttentionMetadata, AttentionType)
from vllm.attention.layer import Attention
from vllm.attention.selector import get_attn_backend

__all__ = [
    "Attention",
    "AttentionBackend",
    "AttentionMetadata",
    "AttentionType",
    "get_attn_backend",
]
0
vllm/attention/backends/__init__.py
Normal file
204
vllm/attention/backends/abstract.py
Normal file
@@ -0,0 +1,204 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Generic, List, Optional, Protocol, Tuple, Type, TypeVar
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey
|
||||||
|
|
||||||
|
|
||||||
|
class AttentionType:
|
||||||
|
"""
|
||||||
|
Attention type.
|
||||||
|
Use string to be compatible with `torch.compile`.
|
||||||
|
"""
|
||||||
|
DECODER = "decoder"
|
||||||
|
"""Decoder attention between previous layer Q/K/V."""
|
||||||
|
ENCODER = "encoder"
|
||||||
|
"""Encoder attention between previous layer Q/K/V for encoder-decoder."""
|
||||||
|
ENCODER_ONLY = "encoder_only"
|
||||||
|
"""Encoder attention between previous layer Q/K/V."""
|
||||||
|
ENCODER_DECODER = "encoder_decoder"
|
||||||
|
"""Attention between dec. Q and enc. K/V for encoder-decoder."""
|
||||||
|
|
||||||
|
|
||||||
|
class AttentionBackend(ABC):
|
||||||
|
"""Abstract class for attention backends."""
|
||||||
|
# For some attention backends, we allocate an output tensor before
|
||||||
|
# calling the custom op. When piecewise cudagraph is enabled, this
|
||||||
|
# makes sure the output tensor is allocated inside the cudagraph.
|
||||||
|
accept_output_buffer: bool = False
|
||||||
|
|
||||||
|
# Whether this backend supports receiving pre-quantized query input.
|
||||||
|
# If True, the attention layer will handle query quantization instead
|
||||||
|
# of the backend, allowing torch.compile to fuse quantization with
|
||||||
|
# previous operations.
|
||||||
|
# Needs to be worked through for all backends
|
||||||
|
# https://github.com/vllm-project/vllm/issues/25584
|
||||||
|
supports_quant_query_input: bool = False
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
@abstractmethod
|
||||||
|
def get_name() -> str:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
@abstractmethod
|
||||||
|
def get_impl_cls() -> Type["AttentionImpl"]:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
@abstractmethod
|
||||||
|
def get_metadata_cls() -> Type["AttentionMetadata"]:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata":
|
||||||
|
return cls.get_metadata_cls()(*args, **kwargs)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
@abstractmethod
|
||||||
|
def get_builder_cls(): # -> Type["AttentionMetadataBuilder"]:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
@abstractmethod
|
||||||
|
def get_kv_cache_shape(
|
||||||
|
num_blocks: int,
|
||||||
|
block_size: int,
|
||||||
|
num_kv_heads: int,
|
||||||
|
head_size: int,
|
||||||
|
cache_dtype_str: str = "auto",
|
||||||
|
) -> Tuple[int, ...]:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_kv_cache_stride_order() -> Tuple[int, ...]:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def full_cls_name(cls) -> tuple[str, str]:
|
||||||
|
return (cls.__module__, cls.__qualname__)
|
||||||
|
|
||||||
|
|
||||||
|
class AttentionMetadata:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
T = TypeVar("T", bound=AttentionMetadata)
|
||||||
|
|
||||||
|
|
||||||
|
class AttentionLayer(Protocol):
|
||||||
|
|
||||||
|
_q_scale: torch.Tensor
|
||||||
|
_k_scale: torch.Tensor
|
||||||
|
_v_scale: torch.Tensor
|
||||||
|
_q_scale_float: float
|
||||||
|
_k_scale_float: float
|
||||||
|
_v_scale_float: float
|
||||||
|
_prob_scale: torch.Tensor
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
query: torch.Tensor,
|
||||||
|
key: torch.Tensor,
|
||||||
|
value: torch.Tensor,
|
||||||
|
kv_cache: torch.Tensor,
|
||||||
|
attn_metadata: AttentionMetadata,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
class AttentionImpl(ABC, Generic[T]):
|
||||||
|
|
||||||
|
# Whether the attention impl can return the softmax lse for decode.
|
||||||
|
# Some features like decode context parallelism require the softmax lse.
|
||||||
|
can_return_lse_for_decode: bool = False
|
||||||
|
|
||||||
|
# some attention backends might not always want to return lse
|
||||||
|
# even if they can return lse (for efficiency reasons)
|
||||||
|
need_to_return_lse_for_decode: bool = False
|
||||||
|
|
||||||
|
dcp_world_size: int
|
||||||
|
dcp_rank: int
|
||||||
|
|
||||||
|
def __new__(cls, *args, **kwargs):
|
||||||
|
# use __new__ so that all subclasses will call this
|
||||||
|
self = super().__new__(cls)
|
||||||
|
try:
|
||||||
|
from vllm.distributed.parallel_state import get_dcp_group
|
||||||
|
self.dcp_world_size = get_dcp_group().world_size
|
||||||
|
self.dcp_rank = get_dcp_group().rank_in_group
|
||||||
|
except AssertionError:
|
||||||
|
# DCP might not be initialized in testing
|
||||||
|
self.dcp_world_size = 1
|
||||||
|
self.dcp_rank = 0
|
||||||
|
self.need_to_return_lse_for_decode = self.dcp_world_size > 1 \
|
||||||
|
and self.can_return_lse_for_decode
|
||||||
|
return self
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
num_heads: int,
|
||||||
|
head_size: int,
|
||||||
|
scale: float,
|
||||||
|
num_kv_heads: Optional[int] = None,
|
||||||
|
alibi_slopes: Optional[List[float]] = None,
|
||||||
|
sliding_window: Optional[int] = None,
|
||||||
|
kv_cache_dtype: str = "auto",
|
||||||
|
logits_soft_cap: Optional[float] = None,
|
||||||
|
attn_type: str = AttentionType.DECODER,
|
||||||
|
kv_sharing_target_layer_name: Optional[str] = None,
|
||||||
|
) -> None:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
layer: AttentionLayer,
|
||||||
|
query: torch.Tensor,
|
||||||
|
key: torch.Tensor,
|
||||||
|
value: torch.Tensor,
|
||||||
|
kv_cache: torch.Tensor,
|
||||||
|
attn_metadata: T,
|
||||||
|
output: Optional[torch.Tensor] = None,
|
||||||
|
output_scale: Optional[torch.Tensor] = None,
|
||||||
|
output_block_scale: Optional[torch.Tensor] = None,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
raise NotImplementedError
|
||||||
|

    def fused_output_quant_supported(self, quant_key: QuantKey):
        """
        Whether this attention implementation supports fused output
        quantization. This is used by the AttnFusionPass to only fuse
        output quantization onto implementations that support it.

        :param quant_key: QuantKey object that describes the quantization op
        :return: is fusion supported for this type of quantization
        """
        return False


class MLAAttentionImpl(AttentionImpl[T], Generic[T]):

    @abstractmethod
    def forward(
        self,
        layer: AttentionLayer,
        hidden_states_or_cq: torch.Tensor,
        kv_c_normed: torch.Tensor,
        k_pe: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: T,
        output: Optional[torch.Tensor] = None,
        output_scale: Optional[torch.Tensor] = None,
        output_block_scale: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        raise NotImplementedError


def is_quantized_kv_cache(kv_cache_dtype: str) -> bool:
    return kv_cache_dtype != "auto"
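For orientation (not part of this diff): a minimal sketch of a backend satisfying the AttentionImpl contract above, using plain PyTorch SDPA. The class name and body are illustrative assumptions; a real backend must also handle kv_cache and attn_metadata.

# Illustrative sketch only -- NOT part of the diff.
import torch
import torch.nn.functional as F


class NaiveSDPAImpl(AttentionImpl[AttentionMetadata]):

    def __init__(self, num_heads, head_size, scale, num_kv_heads=None,
                 alibi_slopes=None, sliding_window=None,
                 kv_cache_dtype="auto", logits_soft_cap=None,
                 attn_type=AttentionType.DECODER,
                 kv_sharing_target_layer_name=None) -> None:
        self.scale = scale

    def forward(self, layer, query, key, value, kv_cache, attn_metadata,
                output=None, output_scale=None,
                output_block_scale=None) -> torch.Tensor:
        # Inputs are (num_tokens, num_heads, head_size); SDPA expects
        # (batch, heads, seq, head_size), so add a batch dim and transpose.
        q, k, v = (x.unsqueeze(0).transpose(1, 2)
                   for x in (query, key, value))
        out = F.scaled_dot_product_attention(q, k, v, scale=self.scale)
        return out.transpose(1, 2).squeeze(0)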
33
vllm/attention/backends/utils.py
Normal file
@@ -0,0 +1,33 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention backend utils"""
from dataclasses import dataclass
from typing import Optional

from vllm.config import ModelConfig
from vllm.logger import init_logger

logger = init_logger(__name__)

PAD_SLOT_ID = -1


@dataclass
class MLADims:
    q_lora_rank: Optional[int]
    kv_lora_rank: int
    qk_nope_head_dim: int
    qk_rope_head_dim: int
    v_head_dim: int


def get_mla_dims(model_config: ModelConfig) -> MLADims:
    hf_text_config = model_config.hf_text_config

    return MLADims(
        q_lora_rank=getattr(hf_text_config, "q_lora_rank", None),
        kv_lora_rank=hf_text_config.kv_lora_rank,
        qk_nope_head_dim=hf_text_config.qk_nope_head_dim,
        qk_rope_head_dim=hf_text_config.qk_rope_head_dim,
        v_head_dim=hf_text_config.v_head_dim,
    )
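A worked example of how these dimensions compose (the numbers are DeepSeek-V2-style assumptions, not values taken from this diff):

# Illustration only -- NOT part of the diff.
dims = MLADims(q_lora_rank=1536, kv_lora_rank=512,
               qk_nope_head_dim=128, qk_rope_head_dim=64, v_head_dim=128)

# Each query/key head concatenates a non-RoPE and a RoPE component:
qk_head_dim = dims.qk_nope_head_dim + dims.qk_rope_head_dim   # 192
# The compressed KV cache stores one latent vector plus the shared RoPE
# key per token, independent of the number of heads:
kv_entry_dim = dims.kv_lora_rank + dims.qk_rope_head_dim      # 576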
645
vllm/attention/layer.py
Normal file
@@ -0,0 +1,645 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer."""
from typing import List, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F

import vllm.envs as envs
from vllm.attention import AttentionType
from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.selector import backend_name_to_enum, get_attn_backend
from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
from vllm.config import CacheConfig, get_current_vllm_config
from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                          has_kv_transfer_group,
                                          is_v1_kv_transfer_group)
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.logger import init_logger
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    GroupShape)
from vllm.model_executor.models.vision import get_vit_attn_backend
from vllm.platforms import _Backend, current_platform
from vllm.utils import GiB_bytes, direct_register_custom_op

logger = init_logger(__name__)
USE_XFORMERS_OPS = None
try:
    tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, )
except AttributeError:
    tag_cudagraph_unsafe = ()  # type: ignore[assignment]


def check_xformers_availability():
    global USE_XFORMERS_OPS
    if USE_XFORMERS_OPS is not None:
        return USE_XFORMERS_OPS

    if current_platform.is_cuda() and current_platform.has_device_capability(
            100):
        # Xformers FA is not compatible with B200
        USE_XFORMERS_OPS = False
    else:
        try:
            from importlib.util import find_spec

            find_spec("xformers.ops")
            USE_XFORMERS_OPS = True
        except ImportError:
            USE_XFORMERS_OPS = False

    # the warning only needs to be shown once
    if not USE_XFORMERS_OPS:
        logger.warning("Xformers is not available, falling back.")

    return USE_XFORMERS_OPS


def check_upstream_fa_availability(dtype: torch.dtype):
    if dtype in (torch.float16, torch.bfloat16) and current_platform.is_cuda(
    ) and current_platform.has_device_capability(80):
        from transformers.utils import is_flash_attn_2_available
        return is_flash_attn_2_available()
    return False


class Attention(nn.Module, AttentionLayerBase):
    """Attention layer.

    This class takes query, key, and value tensors as input. The input tensors
    can either contain prompt tokens or generation tokens.
    The class does the following:

    1. Store the input key and value tensors in the KV cache.
    2. Perform (multi-head/multi-query/grouped-query) attention.
    3. Return the output tensor.
    """

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: Optional[int] = None,
        alibi_slopes: Optional[List[float]] = None,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        logits_soft_cap: Optional[float] = None,
        per_layer_sliding_window: Optional[int] = None,
        use_mla: bool = False,
        use_sparse: bool = False,
        prefix: str = "",
        attn_type: str = AttentionType.DECODER,
        kv_sharing_target_layer_name: Optional[str] = None,
        attn_backend: Optional[type[AttentionBackend]] = None,
        **extra_impl_args,
    ) -> None:
        """
        The KV cache is stored inside this class and is accessed via
        `self.kv_cache`.
        """
        super().__init__()
        if per_layer_sliding_window is not None:
            # per-layer sliding window
            sliding_window = per_layer_sliding_window
        elif cache_config is not None:
            # model-level sliding window
            sliding_window = cache_config.sliding_window
        else:
            sliding_window = None

        if cache_config is not None:
            kv_cache_dtype = cache_config.cache_dtype
            block_size = cache_config.block_size
            calculate_kv_scales = cache_config.calculate_kv_scales
        else:
            kv_cache_dtype = "auto"
            block_size = 16
            calculate_kv_scales = False
        if num_kv_heads is None:
            num_kv_heads = num_heads
        assert num_heads % num_kv_heads == 0, \
            f"num_heads ({num_heads}) is not " \
            f"divisible by num_kv_heads ({num_kv_heads})"

        # The default k/v_scale is set to 1.0. This is ignored
        # when kv-cache is not fp8, and should be used with
        # kv-cache in fp8_e5m2. For kv-cache in fp8_e4m3, we
        # expect the pre-quantized k/v_scale to be loaded along
        # with the model weights.
        self.kv_cache_dtype = kv_cache_dtype
        self.calculate_kv_scales = calculate_kv_scales
        self._k_scale = torch.tensor(1.0, dtype=torch.float32)
        self._v_scale = torch.tensor(1.0, dtype=torch.float32)
        # FlashAttn doesn't support quantizing the kv-cache only
        # but requires q to be quantized as well.
        self._q_scale = torch.tensor(1.0, dtype=torch.float32)
        self._prob_scale = torch.tensor(1.0, dtype=torch.float32)

        # We also keep q/k/v_scale on host (cpu) memory for attention
        # backends that require the scales to be on host instead of on device.
        # e.g. Flashinfer
        self._q_scale_float = 1.0
        self._k_scale_float = 1.0
        self._v_scale_float = 1.0

        # The output scale on host memory. This should be the input scale of
        # the quant op after this attention layer.
        self._o_scale_float: Optional[float] = None

        self.use_mla = use_mla
        self.use_sparse = use_sparse
        self.num_heads = num_heads
        self.head_size = head_size
        self.num_kv_heads = num_kv_heads
        self.sliding_window = sliding_window
        self.has_sink = extra_impl_args.get("sinks") is not None

        quant_method = quant_config.get_quant_method(
            self, prefix=prefix) if quant_config else None
        if quant_method is not None and not isinstance(
                quant_method, UnquantizedLinearMethod):
            assert isinstance(quant_method, BaseKVCacheMethod)
            # TODO (mgoin): kv cache dtype should be specified in the FP8
            # checkpoint config and become the "auto" behavior
            if self.kv_cache_dtype == "fp8_e5m2":
                raise ValueError("fp8_e5m2 kv-cache is not supported with "
                                 "fp8 checkpoints.")
            # If quantization is enabled, we make "k_scale" and "v_scale"
            # parameters so that it can be loaded from the model checkpoint.
            # The k/v_scale will then be converted back to native float32
            # values after weight loading.
            self.quant_method = quant_method
            self.quant_method.create_weights(self)

        # During model initialization, the default dtype is set as the model
        # weight and activation dtype.
        dtype = torch.get_default_dtype()
        if attn_backend is None:
            self.attn_backend = get_attn_backend(head_size,
                                                 dtype,
                                                 kv_cache_dtype,
                                                 block_size,
                                                 use_mla=use_mla,
                                                 has_sink=self.has_sink,
                                                 use_sparse=use_sparse)
        else:
            self.attn_backend = attn_backend

        impl_cls = self.attn_backend.get_impl_cls()
        self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
                             alibi_slopes, sliding_window, kv_cache_dtype,
                             logits_soft_cap, attn_type,
                             kv_sharing_target_layer_name, **extra_impl_args)
        self.backend = backend_name_to_enum(self.attn_backend.get_name())
        self.dtype = dtype

        # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
        # torch.compile works by registering the attention as one giant
        # opaque custom op. For other platforms, we directly call them
        # and let torch.compile handle them.
        self.use_direct_call = not current_platform.opaque_attention_op()

        self.use_output = self.attn_backend.accept_output_buffer
        compilation_config = get_current_vllm_config().compilation_config
        if prefix in compilation_config.static_forward_context:
            raise ValueError(f"Duplicate layer name: {prefix}")
        compilation_config.static_forward_context[prefix] = self
        self.layer_name = prefix
        self.attn_type = attn_type

        if kv_sharing_target_layer_name is not None:
            validate_kv_sharing_target(
                prefix,
                kv_sharing_target_layer_name,
                compilation_config.static_forward_context,
            )
        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name

        # use a placeholder kv cache tensor during init, which will be replaced
        # by bind_kv_cache
        # this variable will not be accessed if use_direct_call is True
        self.kv_cache = [
            torch.tensor([]) for _ in range(get_current_vllm_config(
            ).parallel_config.pipeline_parallel_size)
        ]

        try:
            self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT,
                                        dtype=torch.float32)
            self.k_range = torch.tensor(envs.K_SCALE_CONSTANT,
                                        dtype=torch.float32)
            self.v_range = torch.tensor(envs.V_SCALE_CONSTANT,
                                        dtype=torch.float32)
        except torch.cuda.OutOfMemoryError as e:
            logger.error(
                "Failed to initialize attention q/k/v range constants: %s", e)
            if torch.cuda.is_available():
                logger.debug("CUDA device: %s", torch.cuda.current_device())
                logger.debug("Allocated: %.2f GiB",
                             torch.cuda.memory_allocated() / GiB_bytes)
                logger.debug("Reserved: %.2f GiB",
                             torch.cuda.memory_reserved() / GiB_bytes)
            raise RuntimeError(
                "Failed to initialize q/k/v range constants. "
                "This may be caused by insufficient memory to allocate "
                "kv cache.") from e

        # for attn backends supporting query quantization
        self.query_quant = None
        if self.kv_cache_dtype.startswith(
                "fp8") and self.attn_backend.supports_quant_query_input:
            self.query_quant = QuantFP8(static=True,
                                        group_shape=GroupShape.PER_TENSOR)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        # For some alternate attention backends like MLA the attention output
        # shape does not match the query shape, so we optionally let the model
        # definition specify the output tensor shape.
        output_shape: Optional[torch.Size] = None,
    ) -> torch.Tensor:
        """
        The KV cache is stored inside this class and is accessed via
        `self.kv_cache`.

        Attention metadata (`attn_metadata`) is set using a context manager in
        the model runner's `execute_model` method. It is accessed via forward
        context using
        `vllm.forward_context.get_forward_context().attn_metadata`.
        """
        if self.calculate_kv_scales:
            attn_metadata = get_forward_context().attn_metadata
            if attn_metadata.enable_kv_scales_calculation:
                self.calc_kv_scales(query, key, value)

        output_dtype = query.dtype
        if self.query_quant is not None:
            # quantizing with a simple torch operation enables
            # torch.compile to fuse this into previous ops
            # which reduces overheads during decoding.
            # Otherwise queries are quantized using custom ops
            # which causes decoding overheads
            assert self.kv_cache_dtype in {"fp8", "fp8_e4m3"}
            query, _ = self.query_quant(query, self._q_scale)

        if self.use_output:
            output_shape = (output_shape
                            if output_shape is not None else query.shape)
            output = torch.zeros(output_shape,
                                 dtype=output_dtype,
                                 device=query.device)
            hidden_size = output_shape[-1]
            # We skip reshaping query, key and value tensors for the MLA
            # backend since these tensors have different semantics and are
            # processed differently.
            if not self.use_mla:
                # Reshape the query, key, and value tensors.
                # NOTE(woosuk): We do this outside the custom op to minimize the
                # CPU overheads from the non-CUDA-graph regions.
                query = query.view(-1, self.num_heads, self.head_size)
                output = output.view(-1, self.num_heads, self.head_size)
                if key is not None:
                    key = key.view(-1, self.num_kv_heads, self.head_size)
                if value is not None:
                    value = value.view(-1, self.num_kv_heads, self.head_size)
            if self.use_direct_call:
                forward_context: ForwardContext = get_forward_context()
                attn_metadata = forward_context.attn_metadata
                if isinstance(attn_metadata, dict):
                    attn_metadata = attn_metadata[self.layer_name]
                self_kv_cache = self.kv_cache[forward_context.virtual_engine]
                self.impl.forward(self,
                                  query,
                                  key,
                                  value,
                                  self_kv_cache,
                                  attn_metadata,
                                  output=output)
            else:
                torch.ops.vllm.unified_attention_with_output(
                    query, key, value, output, self.layer_name)
            return output.view(-1, hidden_size)
        else:
            if self.use_direct_call:
                forward_context = get_forward_context()
                attn_metadata = forward_context.attn_metadata
                if isinstance(attn_metadata, dict):
                    attn_metadata = attn_metadata[self.layer_name]
                self_kv_cache = self.kv_cache[forward_context.virtual_engine]
                return self.impl.forward(self, query, key, value,
                                         self_kv_cache, attn_metadata)
            else:
                return torch.ops.vllm.unified_attention(
                    query, key, value, self.layer_name)

    def calc_kv_scales(self, query, key, value):
        self._q_scale.copy_(torch.abs(query).max() / self.q_range)
        self._k_scale.copy_(torch.abs(key).max() / self.k_range)
        self._v_scale.copy_(torch.abs(value).max() / self.v_range)
        self._q_scale_float = self._q_scale.item()
        self._k_scale_float = self._k_scale.item()
        self._v_scale_float = self._v_scale.item()
        # We only calculate the scales once
        self.calculate_kv_scales = False

    def extra_repr(self) -> str:
        s = f"head_size={self.impl.head_size}"  # type: ignore
        s += f", num_heads={self.impl.num_heads}"  # type: ignore
        s += f", num_kv_heads={self.impl.num_kv_heads}"  # type: ignore
        s += f", scale={self.impl.scale}"  # type: ignore
        s += f", backend={self.impl.__class__.__name__}"
        return s

    def process_weights_after_loading(self, act_dtype: torch.dtype):
        if hasattr(self.impl, "process_weights_after_loading"):
            self.impl.process_weights_after_loading(act_dtype)

        # FlashInfer requires attention sinks to be float32
        if (self.backend == _Backend.FLASHINFER
                and hasattr(self.impl, 'sinks')):
            from vllm.v1.attention.backends.flashinfer import FlashInferImpl
            assert isinstance(self.impl, FlashInferImpl)
            if (self.impl.sinks is not None
                    and self.impl.sinks.dtype != torch.float32):
                self.impl.sinks = self.impl.sinks.to(torch.float32)

    def get_attn_backend(self) -> type[AttentionBackend]:
        return self.attn_backend


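To make the dynamic-scale path above concrete, here is a standalone sketch of the arithmetic in calc_kv_scales; Q_RANGE is an assumed stand-in for envs.Q_SCALE_CONSTANT:

# Standalone sketch -- NOT part of the diff. scale = amax(|x|) / range,
# so x / scale has magnitude at most `range`, keeping quantized values
# inside the representable fp8 window.
import torch

Q_RANGE = 200.0                     # assumed stand-in for Q_SCALE_CONSTANT
query = torch.randn(16, 32, 64)     # (num_tokens, num_heads, head_size)

q_scale = query.abs().max() / Q_RANGE
q_fp8 = (query / q_scale).to(torch.float8_e4m3fn)   # quantize
q_dequant = q_fp8.to(torch.float32) * q_scale       # approximate round-trip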
class MultiHeadAttention(nn.Module):
    """Multi-headed attention without any cache, used for ViT."""

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: Optional[int] = None,
    ):
        super().__init__()
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = scale
        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads

        assert self.num_heads % self.num_kv_heads == 0, \
            f"num_heads ({self.num_heads}) is not " \
            f"divisible by num_kv_heads ({self.num_kv_heads})"
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads

        # During model initialization, the default dtype is set as the model
        # weight and activation dtype.
        dtype = torch.get_default_dtype()

        # Determine the attention backend
        backend = get_vit_attn_backend(head_size=head_size, dtype=dtype)

        # Some auto-selected backends can be upgraded
        # to upstream flash attention if available.
        # If vllm native fa is selected, we use it directly.
        use_upstream_fa = False
        if backend != _Backend.FLASH_ATTN and check_upstream_fa_availability(
                dtype):
            backend = _Backend.FLASH_ATTN
            use_upstream_fa = True

        if current_platform.is_rocm() or current_platform.is_xpu():
            # currently, only torch_sdpa is supported on rocm/xpu
            self.attn_backend = _Backend.TORCH_SDPA
        else:
            self.attn_backend = backend if backend in {
                _Backend.TORCH_SDPA,
                _Backend.XFORMERS,
                _Backend.PALLAS,
                _Backend.ROCM_AITER_FA,
                _Backend.FLASH_ATTN,
            } else _Backend.TORCH_SDPA

            if (self.attn_backend == _Backend.XFORMERS
                    and not check_xformers_availability()):
                self.attn_backend = _Backend.TORCH_SDPA

        if self.attn_backend == _Backend.FLASH_ATTN:
            if use_upstream_fa:
                from flash_attn import flash_attn_varlen_func
                self._flash_attn_varlen_func = flash_attn_varlen_func
            else:
                from vllm.vllm_flash_attn import flash_attn_varlen_func
                self._flash_attn_varlen_func = flash_attn_varlen_func

        logger.info_once(
            f"MultiHeadAttention attn_backend: {self.attn_backend}, "
            f"use_upstream_fa: {use_upstream_fa}")

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
    ) -> torch.Tensor:
        """Input shape:
        (batch_size x seq_len x hidden_size) or
        (batch_size x seq_len x num_heads x head_size)
        """
        bsz, q_len = query.size()[:2]
        kv_len = key.size(1)

        query = query.view(bsz, q_len, self.num_heads, self.head_size)
        key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size)
        value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size)

        if (num_repeat := self.num_queries_per_kv) > 1:
            # Handle MQA and GQA
            key = torch.repeat_interleave(key, num_repeat, dim=2)
            value = torch.repeat_interleave(value, num_repeat, dim=2)

        if self.attn_backend == _Backend.FLASH_ATTN:
            cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len,
                                        step=q_len,
                                        dtype=torch.int32,
                                        device=query.device)
            cu_seqlens_k = torch.arange(0, (bsz + 1) * kv_len,
                                        step=kv_len,
                                        dtype=torch.int32,
                                        device=key.device)

            out = self._flash_attn_varlen_func(
                query.flatten(0, 1),
                key.flatten(0, 1),
                value.flatten(0, 1),
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=q_len,
                max_seqlen_k=kv_len,
                softmax_scale=self.scale,
            )
        elif self.attn_backend == _Backend.XFORMERS:
            from xformers import ops as xops

            out = xops.memory_efficient_attention_forward(query,
                                                          key,
                                                          value,
                                                          scale=self.scale)
        elif self.attn_backend == _Backend.TORCH_SDPA:
            query, key, value = (x.transpose(1, 2)
                                 for x in (query, key, value))
            out = F.scaled_dot_product_attention(query,
                                                 key,
                                                 value,
                                                 scale=self.scale)
            out = out.transpose(1, 2)
        elif self.attn_backend == _Backend.PALLAS:
            query, key, value = (x.transpose(1, 2)
                                 for x in (query, key, value))
            from torch_xla.experimental.custom_kernel import flash_attention
            out = flash_attention(query, key, value, sm_scale=self.scale)
            out = out.transpose(1, 2)
        elif self.attn_backend == _Backend.ROCM_AITER_FA:
            from aiter import flash_attn_varlen_func

            # ROCm Flash Attention expects (batch, seq, heads, head_dim)
            out = flash_attn_varlen_func(query,
                                         key,
                                         value,
                                         softmax_scale=self.scale)
        else:
            # This backend is not yet supported by ViT attention.
            raise NotImplementedError(
                f"ViT attention does not support the "
                f"{self.attn_backend} backend yet.")

        return out.reshape(bsz, q_len, -1)


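A worked example (not part of the diff) of the MQA/GQA expansion used in MultiHeadAttention.forward above: with 32 query heads and 8 KV heads, each KV head serves 32 // 8 = 4 query heads.

import torch

bsz, seq, head_size = 2, 5, 64
num_heads, num_kv_heads = 32, 8
num_queries_per_kv = num_heads // num_kv_heads   # 4

key = torch.randn(bsz, seq, num_kv_heads, head_size)
key = torch.repeat_interleave(key, num_queries_per_kv, dim=2)
assert key.shape == (bsz, seq, num_heads, head_size)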
def wait_for_kv_layer_from_connector(layer_name: str):
    if not has_kv_transfer_group() or not is_v1_kv_transfer_group():
        return

    connector = get_kv_transfer_group()

    forward_context: ForwardContext = get_forward_context()
    attn_metadata = forward_context.attn_metadata
    if attn_metadata is None:
        return
    assert isinstance(attn_metadata, dict)
    connector.wait_for_layer_load(layer_name)


def maybe_save_kv_layer_to_connector(
    layer_name: str,
    kv_cache_layer: List[torch.Tensor],
):
    if not has_kv_transfer_group() or not is_v1_kv_transfer_group():
        return

    connector = get_kv_transfer_group()

    forward_context: ForwardContext = get_forward_context()
    attn_metadata = forward_context.attn_metadata
    if attn_metadata is None:
        return
    assert isinstance(attn_metadata, dict)
    connector.save_kv_layer(layer_name, kv_cache_layer,
                            attn_metadata[layer_name])


def unified_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    layer_name: str,
) -> torch.Tensor:
    wait_for_kv_layer_from_connector(layer_name)

    forward_context: ForwardContext = get_forward_context()
    attn_metadata = forward_context.attn_metadata
    if isinstance(attn_metadata, dict):
        attn_metadata = attn_metadata[layer_name]
    self = forward_context.no_compile_layers[layer_name]
    kv_cache = self.kv_cache[forward_context.virtual_engine]
    output = self.impl.forward(self, query, key, value, kv_cache,
                               attn_metadata)

    maybe_save_kv_layer_to_connector(layer_name, kv_cache)
    return output


def unified_attention_fake(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    layer_name: str,
) -> torch.Tensor:
    return torch.empty_like(query).contiguous()


direct_register_custom_op(
    op_name="unified_attention",
    op_func=unified_attention,
    fake_impl=unified_attention_fake,
    tags=tag_cudagraph_unsafe,
)


def unified_attention_with_output(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    output: torch.Tensor,
    layer_name: str,
    output_scale: Optional[torch.Tensor] = None,
    output_block_scale: Optional[torch.Tensor] = None,
) -> None:
    wait_for_kv_layer_from_connector(layer_name)
    forward_context: ForwardContext = get_forward_context()
    attn_metadata = forward_context.attn_metadata
    if isinstance(attn_metadata, dict):
        attn_metadata = attn_metadata[layer_name]
    self = forward_context.no_compile_layers[layer_name]
    kv_cache = self.kv_cache[forward_context.virtual_engine]
    self.impl.forward(self,
                      query,
                      key,
                      value,
                      kv_cache,
                      attn_metadata,
                      output=output,
                      output_scale=output_scale,
                      output_block_scale=output_block_scale)

    maybe_save_kv_layer_to_connector(layer_name, kv_cache)


def unified_attention_with_output_fake(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    output: torch.Tensor,
    layer_name: str,
    output_scale: Optional[torch.Tensor] = None,
    output_block_scale: Optional[torch.Tensor] = None,
) -> None:
    return


direct_register_custom_op(
    op_name="unified_attention_with_output",
    op_func=unified_attention_with_output,
    mutates_args=["output", "output_block_scale"],
    fake_impl=unified_attention_with_output_fake,
    tags=tag_cudagraph_unsafe,
)
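A sketch (not part of the diff) of how the op registered above is invoked: Attention.forward dispatches to it when use_direct_call is False, and the fake impl only describes shapes and aliasing so torch.compile can trace the call. Running this requires an active vLLM forward context, and the layer name below is hypothetical.

import torch

query = torch.randn(10, 32, 64)   # (num_tokens, num_heads, head_size)
key = torch.randn(10, 8, 64)      # (num_tokens, num_kv_heads, head_size)
value = torch.randn(10, 8, 64)
output = torch.zeros_like(query)  # written in place by the op

torch.ops.vllm.unified_attention_with_output(
    query, key, value, output, "model.layers.0.self_attn.attn")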