# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
|
|
|
|
from vllm.v1.executor.abstract import Executor
|
|
|
|
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
|
|
|
|
|
def vllm__v1__executor__abstract__Executor__get_hfu_info(self, batch, input_len, output_len):
    """Collect HFU info from all workers and return the maximum.

    Issues a collective RPC so every worker reports its HFU value
    (presumably hardware FLOPs utilization — confirm against the worker
    implementation) for the given workload shape.

    Args:
        batch: batch size of the profiled workload.
        input_len: input sequence length.
        output_len: output sequence length.

    Returns:
        The maximum value reported across all workers.
    """
    # Pass args as a tuple, consistent with recapture_model below; the
    # original's extra parentheses made this a bare list instead.
    output = self.collective_rpc("get_hfu_info",
                                 args=(batch, input_len, output_len))
    return max(output)
|
|
|
|
def vllm__v1__executor__abstract__Executor__get_mm_encoder_latency(self):
    """Return the worst multimodal-encoder latency across all workers.

    Returns:
        The maximum latency reported by the workers, or ``None`` if any
        worker has no measurement yet.
    """
    latencies = self.collective_rpc("get_mm_encoder_latency")
    # A single missing measurement invalidates the aggregate.
    for latency in latencies:
        if latency is None:
            return None
    return max(latencies)
|
|
|
|
def vllm__v1__executor__abstract__Executor__get_latency(self):
    """Return the worst latency reported by any worker.

    Returns:
        The maximum of the per-worker values gathered via collective RPC.
    """
    per_worker = self.collective_rpc("get_latency")
    return max(per_worker)
|
|
|
|
|
|
def vllm__v1__executor__abstract__Executor__get_memory_usage(self):
    """Return the memory usage reported by the first worker (rank 0).

    Returns:
        The first entry of the values gathered via collective RPC.
    """
    per_worker = self.collective_rpc("get_memory_usage")
    # Rank 0's report is taken as representative for the whole group.
    return per_worker[0]
|
|
|
|
|
|
def vllm__v1__executor__abstract__Executor__recapture_model(
        self, prefill_enable_mlugraph: bool, batch_size: int, input_len: int):
    """Ask every worker to re-capture the model graph.

    Args:
        prefill_enable_mlugraph: whether MLU-graph capture is enabled for
            the prefill phase.
        batch_size: batch size to capture for.
        input_len: input sequence length to capture for.
    """
    rpc_args = (prefill_enable_mlugraph, batch_size, input_len)
    self.collective_rpc("recapture_model", args=rpc_args)
|
|
|
|
|
|
# Register every replacement method on Executor, in the same order as the
# original one-call-per-method sequence.
_EXECUTOR_HIJACKS = (
    ("get_hfu_info", vllm__v1__executor__abstract__Executor__get_hfu_info),
    ("get_latency", vllm__v1__executor__abstract__Executor__get_latency),
    ("get_mm_encoder_latency",
     vllm__v1__executor__abstract__Executor__get_mm_encoder_latency),
    ("get_memory_usage",
     vllm__v1__executor__abstract__Executor__get_memory_usage),
    ("recapture_model",
     vllm__v1__executor__abstract__Executor__recapture_model),
)

for _method_name, _replacement in _EXECUTOR_HIJACKS:
    MluHijackObject.apply_hijack(Executor, _method_name, _replacement)
|