Upgrade to vllm 0.17.0 corex v4.1 overlay
This commit is contained in:
@@ -419,6 +419,7 @@ class EngineArgs:
|
||||
enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
|
||||
moe_backend: MoEBackend = KernelConfig.moe_backend
|
||||
all2all_backend: All2AllBackend = ParallelConfig.all2all_backend
|
||||
enable_elastic_ep: bool = ParallelConfig.enable_elastic_ep
|
||||
enable_dbo: bool = ParallelConfig.enable_dbo
|
||||
ubatch_size: int = ParallelConfig.ubatch_size
|
||||
dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
|
||||
@@ -896,6 +897,9 @@ class EngineArgs:
|
||||
"--ubatch-size",
|
||||
**parallel_kwargs["ubatch_size"],
|
||||
)
|
||||
parallel_group.add_argument(
|
||||
"--enable-elastic-ep", **parallel_kwargs["enable_elastic_ep"]
|
||||
)
|
||||
parallel_group.add_argument(
|
||||
"--dbo-decode-token-threshold",
|
||||
**parallel_kwargs["dbo_decode_token_threshold"],
|
||||
@@ -1321,6 +1325,7 @@ class EngineArgs:
|
||||
"launched vLLM.",
|
||||
self.seed,
|
||||
)
|
||||
|
||||
return ModelConfig(
|
||||
model=self.model,
|
||||
model_weights=self.model_weights,
|
||||
@@ -1697,6 +1702,7 @@ class EngineArgs:
|
||||
is_moe_model=model_config.is_moe,
|
||||
enable_expert_parallel=self.enable_expert_parallel,
|
||||
all2all_backend=self.all2all_backend,
|
||||
enable_elastic_ep=self.enable_elastic_ep,
|
||||
enable_dbo=self.enable_dbo,
|
||||
ubatch_size=self.ubatch_size,
|
||||
dbo_decode_token_threshold=self.dbo_decode_token_threshold,
|
||||
@@ -1905,6 +1911,7 @@ class EngineArgs:
|
||||
performance_mode=self.performance_mode,
|
||||
weight_transfer_config=self.weight_transfer_config,
|
||||
)
|
||||
|
||||
return config
|
||||
|
||||
def _check_feature_supported(self):
|
||||
@@ -2074,20 +2081,19 @@ class EngineArgs:
|
||||
)
|
||||
|
||||
# Disable chunked prefill and prefix caching for:
|
||||
# POWER (ppc64le)/RISCV CPUs in V1
|
||||
# RISCV CPUs in V1
|
||||
if current_platform.is_cpu() and current_platform.get_cpu_architecture() in (
|
||||
CpuArchEnum.POWERPC,
|
||||
CpuArchEnum.RISCV,
|
||||
):
|
||||
logger.info(
|
||||
"Chunked prefill is not supported for POWER, "
|
||||
"and RISC-V CPUs; "
|
||||
"Chunked prefill is not supported for"
|
||||
"RISC-V CPUs; "
|
||||
"disabling it for V1 backend."
|
||||
)
|
||||
self.enable_chunked_prefill = False
|
||||
logger.info(
|
||||
"Prefix caching is not supported for POWER, "
|
||||
"and RISC-V CPUs; "
|
||||
"Prefix caching is not supported for "
|
||||
"RISC-V CPUs; "
|
||||
"disabling it for V1 backend."
|
||||
)
|
||||
self.enable_prefix_caching = False
|
||||
@@ -2181,14 +2187,10 @@ class AsyncEngineArgs(EngineArgs):
|
||||
"--enable-log-requests",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
default=AsyncEngineArgs.enable_log_requests,
|
||||
help="Enable logging requests.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-log-requests",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
default=not AsyncEngineArgs.enable_log_requests,
|
||||
help="[DEPRECATED] Disable logging requests.",
|
||||
deprecated=True,
|
||||
help="Enable logging request information, dependant on log level:\n"
|
||||
"- INFO: Request ID, parameters and LoRA request.\n"
|
||||
"- DEBUG: Prompt inputs (e.g: text, token IDs).\n"
|
||||
"You can set the minimum log level via `VLLM_LOGGING_LEVEL`.",
|
||||
)
|
||||
current_platform.pre_register_and_update(parser)
|
||||
return parser
|
||||
|
||||
Reference in New Issue
Block a user