[Misc] Remove VLLM_USE_V1 usage in code (#1764)
We plan to remove the V0 code path starting from this version. The first step is to delete
the V0 (`VLLM_USE_V1`) usage.
Related: https://github.com/vllm-project/vllm-ascend/issues/1620
- vLLM version: v0.9.2
- vLLM main: 61e20828da
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
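For context, a minimal sketch of the behavior this change enforces. It is illustrative only: the import paths are inferred from the patched targets in the unit tests below, and the config values mirror the updated test case.

    # Illustrative sketch only: with the V0/V1 split removed, the
    # torchair-graph-vs-eager check fires unconditionally, with no
    # VLLM_USE_V1 gate in between.
    from vllm.config import VllmConfig

    from vllm_ascend.ascend_config import (check_ascend_config,
                                            init_ascend_config)

    vllm_config = VllmConfig()
    vllm_config.additional_config = {
        "torchair_graph_config": {
            "enabled": True,
        },
        "refresh": True,
    }
    init_ascend_config(vllm_config)

    # Graph mode combined with eager mode is now rejected directly.
    check_ascend_config(vllm_config, enforce_eager=True)  # raises RuntimeError

On the platform side, check_and_update_config now fails fast with ValueError("vLLM Ascend does not support V0 engine"), which is why the V0-only speculative-decode, multi-step, and default worker wiring, and their tests, are deleted below.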
@@ -193,71 +193,48 @@ class TestAscendConfig(TestBase):
     @_clean_up_ascend_config
     def test_check_ascend_config_wrong_case(self):
         test_vllm_config = VllmConfig()
-        # For V0 engine
-        with mock.patch.dict(os.environ, {"VLLM_USE_V1": "0"}):
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, False)
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "ascend_scheduler_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, True)
-        # For V1 engine
-        with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}):
-            # torchair + eager mode
-            with self.assertRaises(RuntimeError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                init_ascend_config(test_vllm_config)
-                enforce_eager = True
-                check_ascend_config(test_vllm_config, enforce_eager)
-            # torchair + non deepseek model
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                model_path = os.path.join(os.path.dirname(__file__),
-                                          "fake_weight")
-                fake_model_config = ModelConfig(model=model_path)
-                fake_model_config.hf_config = PretrainedConfig()
-                fake_model_config.hf_config.model_type = "llama"
-                test_vllm_config.model_config = fake_model_config
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, False)
-            # aclgraph + deepseek model
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": False,
-                    },
-                    "refresh": True
-                }
-                model_path = os.path.join(os.path.dirname(__file__),
-                                          "fake_weight")
-                fake_model_config = ModelConfig(model=model_path)
-                fake_model_config.hf_config = PretrainedConfig()
-                fake_model_config.hf_config.model_type = "deepseek"
-                test_vllm_config.model_config = fake_model_config
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, False)
+        # torchair + eager mode
+        with self.assertRaises(RuntimeError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": True,
+                },
+                "refresh": True
+            }
+            init_ascend_config(test_vllm_config)
+            enforce_eager = True
+            check_ascend_config(test_vllm_config, enforce_eager)
+        # torchair + non deepseek model
+        with self.assertRaises(NotImplementedError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": True,
+                },
+                "refresh": True
+            }
+            model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
+            fake_model_config = ModelConfig(model=model_path)
+            fake_model_config.hf_config = PretrainedConfig()
+            fake_model_config.hf_config.model_type = "llama"
+            test_vllm_config.model_config = fake_model_config
+            init_ascend_config(test_vllm_config)
+            check_ascend_config(test_vllm_config, False)
+        # aclgraph + deepseek model
+        with self.assertRaises(NotImplementedError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": False,
+                },
+                "refresh": True
+            }
+            model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
+            fake_model_config = ModelConfig(model=model_path)
+            fake_model_config.hf_config = PretrainedConfig()
+            fake_model_config.hf_config.model_type = "deepseek"
+            test_vllm_config.model_config = fake_model_config
+            init_ascend_config(test_vllm_config)
+            check_ascend_config(test_vllm_config, False)
 
     def test_check_torchair_supported(self):
         test_cases = [('deepseek_v3', True), ('PanguProMoE', True),
@@ -389,69 +389,6 @@ class TestNPUPlatform(TestBase):
             "vllm_ascend.worker.worker_v1.NPUWorker",
         )
 
-    @patch("vllm_ascend.ascend_config.check_ascend_config")
-    @patch("vllm_ascend.ascend_config.init_ascend_config")
-    @patch("vllm.envs.VLLM_USE_V1", False)
-    def test_check_and_update_config_speculative_worker_config(
-            self, mock_init_ascend, mock_check_ascend):
-        mock_init_ascend.return_value = self.mock_ascend_config
-        self.mock_vllm_config.speculative_config = MagicMock()
-        self.mock_vllm_config.speculative_config.disable_logprobs = True
-        self.mock_vllm_config.parallel_config.worker_cls = "auto"
-
-        with patch.dict("os.environ", {}):
-            from vllm_ascend import platform
-
-            importlib.reload(platform)
-            self.platform.check_and_update_config(self.mock_vllm_config)
-            import os
-
-            self.assertEqual(os.environ.get("ACL_OP_INIT_MODE"), "1")
-            self.assertEqual(
-                self.mock_vllm_config.parallel_config.worker_cls,
-                "vllm.spec_decode.spec_decode_worker.create_spec_worker",
-            )
-            self.assertEqual(
-                self.mock_vllm_config.parallel_config.sd_worker_cls,
-                "vllm_ascend.worker.worker.NPUWorker",
-            )
-
-    @patch("vllm_ascend.ascend_config.check_ascend_config")
-    @patch("vllm_ascend.ascend_config.init_ascend_config")
-    @patch("vllm.envs.VLLM_USE_V1", False)
-    def test_check_and_update_config_multi_step_worker_config(
-            self, mock_init_ascend, mock_check_ascend):
-        mock_init_ascend.return_value = self.mock_ascend_config
-        self.mock_vllm_config.scheduler_config.is_multi_step = True
-        self.mock_vllm_config.parallel_config.worker_cls = "auto"
-
-        from vllm_ascend import platform
-
-        importlib.reload(platform)
-        self.platform.check_and_update_config(self.mock_vllm_config)
-        self.assertEqual(
-            self.mock_vllm_config.parallel_config.worker_cls,
-            "vllm_ascend.worker.multi_step_worker.MultiStepWorker",
-        )
-
-    @patch("vllm_ascend.ascend_config.check_ascend_config")
-    @patch("vllm_ascend.ascend_config.init_ascend_config")
-    @patch("vllm.envs.VLLM_USE_V1", False)
-    def test_check_and_update_config_default_worker_config(
-            self, mock_init_ascend, mock_check_ascend):
-        mock_init_ascend.return_value = self.mock_ascend_config
-        self.mock_vllm_config.parallel_config.worker_cls = "auto"
-        self.mock_vllm_config.scheduler_config.is_multi_step = False
-
-        from vllm_ascend import platform
-
-        importlib.reload(platform)
-        self.platform.check_and_update_config(self.mock_vllm_config)
-        self.assertEqual(
-            self.mock_vllm_config.parallel_config.worker_cls,
-            "vllm_ascend.worker.worker.NPUWorker",
-        )
-
     @patch("vllm_ascend.ascend_config.check_ascend_config")
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     @patch("vllm_ascend.utils.is_310p", return_value=True)
@@ -15,7 +15,6 @@
 # limitations under the License.
 from typing import Optional
 
-import vllm.envs as envs
 from vllm.logger import logger
 
 TORCHAIR_MODEL_LIST = ["deepseek", "pangu"]
@@ -126,46 +125,36 @@ def get_ascend_config():
 def check_ascend_config(vllm_config, enforce_eager):
     ascend_config = get_ascend_config()
 
-    # for v0 engine
-    if not envs.VLLM_USE_V1:
-        if ascend_config.torchair_graph_config.enabled:
-            raise NotImplementedError(
-                "Torchair graph mode is only supported for V1 Engine.")
-        if ascend_config.ascend_scheduler_config.enabled:
-            raise NotImplementedError(
-                "Ascend scheduler is only supported for V1 Engine.")
-    # for v1 engine
-    else:
-        # for eager mode
-        if enforce_eager:
-            # torchair_graph cannot be enabled with eager mode.
-            if ascend_config.torchair_graph_config.enabled:
-                raise RuntimeError(
-                    "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
-                )
-        # for graph mode
-        else:
-            # torchair_graph case
-            if ascend_config.torchair_graph_config.enabled:
-                # torchair_graph is supported for deepseek/pangu model only.
-                if vllm_config.model_config:
-                    model_type = vllm_config.model_config.hf_config.model_type
-                    if not _check_torchair_supported(model_type):
-                        raise NotImplementedError(
-                            "Torchair graph mode only works with following model types:"
-                            f"{TORCHAIR_MODEL_LIST}.")
-            # aclgraph case
-            else:
-                # aclgraph doesn't work with deepseek model and only qwen model is well tested.
-                if vllm_config.model_config:
-                    model_type = vllm_config.model_config.hf_config.model_type
-                    if "deepseek" in model_type:
-                        raise NotImplementedError(
-                            "ACL Graph does not support deepseek. Please "
-                            "try torchair graph mode to serve deepseek models on vllm-ascend."
-                            " Or set `enforce_eager=True` to use eager mode.")
-                    if "qwen" not in model_type:
-                        logger.warning(
-                            "ACL Graph is currently experimental. Please "
-                            "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
-                            " if you encourage any Error")
+    # for eager mode
+    if enforce_eager:
+        # torchair_graph cannot be enabled with eager mode.
+        if ascend_config.torchair_graph_config.enabled:
+            raise RuntimeError(
+                "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
+            )
+    # for graph mode
+    else:
+        # torchair_graph case
+        if ascend_config.torchair_graph_config.enabled:
+            # torchair_graph is supported for deepseek/pangu model only.
+            if vllm_config.model_config:
+                model_type = vllm_config.model_config.hf_config.model_type
+                if not _check_torchair_supported(model_type):
+                    raise NotImplementedError(
+                        "Torchair graph mode only works with following model types:"
+                        f"{TORCHAIR_MODEL_LIST}.")
+        # aclgraph case
+        else:
+            # aclgraph doesn't work with deepseek model and only qwen model is well tested.
+            if vllm_config.model_config:
+                model_type = vllm_config.model_config.hf_config.model_type
+                if "deepseek" in model_type:
+                    raise NotImplementedError(
+                        "ACL Graph does not support deepseek. Please "
+                        "try torchair graph mode to serve deepseek models on vllm-ascend."
+                        " Or set `enforce_eager=True` to use eager mode.")
+                if "qwen" not in model_type:
+                    logger.warning(
+                        "ACL Graph is currently experimental. Please "
+                        "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
+                        " if you encourage any Error")
@@ -30,7 +30,6 @@ from typing import Any, Dict, Iterable, List, Optional, Union
 import torch
 import torch.distributed as dist
 import torch_npu  # noqa: F401
-import vllm.envs as envs
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata
@@ -397,20 +396,17 @@ class CustomDeepseekDBOMLAAttention(DeepseekV2MLAAttention):
             hidden_states_or_q_c = hidden_states
         if self.torchair_graph_enabled:
             forward_kwargs = {}
-            if envs.VLLM_USE_V1:
-                output_shape = hidden_states.shape
-                output = torch.empty(output_shape,
-                                     dtype=hidden_states_or_q_c.dtype,
-                                     device=hidden_states_or_q_c.device)
-                forward_kwargs['output'] = output
-
+            output_shape = hidden_states.shape
+            output = torch.empty(output_shape,
+                                 dtype=hidden_states_or_q_c.dtype,
+                                 device=hidden_states_or_q_c.device)
+            forward_kwargs['output'] = output
             output = self.mla_attn.impl.forward(self.mla_attn,
                                                 hidden_states_or_q_c,
                                                 hidden_states, None, kv_cache,
                                                 attn_metadata,
                                                 **forward_kwargs)
-            if envs.VLLM_USE_V1:
-                output = output.view(-1, output_shape[-1])
+            output = output.view(-1, output_shape[-1])
             return output
         else:
             kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
@@ -885,7 +881,7 @@ class CustomDeepseekDBOModel(nn.Module):
     def can_run_ms(self):
         attn_metadata = get_forward_context().attn_metadata
         # support mla attention and V1 engine at present
-        if not self.use_mla or not envs.VLLM_USE_V1:
+        if not self.use_mla:
             return False
         # enable prefill overlap
         if attn_metadata is None or attn_metadata.num_prefills == 0:
@@ -29,7 +29,6 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import torch
 import torch_npu
-import vllm.envs as envs
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata
@@ -579,20 +578,17 @@ class CustomDeepseekV2MLAAttention(DeepseekV2MLAAttention):
         else:
             hidden_states_or_q_c = hidden_states
         if self.torchair_graph_enabled:
-            if envs.VLLM_USE_V1:
-                output_shape = hidden_states.shape
-                output = torch.empty(output_shape,
-                                     dtype=hidden_states_or_q_c.dtype,
-                                     device=hidden_states_or_q_c.device)
-                forward_kwargs['output'] = output
-
+            output_shape = hidden_states.shape
+            output = torch.empty(output_shape,
+                                 dtype=hidden_states_or_q_c.dtype,
+                                 device=hidden_states_or_q_c.device)
+            forward_kwargs['output'] = output
             output = self.mla_attn.impl.forward(self.mla_attn,
                                                 hidden_states_or_q_c,
                                                 hidden_states, None, kv_cache,
                                                 attn_metadata,
                                                 **forward_kwargs)
-            if envs.VLLM_USE_V1:
-                output = output.view(-1, output_shape[-1])
+            output = output.view(-1, output_shape[-1])
             return output
         else:
             kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
@@ -660,7 +656,7 @@ class CustomDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
                 prefix=f"{prefix}.mlp",
             )
             self.mla_moe_communication = ascend_config.torchair_graph_config.enable_multistream_moe \
-                and model_config.use_mla and envs.VLLM_USE_V1 and self.tp_size > 1
+                and model_config.use_mla and self.tp_size > 1
         else:
             self.mlp = CustomDeepseekV2MLP(
                 hidden_size=config.hidden_size,
@@ -16,7 +16,6 @@
 #
 
 import gc
-import os
 from datetime import timedelta
 from typing import TYPE_CHECKING, Optional, Tuple
 
@@ -117,6 +116,8 @@ class NPUPlatform(Platform):
 
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        if not envs.VLLM_USE_V1:
+            raise ValueError("vLLM Ascend does not support V0 engine")
         # initialize ascend config from vllm additional_config
         ascend_config = init_ascend_config(vllm_config)
 
@@ -180,18 +181,7 @@ class NPUPlatform(Platform):
         update_aclgraph_sizes(vllm_config)
 
         if parallel_config and parallel_config.worker_cls == "auto":
-            if envs.VLLM_USE_V1:
-                parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
-            elif vllm_config.speculative_config:
-                # NOTE: We set this var to `1` in vllm-ascend to avoid segment
-                # fault when using spec decode with V0 engine.
-                os.environ["ACL_OP_INIT_MODE"] = "1"
-                parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker"
-                parallel_config.sd_worker_cls = "vllm_ascend.worker.worker.NPUWorker"
-            elif vllm_config.scheduler_config.is_multi_step:
-                parallel_config.worker_cls = "vllm_ascend.worker.multi_step_worker.MultiStepWorker"
-            else:
-                parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker"
+            parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
 
         if cache_config:
             if cache_config.block_size is None:
@@ -202,20 +192,18 @@ class NPUPlatform(Platform):
                 )
                 cache_config.block_size = 128
 
-        if envs.VLLM_USE_V1:
-            # Activate custom ops for v1, except on 310P
-            if not is_310p():
-                compilation_config.custom_ops = ["all"]
+        # Activate custom ops for v1, except on 310P
+        if not is_310p():
+            compilation_config.custom_ops = ["all"]
 
         # If ascend_scheduler_config is enabled,
         # extents original scheduler_config to use AscendScheduler.
         if ascend_config.ascend_scheduler_config.enabled:
-            from vllm_ascend.core.schedule_config import \
-                AscendSchedulerConfig
+            from vllm_ascend.core.schedule_config import AscendSchedulerConfig
             ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
                 vllm_config.scheduler_config,
                 ascend_config.ascend_scheduler_config)
             vllm_config.scheduler_config = ascend_scheduler_config
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend, head_size, dtype,