support torchair mode (#2641)
### What this PR does / why we need it?
support torchair mode
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
- vLLM version: v0.10.1.1
- vLLM main:
5438967fbc
Signed-off-by: zhangdepeng <zhangdepeng2@huawei.com>
Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: zhangdepeng <zhangdepeng2@huawei.com>
This commit is contained in:
@@ -43,6 +43,7 @@ The details of each config option are as follows:

| Name | Type | Default | Description |
| ---- | ---- | ------- | ----------- |
| `enabled` | bool | `False` | Whether to enable torchair graph mode. Currently only DeepSeek series models and PanguProMoE are supported to use torchair graph mode |
| `mode` | str | `None` | When using reduce-overhead mode for torchair, mode needs to be set |
| `enable_multistream_mla` | bool | `False` | Whether to put vector ops of MLA to another stream. This option only takes effect on models using MLA (e.g., DeepSeek). |
| `enable_multistream_moe` | bool | `False` | Whether to enable multistream shared expert. This option only takes effect on DeepSeek moe models. |
| `enable_view_optimize` | bool | `True` | Whether to enable torchair view optimization |
@@ -46,6 +46,7 @@ class TestAscendConfig(TestBase):

        torchair_graph_config = ascend_config.torchair_graph_config
        self.assertFalse(torchair_graph_config.enabled)
        self.assertEqual(torchair_graph_config.mode, '')
        self.assertFalse(torchair_graph_config.use_cached_graph)
        self.assertEqual(torchair_graph_config.graph_batch_sizes, [])
        self.assertFalse(torchair_graph_config.graph_batch_sizes_init)
@@ -294,6 +295,17 @@ class TestAscendConfig(TestBase):
            }
            init_ascend_config(test_vllm_config)

        # mode should not be configured without torchair graph mode
        with self.assertRaises(RuntimeError):
            test_vllm_config.additional_config = {
                "torchair_graph_config": {
                    "enabled": False,
                    "mode": 'max-autotune',
                },
                "refresh": True
            }
            init_ascend_config(test_vllm_config)

        # enable_kv_nz should not be enabled without torchair graph mode
        with self.assertRaises(RuntimeError):
            test_vllm_config.additional_config = {
@@ -70,6 +70,7 @@ class TorchairGraphConfig:

    def __init__(self, torchair_graph_config):
        self.enabled = torchair_graph_config.get("enabled", False)
        self.mode = torchair_graph_config.get("mode", '')
        self.use_cached_graph = torchair_graph_config.get(
            "use_cached_graph", False)
        self.graph_batch_sizes = torchair_graph_config.get(
@@ -91,6 +92,9 @@ class TorchairGraphConfig:
                "graph_batch_sizes_init is only valid when graph_batch_sizes is empty"
            )
        if not self.enabled:
            if self.mode:
                raise RuntimeError(
                    "mode is valid only when Torchair graph mode is enabled")
            if self.use_cached_graph:
                raise RuntimeError(
                    "use_cached_graph is valid only when Torchair graph mode is enabled"
@@ -324,6 +324,8 @@ class NPUTorchairModelRunner(NPUModelRunner):
            communication_adaptation_310p()

        config = torchair.CompilerConfig()
        if get_ascend_config().torchair_graph_config.mode:
            config.mode = get_ascend_config().torchair_graph_config.mode
        config.experimental_config.frozen_parameter = True
        # enabling tiling_schedule_optimize on 300I Duo has some bugs, so we have to
        # disable it on 300I Duo platform now.
Reference in New Issue
Block a user