[Feat](sfa,dcp) support dcp for sfa (#6563)
### What this PR does / why we need it? This PR adds DCP support to the SFA backend. Please note that due to operator constraints, the current implementation has to all-gather the entire KV cache and modify the block table to satisfy the operator's input requirements. This results in significantly increased communication overhead and peak memory usage. Therefore, this is only a temporary workaround and will be refactored once the operator provides proper support. Additionally, because of the above limitations, `cp_kv_cache_interleave_size` is currently required to be equal to `block_size`. This restriction will also be removed after the refactor. #### Test Accuracy was tested using DeepSeek-V3.2-Exp-W8A8 with dp2tp8dcp8. | dataset | version | metric | mode | vllm-api-general-stream | |----- | ----- | ----- | ----- | -----| | gsm8kdataset | - | accuracy | gen | 96.35 | - vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0 --------- Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
This commit is contained in:
@@ -378,10 +378,11 @@ class NPUPlatform(Platform):
|
||||
vllm_config.scheduler_config.enable_chunked_prefill = True
|
||||
vllm_config.scheduler_config.SLO_limits_for_dynamic_batch = ascend_config.SLO_limits_for_dynamic_batch
|
||||
|
||||
cp_size = parallel_config.decode_context_parallel_size * parallel_config.prefill_context_parallel_size
|
||||
if (
|
||||
vllm_config.kv_transfer_config is not None
|
||||
and cache_config.block_size != parallel_config.cp_kv_cache_interleave_size
|
||||
and parallel_config.decode_context_parallel_size * parallel_config.prefill_context_parallel_size > 1
|
||||
and cp_size > 1
|
||||
):
|
||||
raise AssertionError(
|
||||
f"cp_kv_cache_interleave_size({parallel_config.cp_kv_cache_interleave_size}) "
|
||||
@@ -389,6 +390,20 @@ class NPUPlatform(Platform):
|
||||
"needs to be equal if use pcp or dcp > 1 in P/D disaggregate and kv pool scenario."
|
||||
)
|
||||
|
||||
use_sparse = (
|
||||
model_config is not None
|
||||
and model_config.hf_text_config is not None
|
||||
and hasattr(model_config.hf_text_config, "index_topk")
|
||||
)
|
||||
if use_sparse and cp_size > 1 and parallel_config.cp_kv_cache_interleave_size != cache_config.block_size:
|
||||
logger.warning_once(
|
||||
"The current SFA's PCP&DCP implementation requires"
|
||||
f"cp_kv_cache_interleave_size({parallel_config.cp_kv_cache_interleave_size})"
|
||||
f" == block_size({cache_config.block_size}). "
|
||||
f"Override cp_kv_cache_interleave_size to {cache_config.block_size}."
|
||||
)
|
||||
vllm_config.parallel_config.cp_kv_cache_interleave_size = cache_config.block_size
|
||||
|
||||
if is_vl_model(vllm_config):
|
||||
if bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM", "0"))) or bool(
|
||||
int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM1", "0"))
|
||||
|
||||
Reference in New Issue
Block a user