diff --git a/examples/disaggregated_prefill_v1/mooncake_connector_store_deployment_guide.md b/examples/disaggregated_prefill_v1/mooncake_connector_store_deployment_guide.md index b91705a..3bf9240 100644 --- a/examples/disaggregated_prefill_v1/mooncake_connector_store_deployment_guide.md +++ b/examples/disaggregated_prefill_v1/mooncake_connector_store_deployment_guide.md @@ -64,6 +64,7 @@ export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json" export VLLM_USE_V1=1 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 export ASCEND_TRANSPORT_PRINT=1 +export ACL_OP_INIT_MODE=1 # The upper boundary environment variable for memory swap logging is set to mooncake, where 1 indicates enabled and 0 indicates disabled. export ASCEND_AGGREGATE_ENABLE=1 # The upper-level environment variable is the switch for enabling the mooncake aggregation function, where 1 means on and 0 means off. @@ -104,6 +105,7 @@ python3 -m vllm.entrypoints.openai.api_server \ { "kv_connector": "MooncakeConnectorStoreV1", "kv_role": "kv_producer", + "mooncake_rpc_port":"0" } ] } @@ -124,6 +126,7 @@ export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm export MOONCAKE_CONFIG_PATH="/xxxxx/mooncake.json" export VLLM_USE_V1=1 export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7 +export ACL_OP_INIT_MODE=1 export ASCEND_TRANSPORT_PRINT=1 # The upper boundary environment variable for memory swap logging is set to mooncake, where 1 indicates enabled and 0 indicates disabled. export ASCEND_AGGREGATE_ENABLE=1 @@ -165,6 +168,7 @@ python3 -m vllm.entrypoints.openai.api_server \ { "kv_connector": "MooncakeConnectorStoreV1", "kv_role": "kv_consumer", + "mooncake_rpc_port":"1" } ] } @@ -223,6 +227,7 @@ export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json" export VLLM_USE_V1=1 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 +export ACL_OP_INIT_MODE=1 export ASCEND_TRANSPORT_PRINT=1 # The upper boundary environment variable for memory swap logging is set to mooncake, where 1 indicates enabled and 0 indicates disabled. export ASCEND_AGGREGATE_ENABLE=1 @@ -242,9 +247,10 @@ python3 -m vllm.entrypoints.openai.api_server \ --kv-transfer-config \ '{ "kv_connector": "MooncakeConnectorStoreV1", - "kv_role": "kv_producer", + "kv_role": "kv_both", "kv_connector_extra_config": { - "use_layerwise": false + "use_layerwise": false, + "mooncake_rpc_port":"0" } }' > mix.log 2>&1 ``` diff --git a/vllm_ascend/distributed/mooncake/mooncake_engine.py b/vllm_ascend/distributed/mooncake/mooncake_engine.py index 53c2724..d89dcd7 100644 --- a/vllm_ascend/distributed/mooncake/mooncake_engine.py +++ b/vllm_ascend/distributed/mooncake/mooncake_engine.py @@ -119,7 +119,7 @@ class MooncakeEngine: if self.use_layerwise: self.get_event = threading.Event() - if self.kv_role == 'kv_producer': + if self.kv_role in ['kv_producer', 'kv_both']: ready_event_sending = threading.Event() self.kv_send_thread = KVCacheStoreLayerSendingThread( self.tp_rank, self.tp_size, self.m_store, @@ -135,7 +135,7 @@ class MooncakeEngine: self.kv_recv_thread.start() ready_event.wait() else: - if self.kv_role == 'kv_producer': + if self.kv_role in ['kv_producer', 'kv_both']: ready_event_sending = threading.Event() self.kv_send_thread = KVCacheStoreSendingThread( self.tp_rank, self.tp_size, self.m_store, @@ -429,7 +429,7 @@ class MooncakeEngine: done_sending = ( self.kv_send_thread. get_and_clear_finished_requests( # type: ignore[union-attr] - ) if self.kv_role == 'kv_producer' else set()) + ) if self.kv_role in ['kv_producer', 'kv_both'] else set()) done_recving = self.kv_recv_thread.get_and_clear_finished_requests( # type: ignore[union-attr] ) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index a7cdd26..3d2e6f3 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2406,7 +2406,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): dtype=np.int32) # Force dummy run on prefill stage when this node is deemed as kv producer. - if self.is_kv_producer: + if self.is_kv_producer and not self.is_kv_consumer: with_prefill = True attn_metadata = self._build_attention_metadata(