From e68464a1d6ed2f99af74ad90459bc42e944ffd98 Mon Sep 17 00:00:00 2001 From: Canlin Guo Date: Mon, 23 Mar 2026 09:26:24 +0800 Subject: [PATCH] [Bugfix] Fix slow hasattr in ACLGraphWrapper.__getattr__ (#7442) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? Follow https://github.com/vllm-project/vllm/pull/37425, https://github.com/vllm-project/vllm-omni/pull/1982 Copied from them: Notice that `hasattr(self.model, "flush_pending_metadata")` cost 6ms per decode step when profiling Qwen3 Omni. The original `CUDAGraphWrapper.__getattr__` raises: ```python raise AttributeError(f"... cudagraph wrapper: {self.runnable}") ``` When hasattr() is called for a non-existent attribute, Python internally calls __getattr__ which constructs this AttributeError. The {self.runnable} triggers `__repr__()` on the underlying model (e.g., `Qwen3OmniMoeForConditionalGeneration`), which recursivelytraverses the entire nn.Module tree to generate an 18,000+ character string. This takes ~6-7ms per call. Since `hasattr(self.model, "flush_pending_metadata") ` is called every decode step in the Talker forward path, this adds ~6ms overhead per step, severely impacting audio inter-chunk latency (ICL). ```Python hasattr(self.model, "flush_pending_metadata") → getattr(self.model, "flush_pending_metadata") → not found in CUDAGraphWrapper.__dict__ → not found in the CUDAGraphWrapper class hierarchy → triggers CUDAGraphWrapper.__getattr__("flush_pending_metadata") → hasattr(self.runnable, "flush_pending_metadata") # runnable also doesn't have it → executes raise AttributeError(f"... {self.runnable}") → Python needs to construct the exception object → the f-string triggers self.runnable.__repr__() → Qwen3OmniMoeForConditionalGeneration.__repr__() → recursively traverses the entire nn.Module tree → generates a 18,000+ character string → takes ~6 ms → AttributeError object is created → hasattr catches the AttributeError and returns False → the 18,000-character string is immediately discarded (no one ever sees it) ``` ### Does this PR introduce _any_ user-facing change? NO. ### How was this patch tested? See https://github.com/vllm-project/vllm-omni/pull/1982 - vLLM version: v0.17.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4497431df654e46fb1fb5e64bf8611e762ae5d87 --------- Signed-off-by: gcanlin --- tests/ut/compilation/test_acl_graph.py | 2 +- vllm_ascend/compilation/acl_graph.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/ut/compilation/test_acl_graph.py b/tests/ut/compilation/test_acl_graph.py index afee5ef5..7440fc41 100644 --- a/tests/ut/compilation/test_acl_graph.py +++ b/tests/ut/compilation/test_acl_graph.py @@ -723,7 +723,7 @@ class TestACLGraphWrapper(TestBase): with self.assertRaises(AttributeError) as context: _ = wrapper.non_existent_attr - self.assertIn("Attribute non_existent_attr not exists", + self.assertIn("Attribute non_existent_attr not found", str(context.exception)) def test_unwrap_method(self): diff --git a/vllm_ascend/compilation/acl_graph.py b/vllm_ascend/compilation/acl_graph.py index e22d1b5a..1ceb3434 100644 --- a/vllm_ascend/compilation/acl_graph.py +++ b/vllm_ascend/compilation/acl_graph.py @@ -74,6 +74,7 @@ class ACLGraphWrapper: self.first_run_finished = False self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG" + self._runnable_str = str(runnable) if self.is_debugging_mode else None # assert runtime_mode is not NONE(no aclgraph), otherwise, we don't # need to initialize a ACLGraphWrapper. @@ -91,7 +92,11 @@ class ACLGraphWrapper: # allow accessing the attributes of the runnable. if hasattr(self.runnable, key): return getattr(self.runnable, key) - raise AttributeError(f"Attribute {key} not exists in the runnable of aclgraph wrapper: {self.runnable}") + if self.is_debugging_mode: + raise AttributeError( + f"Attribute {key} not exists in the runnable of aclgraph wrapper: {self._runnable_str}" + ) + raise AttributeError(f"Attribute {key} not found. Set VLLM_LOGGING_LEVEL=DEBUG for more details.") def unwrap(self) -> Callable: # in case we need to access the original runnable.