From 53ecd89e8ff405302be040a76effa8c012cbaaeb Mon Sep 17 00:00:00 2001 From: Lucas Kabela Date: Fri, 19 Sep 2025 17:22:30 -0700 Subject: [PATCH] [Bugfix] Remove `VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE` (#2969) ### What this PR does / why we need it? This PR prepares for deleting this environment variable, `VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE`, as vllm requires `fullgraph=True` to run - Fixes https://github.com/vllm-project/vllm/issues/21834 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? See CI - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/99cc41ad50c08e745571abe568226f9fcae61ccd --------- Signed-off-by: Lucas Kabela --- vllm_ascend/spec_decode/mtp_proposer.py | 12 +++++------- vllm_ascend/torchair/torchair_model_runner.py | 12 +++++------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index 0a96b25..b0a8cf5 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -3,7 +3,6 @@ import types import torch import torch.nn as nn import torchair -import vllm.envs as envs_vllm from torchair import patch_for_hcom from vllm.attention.layer import Attention from vllm.config import (VllmConfig, get_layers_from_vllm_config, @@ -596,11 +595,10 @@ class MtpProposer(Proposer): torch.npu.set_compile_mode(jit_compile=False) if not self.runner.use_cached_npu_graph: npu_backend = torchair.get_npu_backend(compiler_config=config) - self.torchair_compiled_model = torch.compile( - self.model, - dynamic=True, - fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=npu_backend) + self.torchair_compiled_model = torch.compile(self.model, + dynamic=True, + fullgraph=True, + backend=npu_backend) return self.torchair_compiled_model else: # Generate a new forward proxy code object to prevent the invalidation of @@ -622,7 +620,7 @@ class MtpProposer(Proposer): batch_size] = 
torchair.inference.cache_compile( self.model.__dict__[forward_proxy_name], dynamic=True, - fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + fullgraph=True, cache_dir=TORCHAIR_CACHE_DIR, config=config, ge_cache=False) diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py index c9c2d61..0c715fd 100644 --- a/vllm_ascend/torchair/torchair_model_runner.py +++ b/vllm_ascend/torchair/torchair_model_runner.py @@ -25,7 +25,6 @@ import torch import torch.distributed as dist import torch.nn as nn import torch_npu -import vllm.envs as envs_vllm from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_dp_group @@ -373,11 +372,10 @@ class NPUTorchairModelRunner(NPUModelRunner): torch.npu.set_compile_mode(jit_compile=False) if not self.use_cached_npu_graph: npu_backend = torchair.get_npu_backend(compiler_config=config) - self.torchair_compiled_model = torch.compile( - self.model, - dynamic=True, - fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=npu_backend) + self.torchair_compiled_model = torch.compile(self.model, + dynamic=True, + fullgraph=True, + backend=npu_backend) return self.torchair_compiled_model else: # Generate a new forward proxy code object to prevent the invalidation of @@ -399,7 +397,7 @@ class NPUTorchairModelRunner(NPUModelRunner): batch_size] = torchair.inference.cache_compile( self.model.__dict__[forward_proxy_name], dynamic=True, - fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + fullgraph=True, cache_dir=TORCHAIR_CACHE_DIR, config=config, ge_cache=False)