From 53ecd89e8ff405302be040a76effa8c012cbaaeb Mon Sep 17 00:00:00 2001 From: Lucas Kabela Date: Fri, 19 Sep 2025 17:22:30 -0700 Subject: [PATCH] [Bugfix] Remove `VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE` (#2969) ### What this PR does / why we need it? This PR prepares for deleting this environment variable, `VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE`, as vllm requires `fullgraph=True` to run - Fixes https://github.com/vllm-project/vllm/issues/21834 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? See CI - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/99cc41ad50c08e745571abe568226f9fcae61ccd --------- Signed-off-by: Lucas Kabela --- vllm_ascend/spec_decode/mtp_proposer.py | 12 +++++------- vllm_ascend/torchair/torchair_model_runner.py | 12 +++++------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index 0a96b25..b0a8cf5 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -3,7 +3,6 @@ import types import torch import torch.nn as nn import torchair -import vllm.envs as envs_vllm from torchair import patch_for_hcom from vllm.attention.layer import Attention from vllm.config import (VllmConfig, get_layers_from_vllm_config, @@ -596,11 +595,10 @@ class MtpProposer(Proposer): torch.npu.set_compile_mode(jit_compile=False) if not self.runner.use_cached_npu_graph: npu_backend = torchair.get_npu_backend(compiler_config=config) - self.torchair_compiled_model = torch.compile( - self.model, - dynamic=True, - fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=npu_backend) + self.torchair_compiled_model = torch.compile(self.model, + dynamic=True, + fullgraph=True, + backend=npu_backend) return self.torchair_compiled_model else: # Generate a new forward proxy code object to prevent the invalidation of @@ -622,7 +620,7 @@ class MtpProposer(Proposer): batch_size] = 
torchair.inference.cache_compile( self.model.__dict__[forward_proxy_name], dynamic=True, - fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + fullgraph=True, cache_dir=TORCHAIR_CACHE_DIR, config=config, ge_cache=False) diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py index c9c2d61..0c715fd 100644 --- a/vllm_ascend/torchair/torchair_model_runner.py +++ b/vllm_ascend/torchair/torchair_model_runner.py @@ -25,7 +25,6 @@ import torch import torch.distributed as dist import torch.nn as nn import torch_npu -import vllm.envs as envs_vllm from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_dp_group @@ -373,11 +372,10 @@ class NPUTorchairModelRunner(NPUModelRunner): torch.npu.set_compile_mode(jit_compile=False) if not self.use_cached_npu_graph: npu_backend = torchair.get_npu_backend(compiler_config=config) - self.torchair_compiled_model = torch.compile( - self.model, - dynamic=True, - fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=npu_backend) + self.torchair_compiled_model = torch.compile(self.model, + dynamic=True, + fullgraph=True, + backend=npu_backend) return self.torchair_compiled_model else: # Generate a new forward proxy code object to prevent the invalidation of @@ -399,7 +397,7 @@ class NPUTorchairModelRunner(NPUModelRunner): batch_size] = torchair.inference.cache_compile( self.model.__dict__[forward_proxy_name], dynamic=True, - fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + fullgraph=True, cache_dir=TORCHAIR_CACHE_DIR, config=config, ge_cache=False)