[Bugfix] Remove VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE (#2969)
### What this PR does / why we need it?
This PR prepares for the deletion of the environment variable
`VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE`: vLLM requires `fullgraph=True` to
run, so the compile call sites now pass `fullgraph=True` directly.
- Fixes https://github.com/vllm-project/vllm/issues/21834
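For illustration, the change boils down to hardcoding the flag at the compile call sites. Below is a minimal runnable sketch of the resulting pattern, with a toy `nn.Linear` standing in for the real model and the default Dynamo backend standing in for torchair's NPU backend:

```python
import torch
import torch.nn as nn

# Toy stand-in for the real model (the PR touches the torchair-compiled
# models in the MTP proposer and the NPU torchair model runner).
model = nn.Linear(4, 4)

# fullgraph=True makes Dynamo capture the forward as a single graph and
# raise on any graph break instead of silently splitting the graph.
compiled = torch.compile(model, dynamic=True, fullgraph=True)

print(compiled(torch.randn(2, 4)).shape)  # torch.Size([2, 4])
```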
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
See CI
- vLLM version: v0.10.2
- vLLM main: 99cc41ad50
---------
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
```diff
@@ -3,7 +3,6 @@ import types
 import torch
 import torch.nn as nn
 import torchair
-import vllm.envs as envs_vllm
 from torchair import patch_for_hcom
 from vllm.attention.layer import Attention
 from vllm.config import (VllmConfig, get_layers_from_vllm_config,
@@ -596,11 +595,10 @@ class MtpProposer(Proposer):
         torch.npu.set_compile_mode(jit_compile=False)
         if not self.runner.use_cached_npu_graph:
             npu_backend = torchair.get_npu_backend(compiler_config=config)
-            self.torchair_compiled_model = torch.compile(
-                self.model,
-                dynamic=True,
-                fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                backend=npu_backend)
+            self.torchair_compiled_model = torch.compile(self.model,
+                                                         dynamic=True,
+                                                         fullgraph=True,
+                                                         backend=npu_backend)
             return self.torchair_compiled_model
         else:
             # Generate a new forward proxy code object to prevent the invalidation of
@@ -622,7 +620,7 @@ class MtpProposer(Proposer):
                 batch_size] = torchair.inference.cache_compile(
                     self.model.__dict__[forward_proxy_name],
                     dynamic=True,
-                    fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
+                    fullgraph=True,
                     cache_dir=TORCHAIR_CACHE_DIR,
                     config=config,
                     ge_cache=False)
```
```diff
@@ -25,7 +25,6 @@ import torch
 import torch.distributed as dist
 import torch.nn as nn
 import torch_npu
-import vllm.envs as envs_vllm
 from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_dp_group
@@ -373,11 +372,10 @@ class NPUTorchairModelRunner(NPUModelRunner):
         torch.npu.set_compile_mode(jit_compile=False)
         if not self.use_cached_npu_graph:
             npu_backend = torchair.get_npu_backend(compiler_config=config)
-            self.torchair_compiled_model = torch.compile(
-                self.model,
-                dynamic=True,
-                fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                backend=npu_backend)
+            self.torchair_compiled_model = torch.compile(self.model,
+                                                         dynamic=True,
+                                                         fullgraph=True,
+                                                         backend=npu_backend)
             return self.torchair_compiled_model
         else:
             # Generate a new forward proxy code object to prevent the invalidation of
@@ -399,7 +397,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
                 batch_size] = torchair.inference.cache_compile(
                     self.model.__dict__[forward_proxy_name],
                     dynamic=True,
-                    fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
+                    fullgraph=True,
                     cache_dir=TORCHAIR_CACHE_DIR,
                     config=config,
                     ge_cache=False)
```
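As background on why the flag can be hardcoded: with `fullgraph=True`, Dynamo raises at the first graph break rather than silently falling back to partial graphs, which is the behavior vLLM now assumes. A small illustrative sketch (the function `f` below is hypothetical, not from this PR):

```python
import torch

def f(x):
    # Branching on a tensor value is data-dependent control flow,
    # which Dynamo cannot trace into a single graph.
    if x.sum() > 0:
        return x + 1
    return x - 1

compiled = torch.compile(f, fullgraph=True)
try:
    compiled(torch.ones(3))
except Exception as err:
    # Under fullgraph=True the graph break surfaces as an exception
    # instead of a silent fallback to eager execution.
    print(type(err).__name__)
```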