[Misc] Cleanup useless print and logger (#5220)
1. Remove useless print statements
2. Use the vLLM logger instead of the stdlib logging module
3. Demote noisy INFO messages to DEBUG level (see the sketch below)
- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
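
The recurring edit, sketched as a minimal before/after (the torch import and the dtype value are stand-ins added to make the sketch self-contained; the logger import and message mirror the fusion-pass hunks below):

    import logging

    import torch

    from vllm.logger import logger

    dtype = torch.float32  # stand-in value for the sketch

    # Before: stdlib logging at INFO level, which goes through the root
    # logger and ignores vLLM's own logging configuration.
    logging.info("Quant fusion not enabled: unsupported dtype %s", dtype)

    # After: vLLM's shared logger at DEBUG level, so the message only
    # shows up when debug logging is requested.
    logger.debug("Quant fusion not enabled: unsupported dtype %s", dtype)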
@@ -1,7 +1,6 @@
 # coding=utf-8
 # Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
 import json
-import logging
 import os
 
 import matplotlib.pyplot as plt  # type: ignore
@@ -11,8 +10,6 @@ import torch
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
-logger = logging.getLogger("msit_logger")
-
 
 def save_matrix_to_json(output_path, file_name, deployment):
     num_layers = deployment.shape[0]

@@ -15,13 +15,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import logging
-
 import torch
 import torch._inductor.pattern_matcher as pm
 from torch._inductor.pattern_matcher import PatternMatcherPass
 from vllm.compilation.vllm_inductor_pass import VllmInductorPass
 from vllm.config import VllmConfig
+from vllm.logger import logger
 
 
 class AddRMSNormQuantPattern:
@@ -288,7 +287,7 @@ class AddRMSNormQuantFusionPass(VllmInductorPass):
 
         dtype = vllm_config.model_config.dtype
         if dtype not in (torch.bfloat16, torch.float16):
-            logging.info("Quant fusion not enabled: unsupported dtype %s",
+            logger.debug("Quant fusion not enabled: unsupported dtype %s",
                          dtype)
             return
 
@@ -306,7 +305,7 @@ class AddRMSNormQuantFusionPass(VllmInductorPass):
     def __call__(self, graph: torch.fx.Graph):
         self.begin()
         self.matched_count = self.pattern_match_passes.apply(graph)
-        logging.debug("Replaced %s patterns", self.matched_count)
+        logger.debug("Replaced %s patterns", self.matched_count)
         self.end_and_log()
 
     def is_applicable(self, runtime_shape: int | None = None) -> bool:

@@ -15,8 +15,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import logging
-
 import torch
 import torch._inductor.pattern_matcher as pm
 from torch._inductor.pattern_matcher import (PatternMatcherPass,
@@ -24,6 +22,7 @@ from torch._inductor.pattern_matcher import (PatternMatcherPass,
 from vllm.attention.layer import Attention
 from vllm.compilation.vllm_inductor_pass import VllmInductorPass
 from vllm.config import VllmConfig, get_layers_from_vllm_config
+from vllm.logger import logger
 
 
 class QKNormRopeFusionPattern:
@@ -237,7 +236,7 @@ class QKNormRopeFusionPass(VllmInductorPass):
 
         dtype = vllm_config.model_config.dtype
         if dtype not in (torch.bfloat16, torch.float16):
-            logging.info(
+            logger.debug(
                 "QKNorm and Rope fusion not enabled: unsupported dtype %s",
                 dtype)
             return
@@ -246,14 +245,14 @@ class QKNormRopeFusionPass(VllmInductorPass):
         attn_layers: dict[str, Attention] = get_layers_from_vllm_config(
             vllm_config, Attention)
         if len(attn_layers) == 0:
-            logging.info(
+            logger.debug(
                 "QKNorm and Rope fusion enabled, but no Attention layers were discovered."
             )
             return
         layer = next(iter(attn_layers.values()))
         for epsilon in [1e-6, 1e-5]:
             if layer.head_size != 128:
-                logging.debug(
+                logger.debug(
                     "QKNorm and Rope fusion not enabled: head_dim %d is not equal of 128",
                     layer.head_size)
                 continue
@@ -274,13 +273,13 @@ class QKNormRopeFusionPass(VllmInductorPass):
     def __call__(self, graph: torch.fx.Graph):
         self.begin()
         self.matched_count = self.pattern_match_passes.apply(graph)
-        logging.debug("Fused %s QKNorm and Rope patterns", self.matched_count)
-        logging.debug("Patterns registered for replacement:")
+        logger.debug("Fused %s QKNorm and Rope patterns", self.matched_count)
+        logger.debug("Patterns registered for replacement:")
         pattern_idx = 0
         for pattern_entry in self.pattern_match_passes.patterns.values():
             for p in pattern_entry:
                 p_str = PatternPrettyPrinter.run(p.pattern)
-                logging.debug("Pattern %d: %s", pattern_idx, p_str)
+                logger.debug("Pattern %d: %s", pattern_idx, p_str)
                 pattern_idx += 1
         self.end_and_log()

@@ -202,7 +202,6 @@ class DynamicEplbV2(EplbPolicy):
         for index, target_weight in enumerate(sorted_weights):
             expert_id, original_weight = target_weight
             if original_weight == -1:
-                print("Error:Redundant expert failure re-occurred")
                 redundancy_successful = True
                 break
         redundancy_successful = False
@@ -712,7 +711,6 @@ class DynamicEplbV2(EplbPolicy):
         max_heat_per_layer_after = np.zeros([layer_num])
         sum_num = 0
         for layer in range(layer_num):
-            # print(f"Load imbalance ratio of layer {layer} under the new workload", layer_initial_imbalance[layer])
             if layer_initial_imbalance[layer] < 1.01:
                 global_deployment[layer] = info.placement_table[layer]
                 continue
@@ -734,13 +732,11 @@ class DynamicEplbV2(EplbPolicy):
                 layer_workloads[layer], info.placement_table[layer],
                 expert_from_device[layer], num_node, is_node_redundant,
                 rendun_pos)
-            # print(layer, f"Imbalance Ratio after Redundancy Adjustment:", self.safe_divide(max_workload, ave_workload))
 
             global_deployment[layer], new_max_workload = self.exchange_experts(
                 result, com_between_devices, num_node, num_npus,
                 is_node_redundant, ave_workload, increment,
                 num_redundancy_expert, info.placement_table[layer])
-            # print(layer, f"Imbalance Ratio after Swap Adjustment:", self.safe_divide(new_max_workload, ave_workload))
 
             for device_id in range(num_npus):
                 com_between_devices[device_id] = {

@@ -411,7 +411,6 @@ class FlashLB(EplbPolicy):
     def compute_rank_load(self, deployment: np.ndarray, hotness: np.ndarray):
         n_stage, N = hotness.shape
         if np.any(deployment < 0):
-            print(f"Invalid deployment with negative values: {deployment}")
             raise ValueError("Deployment table contains negative values.")
         counts = np.bincount(deployment.reshape(-1), minlength=N)
         unit_hotness = np.divide(hotness,
@@ -504,8 +503,6 @@ class FlashLB(EplbPolicy):
             stage_weights,
             recorsive=False,
         )
-        if np.any(new_deployment < 0):
-            print(f"{new_deployment=}")
         new_par = self.compute_rank_load(new_deployment, hotness)
 
         return new_deployment, new_par, current_par

@@ -1007,7 +1007,6 @@ def get_flashcomm2_config_and_validate(ascend_config, vllm_config):
 
     if not flashcomm2_enable():
         flashcomm2_oproj_shared = False
-        logger.info("FLASHCOMM2 not enable.")
         return flashcomm2_oproj_tp_size, flashcomm2_oproj_shared
 
     logger.info(
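
Note that the demoted messages are hidden at vLLM's default INFO level. A minimal sketch for surfacing them again, assuming the standard VLLM_LOGGING_LEVEL control (not part of this diff):

    import os

    # Set before importing vllm: vllm.logger picks up the level when the
    # module is first loaded.
    os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"

    from vllm.logger import logger

    logger.debug("Visible once the level is DEBUG.")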