diff --git a/examples/eplb/eplb_strategy.py b/examples/eplb/eplb_strategy.py
index bcccbf23..0994cc4d 100644
--- a/examples/eplb/eplb_strategy.py
+++ b/examples/eplb/eplb_strategy.py
@@ -1,7 +1,6 @@
 # coding=utf-8
 # Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
 import json
-import logging
 import os
 
 import matplotlib.pyplot as plt  # type: ignore
@@ -11,8 +10,6 @@ import torch
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
-logger = logging.getLogger("msit_logger")
-
 
 def save_matrix_to_json(output_path, file_name, deployment):
     num_layers = deployment.shape[0]
diff --git a/vllm_ascend/compilation/passes/norm_quant_fusion_pass.py b/vllm_ascend/compilation/passes/norm_quant_fusion_pass.py
index 3cdeaaf3..f929c1a4 100644
--- a/vllm_ascend/compilation/passes/norm_quant_fusion_pass.py
+++ b/vllm_ascend/compilation/passes/norm_quant_fusion_pass.py
@@ -15,13 +15,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import logging
-
 import torch
 import torch._inductor.pattern_matcher as pm
 from torch._inductor.pattern_matcher import PatternMatcherPass
 from vllm.compilation.vllm_inductor_pass import VllmInductorPass
 from vllm.config import VllmConfig
+from vllm.logger import logger
 
 
 class AddRMSNormQuantPattern:
@@ -288,7 +287,7 @@ class AddRMSNormQuantFusionPass(VllmInductorPass):
 
         dtype = vllm_config.model_config.dtype
         if dtype not in (torch.bfloat16, torch.float16):
-            logging.info("Quant fusion not enabled: unsupported dtype %s",
+            logger.debug("Quant fusion not enabled: unsupported dtype %s",
                          dtype)
             return
 
@@ -306,7 +305,7 @@ class AddRMSNormQuantFusionPass(VllmInductorPass):
     def __call__(self, graph: torch.fx.Graph):
         self.begin()
         self.matched_count = self.pattern_match_passes.apply(graph)
-        logging.debug("Replaced %s patterns", self.matched_count)
+        logger.debug("Replaced %s patterns", self.matched_count)
         self.end_and_log()
 
     def is_applicable(self, runtime_shape: int | None = None) -> bool:
diff --git a/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py b/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py
index 2f34c408..f8355a15 100644
--- a/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py
+++ b/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py
@@ -15,8 +15,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import logging
-
 import torch
 import torch._inductor.pattern_matcher as pm
 from torch._inductor.pattern_matcher import (PatternMatcherPass,
@@ -24,6 +22,7 @@ from torch._inductor.pattern_matcher import (PatternMatcherPass,
 from vllm.attention.layer import Attention
 from vllm.compilation.vllm_inductor_pass import VllmInductorPass
 from vllm.config import VllmConfig, get_layers_from_vllm_config
+from vllm.logger import logger
 
 
 class QKNormRopeFusionPattern:
@@ -237,7 +236,7 @@ class QKNormRopeFusionPass(VllmInductorPass):
 
         dtype = vllm_config.model_config.dtype
         if dtype not in (torch.bfloat16, torch.float16):
-            logging.info(
+            logger.debug(
                 "QKNorm and Rope fusion not enabled: unsupported dtype %s",
                 dtype)
             return
@@ -246,14 +245,14 @@ class QKNormRopeFusionPass(VllmInductorPass):
         attn_layers: dict[str, Attention] = get_layers_from_vllm_config(
             vllm_config, Attention)
         if len(attn_layers) == 0:
-            logging.info(
+            logger.debug(
                 "QKNorm and Rope fusion enabled, but no Attention layers were discovered."
             )
             return
         layer = next(iter(attn_layers.values()))
         for epsilon in [1e-6, 1e-5]:
             if layer.head_size != 128:
-                logging.debug(
+                logger.debug(
                     "QKNorm and Rope fusion not enabled: head_dim %d is not equal of 128",
                     layer.head_size)
                 continue
@@ -274,13 +273,13 @@ class QKNormRopeFusionPass(VllmInductorPass):
     def __call__(self, graph: torch.fx.Graph):
         self.begin()
         self.matched_count = self.pattern_match_passes.apply(graph)
-        logging.debug("Fused %s QKNorm and Rope patterns", self.matched_count)
-        logging.debug("Patterns registered for replacement:")
+        logger.debug("Fused %s QKNorm and Rope patterns", self.matched_count)
+        logger.debug("Patterns registered for replacement:")
         pattern_idx = 0
         for pattern_entry in self.pattern_match_passes.patterns.values():
             for p in pattern_entry:
                 p_str = PatternPrettyPrinter.run(p.pattern)
-                logging.debug("Pattern %d: %s", pattern_idx, p_str)
+                logger.debug("Pattern %d: %s", pattern_idx, p_str)
                 pattern_idx += 1
         self.end_and_log()
 
diff --git a/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py b/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py
index 198eeee0..a676e4d4 100644
--- a/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py
+++ b/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py
@@ -202,7 +202,6 @@ class DynamicEplbV2(EplbPolicy):
         for index, target_weight in enumerate(sorted_weights):
             expert_id, original_weight = target_weight
             if original_weight == -1:
-                print("Error:Redundant expert failure re-occurred")
                 redundancy_successful = True
                 break
             redundancy_successful = False
@@ -712,7 +711,6 @@ class DynamicEplbV2(EplbPolicy):
         max_heat_per_layer_after = np.zeros([layer_num])
         sum_num = 0
         for layer in range(layer_num):
-            # print(f"Load imbalance ratio of layer {layer} under the new workload", layer_initial_imbalance[layer])
             if layer_initial_imbalance[layer] < 1.01:
                 global_deployment[layer] = info.placement_table[layer]
                 continue
@@ -734,13 +732,11 @@ class DynamicEplbV2(EplbPolicy):
                 layer_workloads[layer], info.placement_table[layer],
                 expert_from_device[layer], num_node, is_node_redundant,
                 rendun_pos)
-            # print(layer, f"Imbalance Ratio after Redundancy Adjustment:", self.safe_divide(max_workload, ave_workload))
 
             global_deployment[layer], new_max_workload = self.exchange_experts(
                 result, com_between_devices, num_node, num_npus,
                 is_node_redundant, ave_workload, increment,
                 num_redundancy_expert, info.placement_table[layer])
-            # print(layer, f"Imbalance Ratio after Swap Adjustment:", self.safe_divide(new_max_workload, ave_workload))
 
             for device_id in range(num_npus):
                 com_between_devices[device_id] = {
diff --git a/vllm_ascend/eplb/core/policy/policy_flashlb.py b/vllm_ascend/eplb/core/policy/policy_flashlb.py
index 2bf6551d..7a13bee2 100644
--- a/vllm_ascend/eplb/core/policy/policy_flashlb.py
+++ b/vllm_ascend/eplb/core/policy/policy_flashlb.py
@@ -411,7 +411,6 @@ class FlashLB(EplbPolicy):
     def compute_rank_load(self, deployment: np.ndarray, hotness: np.ndarray):
         n_stage, N = hotness.shape
         if np.any(deployment < 0):
-            print(f"Invalid deployment with negative values: {deployment}")
             raise ValueError("Deployment table contains negative values.")
         counts = np.bincount(deployment.reshape(-1), minlength=N)
         unit_hotness = np.divide(hotness,
@@ -504,8 +503,6 @@ class FlashLB(EplbPolicy):
             stage_weights,
             recorsive=False,
         )
-        if np.any(new_deployment < 0):
-            print(f"{new_deployment=}")
         new_par = self.compute_rank_load(new_deployment, hotness)
         return new_deployment, new_par, current_par
 
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index d6737178..a28197b6 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -1007,7 +1007,6 @@ def get_flashcomm2_config_and_validate(ascend_config, vllm_config):
 
     if not flashcomm2_enable():
        flashcomm2_oproj_shared = False
-        logger.info("FLASHCOMM2 not enable.")
         return flashcomm2_oproj_tp_size, flashcomm2_oproj_shared
 
     logger.info(
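
Note: every hunk above applies one of two fixes: debug `print` statements are deleted outright, and ad-hoc stdlib `logging` calls are replaced with the shared vLLM logger at DEBUG level. The sketch below is a minimal illustration of the convention the patch converges on, not part of the patch itself; `report_fusion_stats` and its argument are hypothetical names used only for this example, while `from vllm.logger import logger` is the exact import the patch adds.

from vllm.logger import logger


def report_fusion_stats(matched_count: int) -> None:
    # Lazy %-style formatting: the argument is interpolated only when the
    # DEBUG level is actually enabled, unlike an f-string, which always pays
    # the formatting cost. A bare print() would bypass log levels, handlers,
    # and per-rank log files entirely, which is why the patch removes them.
    logger.debug("Replaced %s patterns", matched_count)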