Commit d848800

[Misc] Move print_*_once from utils to logger (#11298)
Signed-off-by: DarkLight1337 <[email protected]>
Signed-off-by: Maxime Fournioux <[email protected]>
Co-authored-by: Maxime Fournioux <[email protected]>
1 parent 730e959 commit d848800

21 files changed: +129 −72 lines changed

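Apart from the CI workflow tweak, the change is mechanical across the touched Python files: each call site drops the print_info_once / print_warning_once helpers from vllm.utils and instead calls info_once / warning_once on a module-level logger obtained from vllm.logger.init_logger. A minimal sketch of the migrated call pattern (the surrounding module is illustrative; the messages are taken from the hunks below):

    from vllm.logger import init_logger

    logger = init_logger(__name__)

    # Old style, removed by this commit:
    #   from vllm.utils import print_warning_once
    #   print_warning_once("XFormers does not support logits soft cap. "
    #                      "Outputs may be slightly off.")

    # New style: the logger itself deduplicates repeated messages.
    logger.warning_once("XFormers does not support logits soft cap. "
                        "Outputs may be slightly off.")
    logger.info_once("Loading LoRA weights trained with rsLoRA.")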

.github/workflows/lint-and-deploy.yaml

Lines changed: 1 addition & 0 deletions
@@ -64,6 +64,7 @@ jobs:
         run: |
           export AWS_ACCESS_KEY_ID=minioadmin
           export AWS_SECRET_ACCESS_KEY=minioadmin
+          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
           helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"

       - name: curl test

vllm/attention/backends/torch_sdpa.py

Lines changed: 6 additions & 3 deletions
@@ -13,9 +13,12 @@
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.attention.ops.ipex_attn import PagedAttention
 from vllm.attention.ops.paged_attn import PagedAttentionMetadata
-from vllm.utils import make_tensor_with_pad, print_warning_once
+from vllm.logger import init_logger
+from vllm.utils import make_tensor_with_pad
 from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder

+logger = init_logger(__name__)
+

 class TorchSDPABackend(AttentionBackend):

@@ -396,8 +399,8 @@ def __init__(
             raise ValueError(
                 "Torch SPDA does not support block-sparse attention.")
         if logits_soft_cap is not None:
-            print_warning_once("Torch SPDA does not support logits soft cap. "
-                               "Outputs may be slightly off.")
+            logger.warning_once("Torch SPDA does not support logits soft cap. "
+                                "Outputs may be slightly off.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)

vllm/attention/backends/xformers.py

Lines changed: 5 additions & 3 deletions
@@ -17,7 +17,9 @@
     is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set)
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                            PagedAttentionMetadata)
-from vllm.utils import print_warning_once
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)


 class XFormersBackend(AttentionBackend):
@@ -385,8 +387,8 @@ def __init__(
             raise ValueError(
                 "XFormers does not support block-sparse attention.")
         if logits_soft_cap is not None:
-            print_warning_once("XFormers does not support logits soft cap. "
-                               "Outputs may be slightly off.")
+            logger.warning_once("XFormers does not support logits soft cap. "
+                                "Outputs may be slightly off.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)

vllm/config.py

Lines changed: 4 additions & 5 deletions
@@ -32,8 +32,7 @@
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3
 from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
-                        get_cpu_memory, print_warning_once, random_uuid,
-                        resolve_obj_by_qualname)
+                        get_cpu_memory, random_uuid, resolve_obj_by_qualname)

 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
@@ -314,7 +313,7 @@ def __init__(self,
                 sliding_window_len_min = get_min_sliding_window(
                     self.hf_text_config.sliding_window)

-                print_warning_once(
+                logger.warning_once(
                     f"{self.hf_text_config.model_type} has interleaved "
                     "attention, which is currently not supported by the "
                     "XFORMERS backend. Disabling sliding window and capping "
@@ -2758,7 +2757,7 @@ def uuid(self):

     def model_post_init(self, __context: Any) -> None:
         if not self.enable_reshape and self.enable_fusion:
-            print_warning_once(
+            logger.warning_once(
                 "Fusion enabled but reshape elimination disabled."
                 "RMSNorm + quant (fp8) fusion might not work")

@@ -3151,7 +3150,7 @@ def __post_init__(self):
                 self.scheduler_config.chunked_prefill_enabled and \
                 self.model_config.dtype == torch.float32 and \
                 current_platform.get_device_capability() == (7, 5):
-            print_warning_once(
+            logger.warning_once(
                 "Turing devices tensor cores do not support float32 matmul. "
                 "To workaround this limitation, vLLM will set 'ieee' input "
                 "precision for chunked prefill triton kernels.")

vllm/entrypoints/chat_utils.py

Lines changed: 3 additions & 4 deletions
@@ -35,7 +35,6 @@
 from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.utils import MediaConnector
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import print_warning_once

 logger = init_logger(__name__)

@@ -985,14 +984,14 @@ def apply_mistral_chat_template(
     **kwargs: Any,
 ) -> List[int]:
     if chat_template is not None:
-        print_warning_once(
+        logger.warning_once(
             "'chat_template' cannot be overridden for mistral tokenizer.")
     if "add_generation_prompt" in kwargs:
-        print_warning_once(
+        logger.warning_once(
             "'add_generation_prompt' is not supported for mistral tokenizer, "
             "so it will be ignored.")
     if "continue_final_message" in kwargs:
-        print_warning_once(
+        logger.warning_once(
             "'continue_final_message' is not supported for mistral tokenizer, "
             "so it will be ignored.")


vllm/inputs/preprocess.py

Lines changed: 11 additions & 9 deletions
@@ -10,7 +10,6 @@
 from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
-from vllm.utils import print_info_once, print_warning_once

 from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs,
                    PromptType, SingletonInputs, SingletonPrompt, token_inputs)
@@ -68,21 +67,24 @@ def get_decoder_start_token_id(self) -> Optional[int]:
         '''

         if not self.model_config.is_encoder_decoder:
-            print_warning_once("Using None for decoder start token id because "
-                               "this is not an encoder/decoder model.")
+            logger.warning_once(
+                "Using None for decoder start token id because "
+                "this is not an encoder/decoder model.")
             return None

         if (self.model_config is None or self.model_config.hf_config is None):
-            print_warning_once("Using None for decoder start token id because "
-                               "model config is not available.")
+            logger.warning_once(
+                "Using None for decoder start token id because "
+                "model config is not available.")
             return None

         dec_start_token_id = getattr(self.model_config.hf_config,
                                      'decoder_start_token_id', None)
         if dec_start_token_id is None:
-            print_warning_once("Falling back on <BOS> for decoder start token "
-                               "id because decoder start token id is not "
-                               "available.")
+            logger.warning_once(
+                "Falling back on <BOS> for decoder start token "
+                "id because decoder start token id is not "
+                "available.")
             dec_start_token_id = self.get_bos_token_id()

         return dec_start_token_id
@@ -231,7 +233,7 @@ def _can_process_multimodal(self) -> bool:
         # updated to use the new multi-modal processor
         can_process_multimodal = self.mm_registry.has_processor(model_config)
         if not can_process_multimodal:
-            print_info_once(
+            logger.info_once(
                 "Your model uses the legacy input pipeline instead of the new "
                 "multi-modal processor. Please note that the legacy pipeline "
                 "will be removed in a future release. For more details, see: "

vllm/inputs/registry.py

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides,
-                        print_warning_once, resolve_mm_processor_kwargs)
+                        resolve_mm_processor_kwargs)

 from .data import ProcessorInputs, SingletonInputs
 from .parse import is_encoder_decoder_inputs
@@ -352,7 +352,7 @@ def dummy_data_for_profiling(
         num_tokens = dummy_data.seq_data.prompt_token_ids
         if len(num_tokens) < seq_len:
             if is_encoder_data:
-                print_warning_once(
+                logger.warning_once(
                     f"Expected at least {seq_len} dummy encoder tokens for "
                     f"profiling, but found {len(num_tokens)} tokens instead.")
             else:

vllm/logger.py

Lines changed: 52 additions & 5 deletions
@@ -4,11 +4,12 @@
 import logging
 import os
 import sys
-from functools import partial
+from functools import lru_cache, partial
 from logging import Logger
 from logging.config import dictConfig
 from os import path
-from typing import Dict, Optional
+from types import MethodType
+from typing import Any, Optional, cast

 import vllm.envs as envs

@@ -49,8 +50,44 @@
 }


+@lru_cache
+def _print_info_once(logger: Logger, msg: str) -> None:
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.info(msg, stacklevel=2)
+
+
+@lru_cache
+def _print_warning_once(logger: Logger, msg: str) -> None:
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.warning(msg, stacklevel=2)
+
+
+class _VllmLogger(Logger):
+    """
+    Note:
+        This class is just to provide type information.
+        We actually patch the methods directly on the :class:`logging.Logger`
+        instance to avoid conflicting with other libraries such as
+        `intel_extension_for_pytorch.utils._logger`.
+    """
+
+    def info_once(self, msg: str) -> None:
+        """
+        As :meth:`info`, but subsequent calls with the same message
+        are silently dropped.
+        """
+        _print_info_once(self, msg)
+
+    def warning_once(self, msg: str) -> None:
+        """
+        As :meth:`warning`, but subsequent calls with the same message
+        are silently dropped.
+        """
+        _print_warning_once(self, msg)
+
+
 def _configure_vllm_root_logger() -> None:
-    logging_config: Dict = {}
+    logging_config = dict[str, Any]()

     if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH:
         raise RuntimeError(
@@ -84,12 +121,22 @@ def _configure_vllm_root_logger() -> None:
     dictConfig(logging_config)


-def init_logger(name: str) -> Logger:
+def init_logger(name: str) -> _VllmLogger:
     """The main purpose of this function is to ensure that loggers are
     retrieved in such a way that we can be sure the root vllm logger has
     already been configured."""

-    return logging.getLogger(name)
+    logger = logging.getLogger(name)
+
+    methods_to_patch = {
+        "info_once": _print_info_once,
+        "warning_once": _print_warning_once,
+    }
+
+    for method_name, method in methods_to_patch.items():
+        setattr(logger, method_name, MethodType(method, logger))
+
+    return cast(_VllmLogger, logger)


 # The root logger is initialized when the module is imported.
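For reference, the deduplication in _print_info_once / _print_warning_once comes entirely from functools.lru_cache: the cache key is the (logger, msg) pair, so the first call emits the record and any later call with the same logger and message hits the cache and does nothing, while stacklevel=2 keeps the reported file and line pointing at the caller of the *_once method. A standalone sketch of the same pattern, independent of vLLM (the "demo" logger and message are illustrative):

    import logging
    from functools import lru_cache
    from types import MethodType


    @lru_cache
    def _warn_once(logger: logging.Logger, msg: str) -> None:
        # stacklevel=2 attributes the record to the caller of warning_once(),
        # not to this helper.
        logger.warning(msg, stacklevel=2)


    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger("demo")

    # Bind the cached function onto this particular logger instance, mirroring
    # what init_logger() does with MethodType instead of subclassing Logger.
    log.warning_once = MethodType(_warn_once, log)  # type: ignore[attr-defined]

    log.warning_once("example message")  # emitted once
    log.warning_once("example message")  # cache hit: silently dropped

Patching the instance rather than swapping in a Logger subclass is what the _VllmLogger docstring alludes to: loggers whose class has already been customized by another library keep working, and _VllmLogger exists only so type checkers know about the extra methods.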

vllm/lora/peft_helper.py

Lines changed: 4 additions & 2 deletions
@@ -4,7 +4,9 @@
 from dataclasses import MISSING, dataclass, field, fields
 from typing import Literal, Optional, Union

-from vllm.utils import print_info_once
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)


 @dataclass
@@ -42,7 +44,7 @@ def _validate_features(self):
     def __post_init__(self):
         self._validate_features()
         if self.use_rslora:
-            print_info_once("Loading LoRA weights trained with rsLoRA.")
+            logger.info_once("Loading LoRA weights trained with rsLoRA.")
             self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r)
         else:
             self.vllm_lora_scaling_factor = self.lora_alpha / self.r
vllm/lora/punica_wrapper/punica_selector.py

Lines changed: 5 additions & 3 deletions
@@ -1,19 +1,21 @@
+from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import print_info_once

 from .punica_base import PunicaWrapperBase

+logger = init_logger(__name__)
+

 def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
     if current_platform.is_cuda_alike():
         # Lazy import to avoid ImportError
         from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
-        print_info_once("Using PunicaWrapperGPU.")
+        logger.info_once("Using PunicaWrapperGPU.")
         return PunicaWrapperGPU(*args, **kwargs)
     elif current_platform.is_hpu():
         # Lazy import to avoid ImportError
         from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU
-        print_info_once("Using PunicaWrapperHPU.")
+        logger.info_once("Using PunicaWrapperHPU.")
         return PunicaWrapperHPU(*args, **kwargs)
     else:
         raise NotImplementedError

vllm/model_executor/custom_op.py

Lines changed: 1 addition & 2 deletions
@@ -5,7 +5,6 @@
 from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import print_warning_once

 logger = init_logger(__name__)

@@ -91,7 +90,7 @@ def enabled(cls) -> bool:
         compilation_config = get_current_vllm_config().compilation_config
         custom_ops = compilation_config.custom_ops
         if not hasattr(cls, "name"):
-            print_warning_once(
+            logger.warning_once(
                 f"Custom op {cls.__name__} was not registered, "
                 f"which means it won't appear in the op registry. "
                 f"It will be enabled/disabled based on the global settings.")

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 5 additions & 3 deletions
@@ -8,6 +8,7 @@

 import vllm.model_executor.layers.fused_moe  # noqa
 from vllm import _custom_ops as ops
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                   FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
@@ -16,7 +17,8 @@
     all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import print_warning_once
+
+logger = init_logger(__name__)


 class GPTQMarlinState(Enum):
@@ -142,10 +144,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                     "activation scales are None.")
             if (not all_close_1d(layer.w13_input_scale)
                     or not all_close_1d(layer.w2_input_scale)):
-                print_warning_once(
+                logger.warning_once(
                     "Found input_scales that are not equal for "
                     "fp8 MoE layer. Using the maximum across experts "
-                    "for each layer. ")
+                    "for each layer.")
             layer.w13_input_scale = torch.nn.Parameter(
                 layer.w13_input_scale.max(), requires_grad=False)
             layer.w2_input_scale = torch.nn.Parameter(

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 2 additions & 3 deletions
@@ -28,7 +28,6 @@
     PerTensorScaleParameter)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import print_warning_once

 ACTIVATION_SCHEMES = ["static", "dynamic"]

@@ -539,10 +538,10 @@ def process_weights_after_loading(self, layer: Module) -> None:
                     "activation scales are None.")
             if (not all_close_1d(layer.w13_input_scale)
                     or not all_close_1d(layer.w2_input_scale)):
-                print_warning_once(
+                logger.warning_once(
                     "Found input_scales that are not equal for "
                     "fp8 MoE layer. Using the maximum across experts "
-                    "for each layer. ")
+                    "for each layer.")
             layer.w13_input_scale = torch.nn.Parameter(
                 layer.w13_input_scale.max(), requires_grad=False)
             layer.w2_input_scale = torch.nn.Parameter(
