
Commit b87cd4f

DarkLight1337 authored and rasmith committed
[Misc] Remove deprecated code (vllm-project#12383)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent f548527 commit b87cd4f

File tree: 6 files changed, +25 -78 lines changed


tests/async_engine/test_api_server.py

Lines changed: 14 additions & 9 deletions
@@ -25,27 +25,32 @@ def _query_server_long(prompt: str) -> dict:
 
 
 @pytest.fixture
-def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
+def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
     script_path = Path(__file__).parent.joinpath(
         "api_server_async_engine.py").absolute()
     commands = [
-        sys.executable, "-u",
-        str(script_path), "--model", "facebook/opt-125m", "--host",
-        "127.0.0.1", "--tokenizer-pool-size",
-        str(tokenizer_pool_size)
+        sys.executable,
+        "-u",
+        str(script_path),
+        "--model",
+        "facebook/opt-125m",
+        "--host",
+        "127.0.0.1",
+        "--tokenizer-pool-size",
+        str(tokenizer_pool_size),
+        "--distributed-executor-backend",
+        distributed_executor_backend,
     ]
 
-    if worker_use_ray:
-        commands.append("--worker-use-ray")
     uvicorn_process = subprocess.Popen(commands)
     yield
     uvicorn_process.terminate()
 
 
 @pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
-@pytest.mark.parametrize("worker_use_ray", [False, True])
+@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
 def test_api_server(api_server, tokenizer_pool_size: int,
-                    worker_use_ray: bool):
+                    distributed_executor_backend: str):
     """
     Run the API server and test it.
 
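For reference, a minimal sketch of launching the same test helper by hand now that --worker-use-ray is gone; the script name, model, and flag values mirror the fixture above, and the backend value ("mp" here) is assumed to be one of the newly parametrized options:

import subprocess
import sys

# Hypothetical manual invocation mirroring the api_server fixture above;
# the Ray toggle is now expressed via --distributed-executor-backend.
proc = subprocess.Popen([
    sys.executable, "-u", "api_server_async_engine.py",
    "--model", "facebook/opt-125m",
    "--host", "127.0.0.1",
    "--tokenizer-pool-size", "0",
    "--distributed-executor-backend", "mp",
])
try:
    ...  # exercise the server
finally:
    proc.terminate()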

tests/basic_correctness/test_preemption.py

Lines changed: 9 additions & 9 deletions
@@ -29,10 +29,10 @@ def check_settings():
 
 
 @pytest.fixture
-def worker_use_ray() -> bool:
-    # When SPMD worker is used, use ray_use_worker=True
+def distributed_executor_backend() -> str:
+    # When SPMD worker is used, use distributed_executor_backend="ray"
     # to test delta input optimization works with preemption.
-    return envs.VLLM_USE_RAY_SPMD_WORKER
+    return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp"
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -47,7 +47,7 @@ def test_chunked_prefill_recompute(
     dtype: str,
     max_tokens: int,
     chunked_prefill_token_size: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
     """Ensure that chunked prefill works with preemption."""
     max_num_seqs = min(chunked_prefill_token_size, 256)
@@ -66,7 +66,7 @@ def test_chunked_prefill_recompute(
             max_num_batched_tokens=max_num_batched_tokens,
             enable_chunked_prefill=enable_chunked_prefill,
             max_num_seqs=max_num_seqs,
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
             disable_log_stats=False,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -93,7 +93,7 @@ def test_preemption(
     model: str,
     dtype: str,
     max_tokens: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
     """By default, recompute preemption is enabled"""
 
@@ -104,7 +104,7 @@ def test_preemption(
             model,
             dtype=dtype,
             disable_log_stats=False,
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
         assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
@@ -144,7 +144,7 @@ def test_preemption_infeasible(
     model: str,
     dtype: str,
     max_tokens: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
     """Verify infeasible preemption request will be ignored."""
     BLOCK_SIZE = 16
@@ -159,7 +159,7 @@ def test_preemption_infeasible(
             # ignored instead of hanging forever.
             num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
             max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
     ) as vllm_model:
         sampling_params = SamplingParams(max_tokens=max_tokens,
                                          ignore_eos=True)
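The same migration applies to code that builds an engine directly: pass distributed_executor_backend instead of the removed worker_use_ray flag. A minimal sketch, assuming the vllm.LLM entry point forwards engine keyword arguments the way these test runners do:

from vllm import LLM

# Before this change: LLM(model=..., worker_use_ray=True)
# After: name the executor backend explicitly ("mp" or "ray").
llm = LLM(
    model="facebook/opt-125m",
    distributed_executor_backend="mp",
)
outputs = llm.generate("Hello, my name is")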

tests/multi_step/test_correctness_async_llm.py

Lines changed: 2 additions & 1 deletion
@@ -16,7 +16,8 @@
 NUM_PROMPTS = [10]
 
 DEFAULT_SERVER_ARGS: List[str] = [
-    "--worker-use-ray",
+    "--distributed-executor-backend",
+    "ray",
     "--gpu-memory-utilization",
     "0.85",
     "--swap-space",

vllm/config.py

Lines changed: 0 additions & 10 deletions
@@ -1227,9 +1227,6 @@ class ParallelConfig:
     pipeline_parallel_size: int = 1  # Number of pipeline parallel groups.
     tensor_parallel_size: int = 1  # Number of tensor parallel groups.
 
-    # Deprecated, use distributed_executor_backend instead.
-    worker_use_ray: Optional[bool] = None
-
     # Maximum number of multiple batches
     # when load model sequentially. To avoid RAM OOM when using tensor
     # parallel and large models.
@@ -1283,13 +1280,6 @@ def __post_init__(self) -> None:
         self.world_size = self.pipeline_parallel_size * \
             self.tensor_parallel_size
 
-        if self.worker_use_ray:
-            if self.distributed_executor_backend is None:
-                self.distributed_executor_backend = "ray"
-            elif not self.use_ray:
-                raise ValueError(f"worker-use-ray can't be used with "
-                                 f"distributed executor backend "
-                                 f"'{self.distributed_executor_backend}'.")
         ray_only_devices = ["tpu"]
         from vllm.platforms import current_platform
         if (current_platform.device_type in ray_only_devices
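With the back-compat shim removed from ParallelConfig.__post_init__, callers that construct the config directly must name the backend themselves. A minimal sketch, assuming ParallelConfig keeps the dataclass fields shown in this diff:

from vllm.config import ParallelConfig

# worker_use_ray=True is no longer accepted; state the backend explicitly.
parallel_config = ParallelConfig(
    pipeline_parallel_size=1,
    tensor_parallel_size=2,
    distributed_executor_backend="ray",  # previously implied by worker_use_ray
)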

vllm/engine/arg_utils.py

Lines changed: 0 additions & 6 deletions
@@ -100,7 +100,6 @@ class EngineArgs:
     kv_cache_dtype: str = 'auto'
     seed: int = 0
     max_model_len: Optional[int] = None
-    worker_use_ray: bool = False
     # Note: Specifying a custom executor backend by passing a class
     # is intended for expert use only. The API may change without
     # notice.
@@ -389,10 +388,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             'to "ray" if Ray is installed and fail otherwise. Note that tpu '
             'only supports Ray for distributed inference.')
 
-        parser.add_argument(
-            '--worker-use-ray',
-            action='store_true',
-            help='Deprecated, use ``--distributed-executor-backend=ray``.')
         parser.add_argument('--pipeline-parallel-size',
                             '-pp',
                             type=int,
@@ -1071,7 +1066,6 @@ def create_engine_config(self,
         parallel_config = ParallelConfig(
            pipeline_parallel_size=self.pipeline_parallel_size,
            tensor_parallel_size=self.tensor_parallel_size,
-           worker_use_ray=self.worker_use_ray,
            max_parallel_loading_workers=self.max_parallel_loading_workers,
            disable_custom_all_reduce=self.disable_custom_all_reduce,
            tokenizer_pool_config=TokenizerPoolConfig.create_config(
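Because --worker-use-ray is deleted rather than kept as a deprecated alias, command lines must be updated as well. A minimal sketch of building EngineArgs from the new flag, assuming the FlexibleArgumentParser and EngineArgs CLI helpers referenced in this file keep their current signatures:

from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)

# Old: ["--worker-use-ray"]  (now rejected as an unknown argument)
args = parser.parse_args([
    "--model", "facebook/opt-125m",
    "--distributed-executor-backend", "ray",
])
engine_args = EngineArgs.from_cli_args(args)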

vllm/engine/metrics.py

Lines changed: 0 additions & 43 deletions
@@ -259,21 +259,6 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
                 documentation="Number of emitted tokens.",
                 labelnames=labelnames))
 
-        # Deprecated in favor of vllm:prompt_tokens_total
-        self.gauge_avg_prompt_throughput = self._gauge_cls(
-            name="vllm:avg_prompt_throughput_toks_per_s",
-            documentation="Average prefill throughput in tokens/s.",
-            labelnames=labelnames,
-            multiprocess_mode="sum",
-        )
-        # Deprecated in favor of vllm:generation_tokens_total
-        self.gauge_avg_generation_throughput = self._gauge_cls(
-            name="vllm:avg_generation_throughput_toks_per_s",
-            documentation="Average generation throughput in tokens/s.",
-            labelnames=labelnames,
-            multiprocess_mode="sum",
-        )
-
 
 # end-metrics-definitions
 
@@ -635,20 +620,6 @@ def _log_prometheus(self, stats: Stats) -> None:
         self._log_histogram(self.metrics.histogram_max_tokens_request,
                             stats.max_tokens_requests)
 
-    def _log_prometheus_interval(self, prompt_throughput: float,
-                                 generation_throughput: float) -> None:
-        # Logs metrics to prometheus that are computed every logging_interval.
-        # Support legacy gauge metrics that make throughput calculations on
-        # the vLLM side. Moving forward, we should use counters like
-        # counter_prompt_tokens, counter_generation_tokens
-        # Which log raw data and calculate summaries using rate() on the
-        # grafana/prometheus side. See
-        # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666
-        self.metrics.gauge_avg_prompt_throughput.labels(
-            **self.labels).set(prompt_throughput)
-        self.metrics.gauge_avg_generation_throughput.labels(
-            **self.labels).set(generation_throughput)
-
     def log(self, stats: Stats):
         """Logs to prometheus and tracked stats every iteration."""
         # Log to prometheus.
@@ -664,20 +635,6 @@ def log(self, stats: Stats):
         # Log locally every local_interval seconds.
         if local_interval_elapsed(stats.now, self.last_local_log,
                                   self.local_interval):
-            # Compute summary metrics for tracked stats (and log them
-            # to promethus if applicable).
-            prompt_throughput = get_throughput(self.num_prompt_tokens,
-                                               now=stats.now,
-                                               last_log=self.last_local_log)
-            generation_throughput = get_throughput(
-                self.num_generation_tokens,
-                now=stats.now,
-                last_log=self.last_local_log)
-
-            self._log_prometheus_interval(
-                prompt_throughput=prompt_throughput,
-                generation_throughput=generation_throughput)
-
             if self.spec_decode_metrics is not None:
                 self._log_gauge(
                     self.metrics.gauge_spec_decode_draft_acceptance_rate,
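The removed gauges computed throughput inside vLLM; the comment deleted above points users at the token counters plus rate() on the Prometheus/Grafana side instead. An illustrative sketch of equivalent queries, where the 1-minute window is an assumption to tune against your scrape interval:

# PromQL expressions (held in Python strings here only for illustration) that
# replace vllm:avg_prompt_throughput_toks_per_s and
# vllm:avg_generation_throughput_toks_per_s.
PROMPT_THROUGHPUT_TOKS_PER_S = "rate(vllm:prompt_tokens_total[1m])"
GENERATION_THROUGHPUT_TOKS_PER_S = "rate(vllm:generation_tokens_total[1m])"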
