Commit 16f9409

[1.7 Cherry Picks] (#1572)
* [server] Disable the elastic scheduler when continuous batching is enabled (#1569)
  * update server to disable the context/elastic scheduler when continuous batching is enabled
  * clean up when context is created
* [TextGeneration] Fix initialization; don't try v1 init for text gen (#1571)
  * only check capacity condition during prefill; already have a check in generation
  * don't try v1 if running text gen; just raise the error
1 parent 73d0471 commit 16f9409

5 files changed: +49 −17 lines


src/deepsparse/pipeline.py (+5 −1)
@@ -27,6 +27,7 @@
     SchedulerGroup,
 )
 from deepsparse.subgraph_execute import SubGraphExecutor
+from deepsparse.tasks import SupportedTasks
 from deepsparse.utils import InferenceState, PipelineState
 from deepsparse.utils.subgraph import SubGraph
 from deepsparse.utils.time import TIMER_KEY, InferenceStages, TimerManager
@@ -139,7 +140,10 @@ def create(cls, task: str, **kwargs) -> "Pipeline":
                 "Pipeline was not created for the given task. The "
                 "provided task should be registered using the OperatorRegistry"
             )
-        except Exception:
+        except Exception as e:
+            if SupportedTasks.is_text_generation(task):
+                raise e
+
             _LOGGER.warning(f"Could not create v2 '{task}' pipeline, trying legacy")
             from deepsparse.legacy import Pipeline
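The effect of this change: a failed v2 pipeline build used to fall back silently to the legacy (v1) pipeline for every task, which masked real initialization errors for text generation. A minimal sketch of the new control flow, where _build_v2_pipeline is a hypothetical stand-in for the body of the try block:

    from deepsparse.tasks import SupportedTasks

    def create(task: str, **kwargs):
        try:
            # _build_v2_pipeline is a hypothetical stand-in for the real
            # v2 construction logic inside the try block
            return _build_v2_pipeline(task, **kwargs)
        except Exception as e:
            # Text generation no longer falls back to v1: surface the error
            if SupportedTasks.is_text_generation(task):
                raise e

            # All other tasks still retry with the legacy pipeline
            from deepsparse.legacy import Pipeline
            return Pipeline.create(task, **kwargs)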

src/deepsparse/server/deepsparse_server.py (+23 −4)
@@ -13,6 +13,7 @@
 # limitations under the License.

 import logging
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial

 from deepsparse import Pipeline
@@ -73,12 +74,30 @@ def _add_endpoint(
         endpoint_config: EndpointConfig,
     ):
         pipeline_config = endpoint_config.to_pipeline_config()
-        pipeline_config.kwargs["executor"] = self.executor

         _LOGGER.info(f"Initializing pipeline for '{endpoint_config.name}'")
-        pipeline = Pipeline.from_config(
-            pipeline_config, context=self.context, logger=self.server_logger
-        )
+        if pipeline_config.kwargs.get("continuous_batch_sizes"):
+            pipeline_config.kwargs["executor"] = ThreadPoolExecutor(
+                max_workers=self.server_config.num_workers
+            )
+            _LOGGER.info(
+                "for continuous batching, the single stream scheduler will be enabled."
+            )
+            pipeline_config.num_cores = self.server_config.num_cores
+            pipeline_config.scheduler = "single"
+
+            pipeline = Pipeline.from_config(
+                pipeline_config,
+                num_streams=self.server_config.num_workers,
+                logger=self.server_logger,
+            )
+        else:
+            pipeline_config.kwargs["executor"] = ThreadPoolExecutor(
+                max_workers=self.context.num_streams
+            )
+            pipeline = Pipeline.from_config(
+                pipeline_config, context=self.context, logger=self.server_logger
+            )

         _LOGGER.info(f"Adding endpoints for '{endpoint_config.name}'")
         self._add_inference_endpoints(
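Taken together, endpoint setup now picks one of two executor/scheduler configurations. A condensed sketch of the decision as a standalone function, assuming the ServerConfig fields num_cores and num_workers used in the diff:

    from concurrent.futures import ThreadPoolExecutor
    from deepsparse import Pipeline

    def build_endpoint_pipeline(pipeline_config, server_config, context, logger):
        if pipeline_config.kwargs.get("continuous_batch_sizes"):
            # Continuous batching: bypass the shared elastic Context, force
            # the single-stream scheduler, and size a dedicated worker pool
            pipeline_config.kwargs["executor"] = ThreadPoolExecutor(
                max_workers=server_config.num_workers
            )
            pipeline_config.num_cores = server_config.num_cores
            pipeline_config.scheduler = "single"
            return Pipeline.from_config(
                pipeline_config,
                num_streams=server_config.num_workers,
                logger=logger,
            )
        # Default path: share the server-wide Context and match its stream count
        pipeline_config.kwargs["executor"] = ThreadPoolExecutor(
            max_workers=context.num_streams
        )
        return Pipeline.from_config(pipeline_config, context=context, logger=logger)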

src/deepsparse/server/openai_server.py (+13 −1)
@@ -376,7 +376,19 @@ def _add_model(
                 f"{SupportedTasks.code_generation._fields}"
             )

-        pipeline = Pipeline.from_config(pipeline_config, context=self.context)
+        if pipeline_config.kwargs.get("continuous_batch_sizes"):
+            _LOGGER.info(
+                "for continuous batching, the single stream scheduler will be enabled."
+            )
+            pipeline_config.num_cores = self.server_config.num_cores
+            pipeline_config.scheduler = "single"
+
+            pipeline = Pipeline.from_config(
+                pipeline_config,
+                num_streams=self.server_config.num_workers,
+            )
+        else:
+            pipeline = Pipeline.from_config(pipeline_config, context=self.context)

         if not self.model_to_pipeline.get(endpoint_config.model):
             model_card = ModelCard(
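For illustration only, an endpoint kwargs fragment that would take the continuous-batching branch in both servers; the branch keys off the presence of a truthy "continuous_batch_sizes" entry, and the batch-size values here are made up:

    # Hypothetical values; only the presence of a truthy
    # "continuous_batch_sizes" entry matters for the branch above
    pipeline_config.kwargs = {"continuous_batch_sizes": [4, 8, 16]}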

src/deepsparse/server/server.py (+4 −10)
@@ -16,7 +16,6 @@
 import os
 from abc import abstractmethod
 from collections import Counter
-from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy
 from typing import AsyncGenerator, List, Optional, Union

@@ -76,10 +75,11 @@ def __init__(self, server_config: Union[str, ServerConfig]):
         self.server_config = server_config

         _LOGGER.info(f"Using config: {repr(self.server_config)}")
-
-        self.context = None
-        self.executor = None
         self.server_logger = server_logger_from_config(self.server_config)
+        self.context = Context(
+            num_cores=self.server_config.num_cores,
+            num_streams=self.server_config.num_workers,
+        )

     def start_server(
         self,
@@ -109,12 +109,6 @@ def start_server(
             self.config_path, f"http://{host}:{port}/endpoints", 0.5
         )

-        self.context = Context(
-            num_cores=self.server_config.num_cores,
-            num_streams=self.server_config.num_workers,
-        )
-        self.executor = ThreadPoolExecutor(max_workers=self.context.num_streams)
-
         app = self._build_app()

         uvicorn.run(
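The net effect in server.py: the Context moves from start_server into __init__, so it exists before any endpoints are added, and the server-wide executor is gone (each endpoint now builds its own, per deepsparse_server.py above). An abridged sketch of the resulting constructor, with server_logger_from_config as used in the surrounding code:

    from deepsparse import Context

    class Server:
        def __init__(self, server_config):
            self.server_config = server_config
            self.server_logger = server_logger_from_config(server_config)
            # Created eagerly so endpoint setup can use it before start_server()
            self.context = Context(
                num_cores=server_config.num_cores,
                num_streams=server_config.num_workers,
            )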

src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py (+4 −1)
@@ -51,7 +51,10 @@ def can_operate(self, inp: Any) -> bool:
         if inp.get("in_generation"):
             return True

-        if kv_cache.total_num_processed_tokens >= kv_cache.capacity:
+        if (
+            kv_cache.total_num_processed_tokens >= kv_cache.capacity
+            and inp.get("in_generation") is None
+        ):
             raise RuntimeError(
                 "Not enough kv_cache capacity to run generation. Please use a "
                 "sequence_length or a shorter prompt"
