13
13
# limitations under the License.
14
14
15
15
import logging
16
+ import warnings
16
17
from typing import List , Optional , Tuple , Type , Union
17
18
18
19
import numpy
19
20
from pydantic import BaseModel , Field
20
21
21
22
from deepsparse import Pipeline
23
+ from deepsparse .cpu import cpu_avx512_compatible
22
24
from deepsparse .pipeline import DEEPSPARSE_ENGINE
23
25
from deepsparse .transformers .engines import NLDecoderEngine
24
26
from deepsparse .transformers .pipelines import TransformersPipeline
@@ -115,9 +117,17 @@ def __init__(
115
117
# TODO: Set this to 64 once we modify the OPT injection logic
116
118
prompt_processing_sequence_length : int = 128 ,
117
119
force_max_tokens : bool = False ,
118
- use_deepsparse_cache : bool = False ,
120
+ use_deepsparse_cache : bool = True ,
119
121
** kwargs ,
120
122
):
123
+ if not cpu_avx512_compatible () and kwargs ["engine_type" ] == DEEPSPARSE_ENGINE :
124
+ warnings .warn (
125
+ "AVX512 support not detected, disabling internal management "
126
+ "of KV cache which may affect performance. To enable full "
127
+ "performance, deploy on an AVX512-compatible system."
128
+ )
129
+ use_deepsparse_cache = False
130
+
121
131
if use_deepsparse_cache :
122
132
if kwargs ["engine_type" ] != DEEPSPARSE_ENGINE :
123
133
raise ValueError (
@@ -126,10 +136,6 @@ def __init__(
126
136
f"is { kwargs ['engine_type' ]} . "
127
137
f"Make sure to set `engine_type` to { DEEPSPARSE_ENGINE } "
128
138
)
129
- raise NotImplementedError (
130
- "The deepsparse kv cache is not yet "
131
- "supported for text generation pipelines"
132
- )
133
139
134
140
super ().__init__ (
135
141
** kwargs , _delay_engine_initialize = True , _delay_overwriting_inputs = True
0 commit comments