13
13
# limitations under the License.
14
14
15
15
import logging
16
+ import warnings
16
17
from typing import List , Optional , Tuple , Type , Union
17
18
18
19
import numpy
19
20
from pydantic import BaseModel , Field
20
21
21
22
from deepsparse import Pipeline
23
+ from deepsparse .cpu import cpu_avx512_compatible
22
24
from deepsparse .pipeline import DEEPSPARSE_ENGINE
23
25
from deepsparse .transformers .engines import NLDecoderEngine
24
26
from deepsparse .transformers .pipelines import TransformersPipeline
@@ -115,9 +117,19 @@ def __init__(
115
117
# TODO: Set this to 64 once we modify the OPT injection logic
116
118
prompt_processing_sequence_length : int = 128 ,
117
119
force_max_tokens : bool = False ,
118
- use_deepsparse_cache : bool = False ,
120
+ use_deepsparse_cache : bool = True ,
119
121
** kwargs ,
120
122
):
123
+ print (cpu_avx512_compatible ())
124
+ if not cpu_avx512_compatible () and kwargs ["engine_type" ] == DEEPSPARSE_ENGINE :
125
+ warnings .warn (
126
+ "Detected CPU is not AVX512 compatible. "
127
+ "The kv cache management will not be supported "
128
+ "by the optimized engine. The user may experience "
129
+ "non optimal performance."
130
+ )
131
+ use_deepsparse_cache = False
132
+
121
133
if use_deepsparse_cache :
122
134
if kwargs ["engine_type" ] != DEEPSPARSE_ENGINE :
123
135
raise ValueError (
@@ -126,10 +138,6 @@ def __init__(
126
138
f"is { kwargs ['engine_type' ]} . "
127
139
f"Make sure to set `engine_type` to { DEEPSPARSE_ENGINE } "
128
140
)
129
- raise NotImplementedError (
130
- "The deepsparse kv cache is not yet "
131
- "supported for text generation pipelines"
132
- )
133
141
134
142
super ().__init__ (
135
143
** kwargs , _delay_engine_initialize = True , _delay_overwriting_inputs = True
0 commit comments