
Commit cb5149d

Remove token-counting library for conversation history truncation (#2449)
* Simplify RenderedPrompt class
* Fix reference
* Remove (EXAMPLE) from prompts
* Remove long history test
* Fix test with missing async, run on Windows too
* Remove Windows again as a few tests fail
1 parent 236b592 commit cb5149d

15 files changed: +30 −390 lines
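In short: the approaches no longer pull in openai_messages_token_helper. The RenderedPrompt dataclass and the build_messages/get_token_limit plumbing are deleted, render_prompt now returns the prompty-rendered message list unchanged, and the response caps (100 tokens for query rewriting, 1024 for answers) are inlined at the call sites. A minimal sketch of the resulting flow follows; the client setup, model name, and prompt path are illustrative placeholders, not values taken from this commit.

import asyncio

import prompty
from openai import AsyncOpenAI


async def main() -> None:
    client = AsyncOpenAI()  # placeholder: the repo actually configures Azure OpenAI clients
    # render_prompt is now a thin wrapper over prompty.prepare: no token
    # counting, no history truncation, just the rendered message list.
    prompt = prompty.load("prompts/chat_query_rewrite.prompty")  # illustrative path
    messages = prompty.prepare(prompt, {"user_query": "What are my health plans?", "past_messages": []})
    chat_completion = await client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model/deployment name
        messages=messages,
        temperature=0.0,
        max_tokens=100,  # literal cap replaces the old query_response_token_limit variable
        n=1,
    )
    print(chat_completion.choices[0].message.content)


asyncio.run(main())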

Diff for: .github/workflows/python-test.yaml

+1 −1

@@ -59,7 +59,7 @@ jobs:
         run: black . --check --verbose
       - name: Run Python tests
         if: runner.os != 'Windows'
-        run: pytest -s -vv --cov --cov-fail-under=86
+        run: pytest -s -vv --cov --cov-fail-under=89
       - name: Run E2E tests with Playwright
         id: e2e
         if: runner.os != 'Windows'
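Note: raising --cov-fail-under from 86 to 89 is consistent with this commit deleting a large, well-tested code path (the RenderedPrompt parsing below); presumably the remaining suite covers a higher fraction of the smaller codebase. Per the commit message, the tests were briefly enabled on Windows and then excluded again because a few of them fail there.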

Diff for: app/backend/approaches/approach.py

+2 −6

@@ -92,10 +92,6 @@ class ThoughtStep:
 
 class Approach(ABC):
 
-    # Allows usage of non-GPT model even if no tokenizer is available for accurate token counting
-    # Useful for using local small language models, for example
-    ALLOW_NON_GPT_MODELS = True
-
     def __init__(
         self,
         search_client: SearchClient,
@@ -147,8 +143,8 @@ async def search(
         use_vector_search: bool,
         use_semantic_ranker: bool,
         use_semantic_captions: bool,
-        minimum_search_score: Optional[float],
-        minimum_reranker_score: Optional[float],
+        minimum_search_score: Optional[float] = None,
+        minimum_reranker_score: Optional[float] = None,
         use_query_rewriting: Optional[bool] = None,
     ) -> List[Document]:
         search_text = query_text if use_text_search else ""
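Two things happen here: the ALLOW_NON_GPT_MODELS flag disappears because only the removed token-limit lookup consumed it, and the two score thresholds on search gain None defaults so call sites that do not filter by score can omit them. A standalone sketch of that defaulting pattern (not the repository's actual search implementation):

from typing import List, Optional


def filter_by_scores(
    results: List[dict],
    minimum_search_score: Optional[float] = None,
    minimum_reranker_score: Optional[float] = None,
) -> List[dict]:
    # None means "no threshold": callers may now simply omit either argument.
    if minimum_search_score is not None:
        results = [r for r in results if r.get("score", 0.0) >= minimum_search_score]
    if minimum_reranker_score is not None:
        results = [r for r in results if r.get("reranker_score", 0.0) >= minimum_reranker_score]
    return results


print(filter_by_scores([{"score": 0.4}, {"score": 0.9}], minimum_search_score=0.5))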

Diff for: app/backend/approaches/chatreadretrieveread.py

+4 −28

@@ -9,7 +9,6 @@
     ChatCompletionMessageParam,
     ChatCompletionToolParam,
 )
-from openai_messages_token_helper import build_messages, get_token_limit
 
 from approaches.approach import ThoughtStep
 from approaches.chatapproach import ChatApproach
@@ -53,7 +52,6 @@ def __init__(
         self.content_field = content_field
         self.query_language = query_language
         self.query_speller = query_speller
-        self.chatgpt_token_limit = get_token_limit(chatgpt_model, default_to_minimum=self.ALLOW_NON_GPT_MODELS)
         self.prompt_manager = prompt_manager
         self.query_rewrite_prompt = self.prompt_manager.load_prompt("chat_query_rewrite.prompty")
         self.query_rewrite_tools = self.prompt_manager.load_tools("chat_query_rewrite_tools.json")
@@ -99,30 +97,18 @@ async def run_until_final_call(
         if not isinstance(original_user_query, str):
             raise ValueError("The most recent message content must be a string.")
 
-        rendered_query_prompt = self.prompt_manager.render_prompt(
+        query_messages = self.prompt_manager.render_prompt(
             self.query_rewrite_prompt, {"user_query": original_user_query, "past_messages": messages[:-1]}
         )
         tools: List[ChatCompletionToolParam] = self.query_rewrite_tools
 
         # STEP 1: Generate an optimized keyword search query based on the chat history and the last question
-        query_response_token_limit = 100
-        query_messages = build_messages(
-            model=self.chatgpt_model,
-            system_prompt=rendered_query_prompt.system_content,
-            few_shots=rendered_query_prompt.few_shot_messages,
-            past_messages=rendered_query_prompt.past_messages,
-            new_user_content=rendered_query_prompt.new_user_content,
-            tools=tools,
-            max_tokens=self.chatgpt_token_limit - query_response_token_limit,
-            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
-        )
-
         chat_completion: ChatCompletion = await self.openai_client.chat.completions.create(
             messages=query_messages,  # type: ignore
             # Azure OpenAI takes the deployment name as the model name
             model=self.chatgpt_deployment if self.chatgpt_deployment else self.chatgpt_model,
             temperature=0.0,  # Minimize creativity for search query generation
-            max_tokens=query_response_token_limit,  # Setting too low risks malformed JSON, setting too high may affect performance
+            max_tokens=100,  # Setting too low risks malformed JSON, setting too high may affect performance
             n=1,
             tools=tools,
             seed=seed,
@@ -153,7 +139,7 @@ async def run_until_final_call(
 
         # STEP 3: Generate a contextual and content specific answer using the search results and chat history
         text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=False)
-        rendered_answer_prompt = self.prompt_manager.render_prompt(
+        messages = self.prompt_manager.render_prompt(
             self.answer_prompt,
             self.get_system_prompt_variables(overrides.get("prompt_template"))
             | {
@@ -164,16 +150,6 @@ async def run_until_final_call(
             },
         )
 
-        response_token_limit = 1024
-        messages = build_messages(
-            model=self.chatgpt_model,
-            system_prompt=rendered_answer_prompt.system_content,
-            past_messages=rendered_answer_prompt.past_messages,
-            new_user_content=rendered_answer_prompt.new_user_content,
-            max_tokens=self.chatgpt_token_limit - response_token_limit,
-            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
-        )
-
         extra_info = {
             "data_points": {"text": text_sources},
             "thoughts": [
@@ -220,7 +196,7 @@ async def run_until_final_call(
             model=self.chatgpt_deployment if self.chatgpt_deployment else self.chatgpt_model,
             messages=messages,
             temperature=overrides.get("temperature", 0.3),
-            max_tokens=response_token_limit,
+            max_tokens=1024,
             n=1,
             stream=should_stream,
             seed=seed,
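One behavioral consequence: build_messages previously trimmed past_messages so that the prompt fit under get_token_limit(chatgpt_model) minus the reserved response tokens. With it gone, the conversation history is sent as-is, and a context-window overflow surfaces as an API error rather than being silently truncated (which is why the long-history test was removed). A hedged, self-contained sketch of where a caller would now see that; the function and its arguments are illustrative, not code from this commit:

from openai import AsyncOpenAI, BadRequestError


async def rewrite_query(client: AsyncOpenAI, query_messages: list, model: str) -> str:
    try:
        chat_completion = await client.chat.completions.create(
            messages=query_messages,
            model=model,
            temperature=0.0,
            max_tokens=100,
            n=1,
        )
        return chat_completion.choices[0].message.content or "0"
    except BadRequestError:
        # Without client-side token counting, an over-long history now
        # reaches the caller as an error instead of being trimmed upstream.
        raise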

Diff for: app/backend/approaches/chatreadretrievereadvision.py

+8 −32

@@ -9,7 +9,6 @@
     ChatCompletionMessageParam,
     ChatCompletionToolParam,
 )
-from openai_messages_token_helper import build_messages, get_token_limit
 
 from approaches.approach import ThoughtStep
 from approaches.chatapproach import ChatApproach
@@ -64,7 +63,6 @@ def __init__(
         self.query_speller = query_speller
         self.vision_endpoint = vision_endpoint
         self.vision_token_provider = vision_token_provider
-        self.chatgpt_token_limit = get_token_limit(gpt4v_model, default_to_minimum=self.ALLOW_NON_GPT_MODELS)
         self.prompt_manager = prompt_manager
         self.query_rewrite_prompt = self.prompt_manager.load_prompt("chat_query_rewrite.prompty")
         self.query_rewrite_tools = self.prompt_manager.load_tools("chat_query_rewrite_tools.json")
@@ -97,30 +95,18 @@ async def run_until_final_call(
             raise ValueError("The most recent message content must be a string.")
 
         # Use prompty to prepare the query prompt
-        rendered_query_prompt = self.prompt_manager.render_prompt(
+        query_messages = self.prompt_manager.render_prompt(
             self.query_rewrite_prompt, {"user_query": original_user_query, "past_messages": messages[:-1]}
         )
         tools: List[ChatCompletionToolParam] = self.query_rewrite_tools
 
         # STEP 1: Generate an optimized keyword search query based on the chat history and the last question
-        query_response_token_limit = 100
-        query_model = self.chatgpt_model
-        query_deployment = self.chatgpt_deployment
-        query_messages = build_messages(
-            model=query_model,
-            system_prompt=rendered_query_prompt.system_content,
-            few_shots=rendered_query_prompt.few_shot_messages,
-            past_messages=rendered_query_prompt.past_messages,
-            new_user_content=rendered_query_prompt.new_user_content,
-            max_tokens=self.chatgpt_token_limit - query_response_token_limit,
-        )
-
         chat_completion: ChatCompletion = await self.openai_client.chat.completions.create(
             messages=query_messages,
             # Azure OpenAI takes the deployment name as the model name
-            model=query_deployment if query_deployment else query_model,
+            model=self.chatgpt_deployment if self.chatgpt_deployment else self.chatgpt_model,
             temperature=0.0,  # Minimize creativity for search query generation
-            max_tokens=query_response_token_limit,
+            max_tokens=100,
             n=1,
             tools=tools,
             seed=seed,
@@ -166,7 +152,7 @@ async def run_until_final_call(
             if url:
                 image_sources.append(url)
 
-        rendered_answer_prompt = self.prompt_manager.render_prompt(
+        messages = self.prompt_manager.render_prompt(
            self.answer_prompt,
             self.get_system_prompt_variables(overrides.get("prompt_template"))
             | {
@@ -178,16 +164,6 @@ async def run_until_final_call(
             },
         )
 
-        response_token_limit = 1024
-        messages = build_messages(
-            model=self.gpt4v_model,
-            system_prompt=rendered_answer_prompt.system_content,
-            past_messages=rendered_answer_prompt.past_messages,
-            new_user_content=rendered_answer_prompt.new_user_content,
-            max_tokens=self.chatgpt_token_limit - response_token_limit,
-            fallback_to_default=self.ALLOW_NON_GPT_MODELS,
-        )
-
         extra_info = {
             "data_points": {
                 "text": text_sources,
@@ -198,9 +174,9 @@ async def run_until_final_call(
                 "Prompt to generate search query",
                 query_messages,
                 (
-                    {"model": query_model, "deployment": query_deployment}
-                    if query_deployment
-                    else {"model": query_model}
+                    {"model": self.chatgpt_model, "deployment": self.chatgpt_deployment}
+                    if self.chatgpt_deployment
+                    else {"model": self.chatgpt_model}
                 ),
             ),
             ThoughtStep(
@@ -236,7 +212,7 @@ async def run_until_final_call(
             model=self.gpt4v_deployment if self.gpt4v_deployment else self.gpt4v_model,
             messages=messages,
             temperature=overrides.get("temperature", 0.3),
-            max_tokens=response_token_limit,
+            max_tokens=1024,
             n=1,
             stream=should_stream,
             seed=seed,
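Note: this is the same simplification as in chatreadretrieveread.py, plus one inlining. The query_model/query_deployment locals always mirrored self.chatgpt_model/self.chatgpt_deployment, so the ThoughtStep metadata and the completion call now read those attributes directly.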

Diff for: app/backend/approaches/promptmanager.py

+3 −48

@@ -1,20 +1,10 @@
 import json
 import pathlib
-from dataclasses import dataclass
 
 import prompty
 from openai.types.chat import ChatCompletionMessageParam
 
 
-@dataclass
-class RenderedPrompt:
-    all_messages: list[ChatCompletionMessageParam]
-    system_content: str
-    few_shot_messages: list[ChatCompletionMessageParam]
-    past_messages: list[ChatCompletionMessageParam]
-    new_user_content: str
-
-
 class PromptManager:
 
     def load_prompt(self, path: str):
@@ -23,7 +13,7 @@ def load_prompt(self, path: str):
     def load_tools(self, path: str):
         raise NotImplementedError
 
-    def render_prompt(self, prompt, data) -> RenderedPrompt:
+    def render_prompt(self, prompt, data) -> list[ChatCompletionMessageParam]:
         raise NotImplementedError
 
 
@@ -37,40 +27,5 @@ def load_prompt(self, path: str):
     def load_tools(self, path: str):
         return json.loads(open(self.PROMPTS_DIRECTORY / path).read())
 
-    def render_prompt(self, prompt, data) -> RenderedPrompt:
-        # Assumes that the first message is the system message, the last message is the user message,
-        # and the messages in-between are either examples or past messages.
-
-        all_messages: list = prompty.prepare(prompt, data)
-        remaining_messages = all_messages.copy()
-
-        system_content = None
-        if all_messages[0]["role"] == "system":
-            system_content = all_messages[0]["content"]
-            remaining_messages.pop(0)
-        else:
-            raise ValueError("The first message in the prompt must be a system message.")
-
-        new_user_content = None
-        if all_messages[-1]["role"] == "user":
-            new_user_content = all_messages[-1]["content"]
-            remaining_messages.pop(-1)
-        else:
-            raise ValueError("The last message in the prompt must be a user message.")
-
-        few_shot_messages = []
-        past_messages = []
-        for user_message, assistant_message in zip(remaining_messages[0::2], remaining_messages[1::2]):
-            if user_message["content"].startswith("(EXAMPLE)"):
-                user_message["content"] = user_message["content"][9:].lstrip()
-                few_shot_messages.extend([user_message, assistant_message])
-            else:
-                past_messages.extend([user_message, assistant_message])
-
-        return RenderedPrompt(
-            all_messages=all_messages,
-            system_content=system_content,
-            few_shot_messages=few_shot_messages,
-            past_messages=past_messages,
-            new_user_content=new_user_content,
-        )
+    def render_prompt(self, prompt, data) -> list[ChatCompletionMessageParam]:
+        return prompty.prepare(prompt, data)
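This file is the heart of the change: render_prompt no longer splits the rendered conversation into system/few-shot/past/new segments, because that structure only existed to feed build_messages. A sketch of the simplified concrete manager, mirroring the diff; the subclass name, the load_prompt body, and the PROMPTS_DIRECTORY value are assumptions, since the diff does not show them:

import json
import pathlib

import prompty
from openai.types.chat import ChatCompletionMessageParam


class PromptyManager:  # concrete subclass of PromptManager; name assumed
    PROMPTS_DIRECTORY = pathlib.Path(__file__).parent / "prompts"  # assumed location

    def load_prompt(self, path: str):
        return prompty.load(self.PROMPTS_DIRECTORY / path)  # assumed body

    def load_tools(self, path: str):
        return json.loads(open(self.PROMPTS_DIRECTORY / path).read())

    def render_prompt(self, prompt, data) -> list[ChatCompletionMessageParam]:
        return prompty.prepare(prompt, data)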

Diff for: app/backend/approaches/prompts/ask_answer_question.prompty

+1 −1

@@ -23,7 +23,7 @@ If you cannot answer using the sources below, say you don't know. Use below exam
 {% endif %}
 
 user:
-(EXAMPLE) What is the deductible for the employee plan for a visit to Overlake in Bellevue?
+What is the deductible for the employee plan for a visit to Overlake in Bellevue?
 
 Sources:
 info1.txt: deductibles depend on whether you are in-network or out-of-network. In-network deductibles are $500 for employee and $1000 for family. Out-of-network deductibles are $1000 for employee and $2000 for family.

Diff for: app/backend/approaches/prompts/chat_query_rewrite.prompty

+2 −2

@@ -24,13 +24,13 @@ If the question is not in English, translate the question to English before gene
 If you cannot generate a search query, return just the number 0.
 
 user:
-(EXAMPLE) How did crypto do last year?
+How did crypto do last year?
 
 assistant:
 Summarize Cryptocurrency Market Dynamics from last year
 
 user:
-(EXAMPLE) What are my health plans?
+What are my health plans?
 
 assistant:
 Show available health plans
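With the (EXAMPLE) markers gone, nothing distinguishes few-shot turns from real history in the rendered output; they ride along as ordinary conversation messages. Rendering this template would produce a list shaped roughly like the following (system content abbreviated, final user turn invented for illustration):

[
    {"role": "system", "content": "...query-rewriting instructions..."},
    {"role": "user", "content": "How did crypto do last year?"},
    {"role": "assistant", "content": "Summarize Cryptocurrency Market Dynamics from last year"},
    {"role": "user", "content": "What are my health plans?"},
    {"role": "assistant", "content": "Show available health plans"},
    {"role": "user", "content": "Do I need a referral to see a specialist?"},
]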

Diff for: app/backend/approaches/retrievethenread.py

+3 −5

@@ -4,7 +4,6 @@
 from azure.search.documents.models import VectorQuery
 from openai import AsyncOpenAI
 from openai.types.chat import ChatCompletionMessageParam
-from openai_messages_token_helper import get_token_limit
 
 from approaches.approach import Approach, ThoughtStep
 from approaches.promptmanager import PromptManager
@@ -48,7 +47,6 @@ def __init__(
         self.content_field = content_field
         self.query_language = query_language
         self.query_speller = query_speller
-        self.chatgpt_token_limit = get_token_limit(chatgpt_model, self.ALLOW_NON_GPT_MODELS)
         self.prompt_manager = prompt_manager
         self.answer_prompt = self.prompt_manager.load_prompt("ask_answer_question.prompty")
 
@@ -95,7 +93,7 @@
 
         # Process results
         text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=False)
-        rendered_answer_prompt = self.prompt_manager.render_prompt(
+        messages = self.prompt_manager.render_prompt(
             self.answer_prompt,
             self.get_system_prompt_variables(overrides.get("prompt_template"))
             | {"user_query": q, "text_sources": text_sources},
@@ -104,7 +102,7 @@
         chat_completion = await self.openai_client.chat.completions.create(
             # Azure OpenAI takes the deployment name as the model name
             model=self.chatgpt_deployment if self.chatgpt_deployment else self.chatgpt_model,
-            messages=rendered_answer_prompt.all_messages,
+            messages=messages,
             temperature=overrides.get("temperature", 0.3),
             max_tokens=1024,
             n=1,
@@ -133,7 +131,7 @@
             ),
             ThoughtStep(
                 "Prompt to generate answer",
-                rendered_answer_prompt.all_messages,
+                messages,
                 (
                     {"model": self.chatgpt_model, "deployment": self.chatgpt_deployment}
                     if self.chatgpt_deployment

Diff for: app/backend/approaches/retrievethenreadvision.py

+3 −5

@@ -6,7 +6,6 @@
 from openai.types.chat import (
     ChatCompletionMessageParam,
 )
-from openai_messages_token_helper import get_token_limit
 
 from approaches.approach import Approach, ThoughtStep
 from approaches.promptmanager import PromptManager
@@ -56,7 +55,6 @@ def __init__(
         self.query_speller = query_speller
         self.vision_endpoint = vision_endpoint
         self.vision_token_provider = vision_token_provider
-        self.gpt4v_token_limit = get_token_limit(gpt4v_model, self.ALLOW_NON_GPT_MODELS)
         self.prompt_manager = prompt_manager
         self.answer_prompt = self.prompt_manager.load_prompt("ask_answer_question_vision.prompty")
 
@@ -123,15 +121,15 @@
             if url:
                 image_sources.append(url)
 
-        rendered_answer_prompt = self.prompt_manager.render_prompt(
+        messages = self.prompt_manager.render_prompt(
             self.answer_prompt,
             self.get_system_prompt_variables(overrides.get("prompt_template"))
             | {"user_query": q, "text_sources": text_sources, "image_sources": image_sources},
         )
 
         chat_completion = await self.openai_client.chat.completions.create(
             model=self.gpt4v_deployment if self.gpt4v_deployment else self.gpt4v_model,
-            messages=rendered_answer_prompt.all_messages,
+            messages=messages,
             temperature=overrides.get("temperature", 0.3),
             max_tokens=1024,
             n=1,
@@ -161,7 +159,7 @@
             ),
             ThoughtStep(
                 "Prompt to generate answer",
-                rendered_answer_prompt.all_messages,
+                messages,
                 (
                     {"model": self.gpt4v_model, "deployment": self.gpt4v_deployment}
                     if self.gpt4v_deployment
