Upgrade to text-embedding-3-large model as default, with vector storage optimizations #2470

Open · wants to merge 14 commits into main
2 changes: 2 additions & 0 deletions .azdo/pipelines/azure-dev.yml
@@ -60,6 +60,8 @@ steps:
AZURE_SEARCH_QUERY_SPELLER: $(AZURE_SEARCH_QUERY_SPELLER)
AZURE_SEARCH_SEMANTIC_RANKER: $(AZURE_SEARCH_SEMANTIC_RANKER)
AZURE_SEARCH_QUERY_REWRITING: $(AZURE_SEARCH_QUERY_REWRITING)
AZURE_SEARCH_FIELD_NAME_EMBEDDING: $(AZURE_SEARCH_FIELD_NAME_EMBEDDING)
AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING: $(AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING)
AZURE_STORAGE_ACCOUNT: $(AZURE_STORAGE_ACCOUNT)
AZURE_STORAGE_RESOURCE_GROUP: $(AZURE_STORAGE_RESOURCE_GROUP)
AZURE_STORAGE_SKU: $(AZURE_STORAGE_SKU)
2 changes: 2 additions & 0 deletions .github/workflows/azure-dev.yml
@@ -50,6 +50,8 @@ jobs:
AZURE_SEARCH_QUERY_SPELLER: ${{ vars.AZURE_SEARCH_QUERY_SPELLER }}
AZURE_SEARCH_SEMANTIC_RANKER: ${{ vars.AZURE_SEARCH_SEMANTIC_RANKER }}
AZURE_SEARCH_QUERY_REWRITING: ${{ vars.AZURE_SEARCH_QUERY_REWRITING }}
AZURE_SEARCH_FIELD_NAME_EMBEDDING: ${{ vars.AZURE_SEARCH_FIELD_NAME_EMBEDDING }}
AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING: ${{ vars.AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING }}
AZURE_STORAGE_ACCOUNT: ${{ vars.AZURE_STORAGE_ACCOUNT }}
AZURE_STORAGE_RESOURCE_GROUP: ${{ vars.AZURE_STORAGE_RESOURCE_GROUP }}
AZURE_STORAGE_SKU: ${{ vars.AZURE_STORAGE_SKU }}
13 changes: 12 additions & 1 deletion app/backend/app.py
@@ -463,6 +463,9 @@ async def setup_clients():
AZURE_SEARCH_QUERY_SPELLER = os.getenv("AZURE_SEARCH_QUERY_SPELLER") or "lexicon"
AZURE_SEARCH_SEMANTIC_RANKER = os.getenv("AZURE_SEARCH_SEMANTIC_RANKER", "free").lower()
AZURE_SEARCH_QUERY_REWRITING = os.getenv("AZURE_SEARCH_QUERY_REWRITING", "false").lower()
# This defaults to the previous field name "embedding", for backwards compatibility
AZURE_SEARCH_FIELD_NAME_EMBEDDING = os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding")
AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING = os.getenv("AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING", "imageEmbedding")

AZURE_SPEECH_SERVICE_ID = os.getenv("AZURE_SPEECH_SERVICE_ID")
AZURE_SPEECH_SERVICE_LOCATION = os.getenv("AZURE_SPEECH_SERVICE_LOCATION")
@@ -579,7 +582,11 @@ async def setup_clients():
disable_vectors=os.getenv("USE_VECTORS", "").lower() == "false",
)
ingester = UploadUserFileStrategy(
search_info=search_info, embeddings=text_embeddings_service, file_processors=file_processors
search_info=search_info,
embeddings=text_embeddings_service,
file_processors=file_processors,
search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
search_field_name_image_embedding=AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING,
)
current_app.config[CONFIG_INGESTER] = ingester

@@ -676,6 +683,7 @@ async def setup_clients():
embedding_model=OPENAI_EMB_MODEL,
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
sourcepage_field=KB_FIELDS_SOURCEPAGE,
content_field=KB_FIELDS_CONTENT,
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
@@ -694,6 +702,7 @@ async def setup_clients():
embedding_model=OPENAI_EMB_MODEL,
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
sourcepage_field=KB_FIELDS_SOURCEPAGE,
content_field=KB_FIELDS_CONTENT,
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
@@ -733,6 +742,7 @@ async def setup_clients():
embedding_model=OPENAI_EMB_MODEL,
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
sourcepage_field=KB_FIELDS_SOURCEPAGE,
content_field=KB_FIELDS_CONTENT,
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
@@ -754,6 +764,7 @@ async def setup_clients():
embedding_model=OPENAI_EMB_MODEL,
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
sourcepage_field=KB_FIELDS_SOURCEPAGE,
content_field=KB_FIELDS_CONTENT,
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
11 changes: 8 additions & 3 deletions app/backend/approaches/approach.py
@@ -53,9 +53,10 @@ class Document:
reranker_score: Optional[float] = None

def serialize_for_results(self) -> dict[str, Any]:
return {
result_dict = {
"id": self.id,
"content": self.content,
# Should we rename to its actual field name in the index?
Collaborator Author: Still an open question whether we should send this down to the client under the actual field name from the index.
"embedding": Document.trim_embedding(self.embedding),
"imageEmbedding": Document.trim_embedding(self.image_embedding),
"category": self.category,
@@ -78,6 +79,7 @@ def serialize_for_results(self) -> dict[str, Any]:
"score": self.score,
"reranker_score": self.reranker_score,
}
return result_dict

@classmethod
def trim_embedding(cls, embedding: Optional[List[float]]) -> Optional[str]:
@@ -162,6 +164,7 @@ def __init__(
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_model: str,
embedding_dimensions: int,
embedding_field: str,
openai_host: str,
vision_endpoint: str,
vision_token_provider: Callable[[], Awaitable[str]],
@@ -176,6 +179,7 @@ def __init__(
self.embedding_deployment = embedding_deployment
self.embedding_model = embedding_model
self.embedding_dimensions = embedding_dimensions
self.embedding_field = embedding_field
self.openai_host = openai_host
self.vision_endpoint = vision_endpoint
self.vision_token_provider = vision_token_provider
@@ -241,7 +245,7 @@ async def search(
Document(
id=document.get("id"),
content=document.get("content"),
embedding=document.get("embedding"),
embedding=document.get(self.embedding_field),
image_embedding=document.get("imageEmbedding"),
category=document.get("category"),
sourcepage=document.get("sourcepage"),
@@ -317,7 +321,8 @@ class ExtraArgs(TypedDict, total=False):
**dimensions_args,
)
query_vector = embedding.data[0].embedding
return VectorizedQuery(vector=query_vector, k_nearest_neighbors=50, fields="embedding")
# TODO: use optimizations from RAG Time Journey 3
return VectorizedQuery(vector=query_vector, k_nearest_neighbors=50, fields=self.embedding_field)

async def compute_image_embedding(self, q: str):
endpoint = urljoin(self.vision_endpoint, "computervision/retrieval:vectorizeText")
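For context on the query-side change above, here is a minimal sketch of how the configurable field name reaches Azure AI Search, using the same VectorizedQuery API that compute_text_embedding now calls with self.embedding_field. The standalone helper, the top value, and the client wiring are illustrative assumptions, not code from this PR.

from azure.search.documents.aio import SearchClient
from azure.search.documents.models import VectorizedQuery


async def vector_search(search_client: SearchClient, query_vector: list[float], embedding_field: str):
    # Target whichever vector field the index was provisioned with,
    # e.g. the legacy "embedding" or a renamed field; the name is now configurable.
    vector_query = VectorizedQuery(
        vector=query_vector,
        k_nearest_neighbors=50,
        fields=embedding_field,
    )
    results = await search_client.search(search_text=None, vector_queries=[vector_query], top=3)
    return [doc async for doc in results]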
2 changes: 2 additions & 0 deletions app/backend/approaches/chatreadretrieveread.py
@@ -34,6 +34,7 @@ def __init__(
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_model: str,
embedding_dimensions: int,
embedding_field: str,
sourcepage_field: str,
content_field: str,
query_language: str,
@@ -49,6 +50,7 @@ def __init__(
self.embedding_deployment = embedding_deployment
self.embedding_model = embedding_model
self.embedding_dimensions = embedding_dimensions
self.embedding_field = embedding_field
self.sourcepage_field = sourcepage_field
self.content_field = content_field
self.query_language = query_language
10 changes: 6 additions & 4 deletions app/backend/approaches/chatreadretrievereadvision.py
@@ -38,6 +38,7 @@ def __init__(
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_model: str,
embedding_dimensions: int,
embedding_field: str,
sourcepage_field: str,
content_field: str,
query_language: str,
@@ -57,6 +58,7 @@ def __init__(
self.embedding_deployment = embedding_deployment
self.embedding_model = embedding_model
self.embedding_dimensions = embedding_dimensions
self.embedding_field = embedding_field
self.sourcepage_field = sourcepage_field
self.content_field = content_field
self.query_language = query_language
@@ -88,7 +90,7 @@ async def run_until_final_call(
minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0)
filter = self.build_filter(overrides, auth_claims)

vector_fields = overrides.get("vector_fields", ["embedding"])
vector_fields = overrides.get("vector_fields", [self.embedding_field])
send_text_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "texts", None]
send_images_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "images", None]

@@ -123,9 +125,9 @@
if use_vector_search:
for field in vector_fields:
vector = (
await self.compute_text_embedding(query_text)
if field == "embedding"
else await self.compute_image_embedding(query_text)
await self.compute_image_embedding(query_text)
if field.startswith("image")
Collaborator Author: This was a bit tricky, feels a bit code-smelly.
else await self.compute_text_embedding(query_text)
)
vectors.append(vector)

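On the code-smell concern raised above: one possible alternative to the field.startswith("image") check is to dispatch on the configured field names themselves. This is only a sketch and assumes the vision approaches are also handed the configured image field name; the helper below is not part of this PR.

from typing import Awaitable, Callable

from azure.search.documents.models import VectorQuery


async def build_vector_queries(
    query_text: str,
    vector_fields: list[str],
    text_embedder: Callable[[str], Awaitable[VectorQuery]],
    image_embedder: Callable[[str], Awaitable[VectorQuery]],
    image_field_name: str,
) -> list[VectorQuery]:
    # Pick the embedder per field by exact name rather than by string prefix.
    queries: list[VectorQuery] = []
    for field in vector_fields:
        embedder = image_embedder if field == image_field_name else text_embedder
        queries.append(await embedder(query_text))
    return queries

The approaches could then call this with self.compute_text_embedding, self.compute_image_embedding, and the configured image field name, keeping the naming convention out of the retrieval loop.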
2 changes: 2 additions & 0 deletions app/backend/approaches/retrievethenread.py
@@ -28,6 +28,7 @@ def __init__(
embedding_model: str,
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_dimensions: int,
embedding_field: str,
sourcepage_field: str,
content_field: str,
query_language: str,
@@ -44,6 +45,7 @@ def __init__(
self.embedding_dimensions = embedding_dimensions
self.chatgpt_deployment = chatgpt_deployment
self.embedding_deployment = embedding_deployment
self.embedding_field = embedding_field
self.sourcepage_field = sourcepage_field
self.content_field = content_field
self.query_language = query_language
10 changes: 6 additions & 4 deletions app/backend/approaches/retrievethenreadvision.py
@@ -32,6 +32,7 @@ def __init__(
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_model: str,
embedding_dimensions: int,
embedding_field: str,
sourcepage_field: str,
content_field: str,
query_language: str,
@@ -47,6 +48,7 @@ def __init__(
self.embedding_model = embedding_model
self.embedding_deployment = embedding_deployment
self.embedding_dimensions = embedding_dimensions
self.embedding_field = embedding_field
self.sourcepage_field = sourcepage_field
self.content_field = content_field
self.gpt4v_deployment = gpt4v_deployment
@@ -83,7 +85,7 @@ async def run(
minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0)
filter = self.build_filter(overrides, auth_claims)

vector_fields = overrides.get("vector_fields", ["embedding"])
vector_fields = overrides.get("vector_fields", [self.embedding_field])
Collaborator Author: Similar here; we need to decide whether the frontend should render the actual field names.
send_text_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "texts", None]
send_images_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "images", None]

@@ -92,9 +94,9 @@ async def run(
if use_vector_search:
for field in vector_fields:
vector = (
await self.compute_text_embedding(q)
if field == "embedding"
else await self.compute_image_embedding(q)
await self.compute_image_embedding(q)
if field.startswith("image")
Collaborator Author: Same logic here.
else await self.compute_text_embedding(q)
)
vectors.append(vector)

5 changes: 5 additions & 0 deletions app/backend/prepdocs.py
@@ -398,6 +398,8 @@ async def main(strategy: Strategy, setup_index: bool = True):
blob_manager=blob_manager,
document_action=document_action,
embeddings=openai_embeddings_service,
search_field_name_embedding=os.environ["AZURE_SEARCH_FIELD_NAME_EMBEDDING"],
search_field_name_image_embedding=os.environ["AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING"],
subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
search_service_user_assigned_id=args.searchserviceassignedid,
search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
@@ -430,6 +432,9 @@ async def main(strategy: Strategy, setup_index: bool = True):
embeddings=openai_embeddings_service,
image_embeddings=image_embeddings_service,
search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
# Default to the previous field names for backward compatibility
search_field_name_embedding=os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding"),
search_field_name_image_embedding=os.getenv("AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING", "imageEmbedding"),
use_acls=use_acls,
category=args.category,
use_content_understanding=use_content_understanding,
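Worth noting: the two call sites above resolve these variables differently. The first uses os.environ[...] and will raise KeyError when the field-name variables are unset, while the second falls back to the legacy names. If the strict behavior is not intentional, a small shared helper could keep both paths backward compatible; this is a suggestion, not something the PR contains.

import os


def resolve_embedding_field_names() -> tuple[str, str]:
    # Fall back to the pre-existing field names so existing indexes keep working.
    text_field = os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding")
    image_field = os.getenv("AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING", "imageEmbedding")
    return text_field, image_field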
40 changes: 30 additions & 10 deletions app/backend/prepdocslib/filestrategy.py
@@ -51,6 +51,8 @@ def __init__(
embeddings: Optional[OpenAIEmbeddings] = None,
image_embeddings: Optional[ImageEmbeddings] = None,
search_analyzer_name: Optional[str] = None,
search_field_name_embedding: Optional[str] = None,
search_field_name_image_embedding: Optional[str] = None,
use_acls: bool = False,
category: Optional[str] = None,
use_content_understanding: bool = False,
@@ -63,22 +65,29 @@ def __init__(
self.embeddings = embeddings
self.image_embeddings = image_embeddings
self.search_analyzer_name = search_analyzer_name
self.search_field_name_embedding = search_field_name_embedding
self.search_field_name_image_embedding = search_field_name_image_embedding
self.search_info = search_info
self.use_acls = use_acls
self.category = category
self.use_content_understanding = use_content_understanding
self.content_understanding_endpoint = content_understanding_endpoint

async def setup(self):
search_manager = SearchManager(
def setup_search_manager(self):
self.search_manager = SearchManager(
self.search_info,
self.search_analyzer_name,
self.use_acls,
False,
self.embeddings,
field_name_embedding=self.search_field_name_embedding,
field_name_image_embedding=self.search_field_name_image_embedding,
search_images=self.image_embeddings is not None,
)
await search_manager.create_index()

async def setup(self):
self.setup_search_manager()
await self.search_manager.create_index()

if self.use_content_understanding:
if self.content_understanding_endpoint is None:
@@ -91,9 +100,7 @@ async def setup(self):
await cu_manager.create_analyzer()

async def run(self):
search_manager = SearchManager(
self.search_info, self.search_analyzer_name, self.use_acls, False, self.embeddings
)
self.setup_search_manager()
if self.document_action == DocumentAction.Add:
files = self.list_file_strategy.list()
async for file in files:
@@ -104,18 +111,18 @@ async def run(self):
blob_image_embeddings: Optional[List[List[float]]] = None
if self.image_embeddings and blob_sas_uris:
blob_image_embeddings = await self.image_embeddings.create_embeddings(blob_sas_uris)
await search_manager.update_content(sections, blob_image_embeddings, url=file.url)
await self.search_manager.update_content(sections, blob_image_embeddings, url=file.url)
finally:
if file:
file.close()
elif self.document_action == DocumentAction.Remove:
paths = self.list_file_strategy.list_paths()
async for path in paths:
await self.blob_manager.remove_blob(path)
await search_manager.remove_content(path)
await self.search_manager.remove_content(path)
elif self.document_action == DocumentAction.RemoveAll:
await self.blob_manager.remove_blob()
await search_manager.remove_content()
await self.search_manager.remove_content()


class UploadUserFileStrategy:
@@ -129,12 +136,25 @@ def __init__(
file_processors: dict[str, FileProcessor],
embeddings: Optional[OpenAIEmbeddings] = None,
image_embeddings: Optional[ImageEmbeddings] = None,
search_field_name_embedding: Optional[str] = None,
search_field_name_image_embedding: Optional[str] = None,
):
self.file_processors = file_processors
self.embeddings = embeddings
self.image_embeddings = image_embeddings
self.search_info = search_info
self.search_manager = SearchManager(self.search_info, None, True, False, self.embeddings)
self.search_manager = SearchManager(
search_info=self.search_info,
search_analyzer_name=None,
use_acls=True,
use_int_vectorization=False,
embeddings=self.embeddings,
field_name_embedding=search_field_name_embedding,
field_name_image_embedding=search_field_name_image_embedding,
search_images=False,
)
self.search_field_name_embedding = search_field_name_embedding
self.search_field_name_image_embedding = search_field_name_image_embedding

async def add_file(self, file: File):
if self.image_embeddings:
Expand Down