diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_build_mlindex.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_build_mlindex.py
index 5e8036a081ed..1db912ebbf69 100644
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_build_mlindex.py
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_build_mlindex.py
@@ -38,8 +38,8 @@ def build_index(
         from azure.ai.generative.index._documents import DocumentChunksIterator, split_documents
         from azure.ai.generative.index._embeddings import EmbeddingsContainer
         from azure.ai.generative.index._tasks.update_acs import create_index_from_raw_embeddings
-        from azure.ai.generative.index._utils.connections import get_connection_by_id_v2
         from azure.ai.generative.index._utils.logging import disable_mlflow
+        from azure.ai.resources._index._utils.connections import get_connection_by_id_v2
     except ImportError as e:
         print("In order to use build_index to build an Index locally, you must have azure-ai-generative[index] installed")
         raise e
@@ -175,7 +175,7 @@ def _create_mlindex_from_existing_acs(
 ) -> Index:
     try:
         from azure.ai.generative.index._embeddings import EmbeddingsContainer
-        from azure.ai.generative.index._utils.connections import get_connection_by_id_v2
+        from azure.ai.resources._index._utils.connections import get_connection_by_id_v2
     except ImportError as e:
         print("In order to use build_index to build an Index locally, you must have azure-ai-generative[index] installed")
         raise e
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/__init__.py
deleted file mode 100644
index 5328bdc55b18..000000000000
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-"""DataIndex configuration and operations."""
-
-__path__ = __import__("pkgutil").extend_path(__path__, __name__)
-
-from azure.ai.generative.index._dataindex.entities import Data, CitationRegex, DataIndex, Embedding, IndexSource, IndexStore, index_data
-from azure.ai.generative.index._dataindex.operations import DataOperations
-
-__all__ = [
-    "DataOperations",
-    "DataIndex",
-    "IndexSource",
-    "Data",
-    "CitationRegex",
-    "Embedding",
-    "IndexStore",
-    "index_data",
-]
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/__init__.py
deleted file mode 100644
index 624f5ee88ecf..000000000000
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
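# A minimal sketch (not part of this change) of how a caller could guard the
# relocated import above; the try/except fallback is my own assumption for
# installs that only ship the legacy azure-ai-generative location:
try:
    from azure.ai.resources._index._utils.connections import get_connection_by_id_v2  # new home
except ImportError:
    from azure.ai.generative.index._utils.connections import get_connection_by_id_v2  # legacy home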
-# ---------------------------------------------------------
-
-
-__path__ = __import__("pkgutil").extend_path(__path__, __name__)
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/_data_index/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/_data_index/__init__.py
deleted file mode 100644
index 14f3af959399..000000000000
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/_data_index/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-
-__path__ = __import__("pkgutil").extend_path(__path__, __name__)
-
-from .data_index import (
-    CitationRegexSchema,
-    DataIndexSchema,
-    DataIndexTypes,
-    EmbeddingSchema,
-    IndexSourceSchema,
-    IndexStoreSchema,
-)
-
-__all__ = [
-    "DataIndexSchema",
-    "IndexSourceSchema",
-    "CitationRegexSchema",
-    "EmbeddingSchema",
-    "IndexStoreSchema",
-    "DataIndexTypes",
-]
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/_data_index/data_index.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/_data_index/data_index.py
deleted file mode 100644
index bfe70f705f37..000000000000
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/_data_index/data_index.py
+++ /dev/null
@@ -1,226 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-
-# pylint: disable=unused-argument
-
-from marshmallow import fields, post_load
-
-from azure.ai.ml._schema.assets.data import DataSchema
-from azure.ai.ml._schema.core.fields import ArmVersionedStr, LocalPathField, NestedField, StringTransformedEnum, UnionField
-from azure.ai.ml._schema.core.schema import PatchedSchemaMeta
-from azure.ai.ml._schema.job.input_output_entry import generate_datastore_property
-from azure.ai.ml._utils._experimental import experimental
-from azure.ai.ml.constants._common import AssetTypes, AzureMLResourceType, InputOutputModes
-
-
-# FROM: azure.ai.ml._schema.job.input_output_entry
-def generate_path_property(azureml_type, **kwargs):
-    return UnionField(
-        [
-            ArmVersionedStr(azureml_type=azureml_type),
-            fields.Str(metadata={"pattern": r"^(http(s)?):.*"}),
-            fields.Str(metadata={"pattern": r"^(wasb(s)?):.*"}),
-            LocalPathField(pattern=r"^file:.*"),
-            LocalPathField(
-                pattern=r"^(?!(azureml|http(s)?|wasb(s)?|file):).*",
-            ),
-        ],
-        is_strict=True,
-        **kwargs,
-    )
-
-
-class DataIndexTypes:
-    """DataIndexTypes is an enumeration of values for the types of indexes which can be written to by DataIndex."""
-
-    ACS = "acs"
-    """Azure Cognitive Search index type."""
-    FAISS = "faiss"
-    """Faiss index type."""
-
-
-class CitationRegexSchema(metaclass=PatchedSchemaMeta):
-    match_pattern = fields.Str(
-        required=True,
-        metadata={"description": r"Regex to match citation in the citation_url + input file path. e.g. '(.*)/articles/(.*)(\.[^.]+)$'"},
-    )
-    replacement_pattern = fields.Str(
-        required=True,
-        metadata={"description": "Replacement string for citation. e.g. '\\1/\\2'"},
-    )
-
-    @post_load
-    def make(self, data, **kwargs):
-        from azure.ai.generative.index._dataindex.entities.data_index import CitationRegex
-
-        return CitationRegex(**data)
-
-
-class InputDataSchema(metaclass=PatchedSchemaMeta):
-    mode = StringTransformedEnum(
-        allowed_values=[
-            InputOutputModes.RO_MOUNT,
-            InputOutputModes.RW_MOUNT,
-            InputOutputModes.DOWNLOAD,
-        ],
-        required=False,
-    )
-    type = StringTransformedEnum(
-        allowed_values=[
-            AssetTypes.URI_FILE,
-            AssetTypes.URI_FOLDER,
-        ]
-    )
-    path = generate_path_property(azureml_type=AzureMLResourceType.DATA)
-    datastore = generate_datastore_property()
-
-    @post_load
-    def make(self, data, **kwargs):
-        from azure.ai.ml.entities import Data
-
-        return Data(**data)
-
-
-class InputMLTableSchema(metaclass=PatchedSchemaMeta):
-    mode = StringTransformedEnum(
-        allowed_values=[
-            InputOutputModes.EVAL_MOUNT,
-            InputOutputModes.EVAL_DOWNLOAD,
-        ],
-        required=False,
-    )
-    type = StringTransformedEnum(allowed_values=[AssetTypes.MLTABLE])
-    path = generate_path_property(azureml_type=AzureMLResourceType.DATA)
-    datastore = generate_datastore_property()
-
-    @post_load
-    def make(self, data, **kwargs):
-        from azure.ai.ml.entities import Data
-
-        return Data(**data)
-
-
-class IndexSourceSchema(metaclass=PatchedSchemaMeta):
-    input_data = UnionField(
-        [NestedField(InputDataSchema), NestedField(InputMLTableSchema)],
-        required=True,
-        allow_none=False,
-        metadata={"description": "Input Data to index files from. MLTable type inputs will use `mode: eval_mount`."},
-    )
-    input_glob = fields.Str(
-        required=False,
-        metadata={
-            "description": "Glob pattern to filter files from input_data. If not specified, all files will be indexed."
-        },
-    )
-    chunk_size = fields.Int(
-        required=False,
-        allow_none=False,
-        metadata={"description": "Maximum number of tokens to put in each chunk."},
-    )
-    chunk_overlap = fields.Int(
-        required=False,
-        allow_none=False,
-        metadata={"description": "Number of tokens to overlap between chunks."},
-    )
-    citation_url = fields.Str(
-        required=False,
-        metadata={"description": "Base URL to join with file paths to create full source file URL for chunk metadata."},
-    )
-    citation_url_replacement_regex = NestedField(
-        CitationRegexSchema,
-        required=False,
-        metadata={
-            "description": "Regex match and replacement patterns for citation url. Useful if the paths in `input_data` "
-            "don't match the desired citation format."
-        },
-    )
-
-    @post_load
-    def make(self, data, **kwargs):
-        from azure.ai.generative.index._dataindex.entities.data_index import IndexSource
-
-        return IndexSource(**data)
-
-
-class EmbeddingSchema(metaclass=PatchedSchemaMeta):
-    model = fields.Str(
-        required=True,
-        allow_none=False,
-        metadata={
-            "description": "The model to use to embed data. E.g. 'hugging_face://model/sentence-transformers/"
-            "all-mpnet-base-v2' or 'azure_open_ai://deployment/{{deployment_name}}/model/{{model_name}}'"
-        },
-    )
-    connection = fields.Str(
-        required=False,
-        metadata={
-            "description": "Connection reference to use for embedding model information, "
-            "only needed for hosted embeddings models (such as Azure OpenAI)."
-        },
-    )
-    cache_path = generate_path_property(
-        azureml_type=AzureMLResourceType.DATASTORE,
-        required=False,
-        metadata={
-            "description": "Folder containing previously generated embeddings. "
-            "Should be parent folder of the 'embeddings' output path used for this component. "
" - "Will compare input data to existing embeddings and only embed changed/new data, " - "reusing existing chunks." - }, - ) - - @post_load - def make(self, data, **kwargs): - from azure.ai.generative.index._dataindex.entities.data_index import Embedding - - return Embedding(**data) - - -class IndexStoreSchema(metaclass=PatchedSchemaMeta): - type = StringTransformedEnum( - allowed_values=[ - DataIndexTypes.ACS, - DataIndexTypes.FAISS, - ], - metadata={"description": "The type of index to write to. Currently supported types are 'acs' and 'faiss'."}, - ) - name = fields.Str( - required=False, - metadata={"description": "Name of the index to write to. If not specified, a name will be generated."}, - ) - connection = fields.Str( - required=False, - metadata={ - "description": "Connection reference to use for index information, " - "only needed for hosted indexes (such as Azure Cognitive Search)." - }, - ) - config = fields.Dict( - required=False, - metadata={ - "description": "Configuration for the index. Primary use is to configure Azure Cognitive Search specific settings." - "Such as custom `field_mapping` for known field types." - } - ) - - @post_load - def make(self, data, **kwargs): - from azure.ai.generative.index._dataindex.entities.data_index import IndexStore - - return IndexStore(**data) - - -@experimental -class DataIndexSchema(DataSchema): - source = NestedField(IndexSourceSchema, required=True, allow_none=False) - embedding = NestedField(EmbeddingSchema, required=True, allow_none=False) - index = NestedField(IndexStoreSchema, required=True, allow_none=False) - incremental_update = fields.Bool() - - @post_load - def make(self, data, **kwargs): - from azure.ai.generative.index._dataindex.entities.data_index import DataIndex - - return DataIndex(**data) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/__init__.py deleted file mode 100644 index 624f5ee88ecf..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- - - -__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/_component.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/_component.py deleted file mode 100644 index 1c927e0b8972..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/_component.py +++ /dev/null @@ -1,26 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. 
-# ---------------------------------------------------------
-
-class DataIndexComponentUri(object):
-    DATA_INDEX_COG_SEARCH = "azureml://registries/azureml/components/llm_ingest_dataset_to_acs_basic/labels/default"
-    DATA_INDEX_FAISS = "azureml://registries/azureml/components/llm_ingest_dataset_to_faiss_basic/labels/default"
-
-    @staticmethod
-    def with_registry(component_uri: str, registry_name: str) -> str:
-        return component_uri.replace("azureml://registries/azureml", f"azureml://registries/{registry_name}")
-
-
-class LLMRAGComponentUri(object):
-    LLM_RAG_CRACK_AND_CHUNK = "azureml://registries/azureml/components/llm_rag_crack_and_chunk/labels/default"
-    LLM_RAG_GENERATE_EMBEDDINGS = "azureml://registries/azureml/components/llm_rag_generate_embeddings/labels/default"
-    LLM_RAG_CRACK_AND_CHUNK_AND_EMBED = (
-        "azureml://registries/azureml/components/llm_rag_crack_and_chunk_and_embed/labels/default"
-    )
-    LLM_RAG_UPDATE_ACS_INDEX = "azureml://registries/azureml/components/llm_rag_update_acs_index/labels/default"
-    LLM_RAG_CREATE_FAISS_INDEX = "azureml://registries/azureml/components/llm_rag_create_faiss_index/labels/default"
-    LLM_RAG_REGISTER_MLINDEX_ASSET = (
-        "azureml://registries/azureml/components/llm_rag_register_mlindex_asset/labels/default"
-    )
-    LLM_RAG_VALIDATE_DEPLOYMENTS = "azureml://registries/azureml/components/llm_rag_validate_deployments/labels/default"
-    LLM_RAG_CREATE_PROMPTFLOW = "azureml://registries/azureml/components/llm_rag_create_promptflow/labels/default"
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/data_index/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/data_index/__init__.py
deleted file mode 100644
index 37ec50189954..000000000000
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/data_index/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-"""DataIndex configuration and operations."""
-
-from azure.ai.generative.index._dataindex.data_index.models import build_model_protocol
-from azure.ai.generative.index._dataindex.entities.data_index import CitationRegex, Data, DataIndex, Embedding, IndexSource, IndexStore
-from azure.ai.generative.index._dataindex.entities._builders.data_index_func import index_data
-
-__all__ = [
-    "DataIndex",
-    "IndexSource",
-    "Data",
-    "CitationRegex",
-    "Embedding",
-    "IndexStore",
-    "index_data",
-    "build_model_protocol",
-]
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/data_index/models.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/data_index/models.py
deleted file mode 100644
index 22521bf94d4b..000000000000
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/data_index/models.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
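# Usage sketch for DataIndexComponentUri.with_registry above (the registry name
# is hypothetical); it simply rebases a component URI from the default
# "azureml" registry onto another one:
uri = DataIndexComponentUri.with_registry(DataIndexComponentUri.DATA_INDEX_FAISS, "myregistry")
# -> "azureml://registries/myregistry/components/llm_ingest_dataset_to_faiss_basic/labels/default"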
-# ---------------------------------------------------------
-"""DataIndex embedding model helpers."""
-import re
-from typing import Optional
-
-OPEN_AI_PROTOCOL_TEMPLATE = "azure_open_ai://deployment/{}/model/{}"
-OPEN_AI_PROTOCOL_REGEX_PATTERN = OPEN_AI_PROTOCOL_TEMPLATE.format(".*", ".*")
-OPEN_AI_SHORT_FORM_PROTOCOL_TEMPLATE = "azure_open_ai://deployments?/{}"
-OPEN_AI_SHORT_FORM_PROTOCOL_REGEX_PATTERN = OPEN_AI_SHORT_FORM_PROTOCOL_TEMPLATE.format(".*")
-
-HUGGINGFACE_PROTOCOL_TEMPLATE = "hugging_face://model/{}"
-HUGGINGFACE_PROTOCOL_REGEX_PATTERN = HUGGINGFACE_PROTOCOL_TEMPLATE.format(".*")
-
-
-def build_model_protocol(model: Optional[str] = None):
-    """Build a model protocol from user input."""
-    if not model or re.match(OPEN_AI_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
-        return model
-    if re.match(OPEN_AI_SHORT_FORM_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
-        return model
-    if re.match(HUGGINGFACE_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
-        return model
-
-    return OPEN_AI_PROTOCOL_TEMPLATE.format(model, model)
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/dsl/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/dsl/__init__.py
deleted file mode 100644
index 624f5ee88ecf..000000000000
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/dsl/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-
-
-__path__ = __import__("pkgutil").extend_path(__path__, __name__)
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/dsl/_pipeline_decorator.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/dsl/_pipeline_decorator.py
deleted file mode 100644
index 033d63b445df..000000000000
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/dsl/_pipeline_decorator.py
+++ /dev/null
@@ -1,259 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
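# Behavior sketch for build_model_protocol above (model names are illustrative):
build_model_protocol("text-embedding-ada-002")
# -> "azure_open_ai://deployment/text-embedding-ada-002/model/text-embedding-ada-002"
build_model_protocol("hugging_face://model/sentence-transformers/all-mpnet-base-v2")
# -> returned unchanged; already-qualified protocol URIs pass through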
-# ---------------------------------------------------------
-
-# pylint: disable=protected-access
-
-import inspect
-import logging
-from collections import OrderedDict
-from functools import wraps
-from inspect import Parameter, signature
-from pathlib import Path
-from typing import Callable, Dict, List, Optional, TypeVar, Union, overload
-
-from typing_extensions import ParamSpec
-
-from azure.ai.ml._utils.utils import is_private_preview_enabled
-from azure.ai.ml.entities import Data, Model, PipelineJob, PipelineJobSettings
-from azure.ai.ml.entities._builders.pipeline import Pipeline
-from azure.ai.ml.entities._inputs_outputs import Input, is_group
-from azure.ai.ml.entities._job.pipeline._io import NodeOutput, PipelineInput, _GroupAttrDict
-from azure.ai.ml.entities._job.pipeline._pipeline_expression import PipelineExpression
-from azure.ai.ml.exceptions import (
-    MultipleValueError,
-    ParamValueNotExistsError,
-    TooManyPositionalArgsError,
-    UnexpectedKeywordError,
-    UnsupportedParameterKindError,
-    UserErrorException,
-)
-
-from azure.ai.ml.entities._builders import BaseNode
-from azure.ai.ml.dsl._pipeline_component_builder import PipelineComponentBuilder, _is_inside_dsl_pipeline_func
-from azure.ai.ml.dsl._pipeline_decorator import _validate_args
-from azure.ai.ml.dsl._settings import _dsl_settings_stack
-from azure.ai.ml.dsl._utils import _resolve_source_file
-
-SUPPORTED_INPUT_TYPES = (
-    PipelineInput,
-    NodeOutput,
-    Input,
-    Model,
-    Data,  # For the case use a Data object as an input, we will convert it to Input object
-    Pipeline,  # For the case use a pipeline node as the input, we use its only one output as the real input.
-    str,
-    bool,
-    int,
-    float,
-    PipelineExpression,
-    _GroupAttrDict,
-)
-module_logger = logging.getLogger(__name__)
-
-T = TypeVar("T")
-P = ParamSpec("P")
-
-
-# Overload that returns a decorator when func is None
-@overload
-def pipeline(  # type: ignore[misc]
-    # TODO: Bug 2876412
-    func: None = None,
-    *,
-    name: Optional[str] = None,
-    version: Optional[str] = None,
-    display_name: Optional[str] = None,
-    description: Optional[str] = None,
-    experiment_name: Optional[str] = None,
-    tags: Optional[Dict[str, str]] = None,
-    **kwargs,
-) -> Callable[[Callable[P, T]], Callable[P, PipelineJob]]:
-    ...
-
-
-# Overload that returns a decorated function when func isn't None
-@overload
-def pipeline(
-    func: Optional[Callable[P, T]] = None,
-    *,
-    name: Optional[str] = None,
-    version: Optional[str] = None,
-    display_name: Optional[str] = None,
-    description: Optional[str] = None,
-    experiment_name: Optional[str] = None,
-    tags: Optional[Dict[str, str]] = None,
-    **kwargs,
-) -> Callable[P, PipelineJob]:
-    ...
-
-
-def pipeline(
-    func: Optional[Callable[P, T]] = None,
-    *,
-    name: Optional[str] = None,
-    version: Optional[str] = None,
-    display_name: Optional[str] = None,
-    description: Optional[str] = None,
-    experiment_name: Optional[str] = None,
-    tags: Optional[Dict[str, str]] = None,
-    **kwargs,
-) -> Union[Callable[[Callable[P, T]], Callable[P, PipelineJob]], Callable[P, PipelineJob]]:
-    """Build a pipeline which contains all component nodes defined in this function.
-
-    :param func: The user pipeline function to be decorated.
-    :type func: types.FunctionType
-    :keyword name: The name of pipeline component, defaults to function name.
-    :paramtype name: str
-    :keyword version: The version of pipeline component, defaults to "1".
-    :paramtype version: str
-    :keyword display_name: The display name of pipeline component, defaults to function name.
-    :paramtype display_name: str
-    :keyword description: The description of the built pipeline.
-    :paramtype description: str
-    :keyword experiment_name: Name of the experiment the job will be created under, \
-        if None is provided, experiment will be set to current directory.
-    :paramtype experiment_name: str
-    :keyword tags: The tags of pipeline component.
-    :paramtype tags: dict[str, str]
-    :keyword kwargs: A dictionary of additional configuration parameters.
-    :paramtype kwargs: dict
-
-    .. admonition:: Example:
-
-        .. literalinclude:: ../../../../samples/ml_samples_pipeline_job_configurations.py
-            :start-after: [START configure_pipeline]
-            :end-before: [END configure_pipeline]
-            :language: python
-            :dedent: 8
-            :caption: Shows how to create a pipeline using this decorator.
-
-    :return: Either
-        * A decorator, if `func` is None
-        * The decorated `func`
-    :rtype: Union[
-        Callable[[Callable], Callable[..., PipelineJob]],
-        Callable[P, PipelineJob]
-    ]
-    """
-    get_component = kwargs.get("get_component", False)
-
-    def pipeline_decorator(func: Callable[P, T]) -> Callable[P, PipelineJob]:
-        # pylint: disable=isinstance-second-argument-not-valid-type
-        if not isinstance(func, Callable):  # type: ignore
-            raise UserErrorException(f"Dsl pipeline decorator accept only function type, got {type(func)}.")
-
-        non_pipeline_inputs = kwargs.get("non_pipeline_inputs", []) or kwargs.get("non_pipeline_parameters", [])
-        # compute variable names changed from default_compute_target -> compute -> default_compute -> none
-        # to support legacy usage, we support them with priority.
-        compute = kwargs.get("compute", None)
-        default_compute_target = kwargs.get("default_compute_target", None)
-        default_compute_target = kwargs.get("default_compute", None) or default_compute_target
-        continue_on_step_failure = kwargs.get("continue_on_step_failure", None)
-        on_init = kwargs.get("on_init", None)
-        on_finalize = kwargs.get("on_finalize", None)
-
-        default_datastore = kwargs.get("default_datastore", None)
-        force_rerun = kwargs.get("force_rerun", None)
-        job_settings = {
-            "default_datastore": default_datastore,
-            "continue_on_step_failure": continue_on_step_failure,
-            "force_rerun": force_rerun,
-            "default_compute": default_compute_target,
-            "on_init": on_init,
-            "on_finalize": on_finalize,
-        }
-        func_entry_path = _resolve_source_file()
-        if not func_entry_path:
-            func_path = Path(inspect.getfile(func))
-            # in notebook, func_path may be a fake path and will raise error when trying to resolve this fake path
-            if func_path.exists():
-                func_entry_path = func_path.resolve().absolute()
-
-        job_settings = {k: v for k, v in job_settings.items() if v is not None}
-        pipeline_builder = PipelineComponentBuilder(
-            func=func,
-            name=name,
-            version=version,
-            display_name=display_name,
-            description=description,
-            default_datastore=default_datastore,
-            tags=tags,
-            source_path=str(func_entry_path),
-            non_pipeline_inputs=non_pipeline_inputs,
-        )
-
-        @wraps(func)
-        def wrapper(*args: P.args, **kwargs: P.kwargs) -> PipelineJob:
-            # Default args will be added here.
-            # pylint: disable=abstract-class-instantiated
-            # Note: push/pop stack here instead of put it inside build()
-            # Because we only want to enable dsl settings on top level pipeline
-            _dsl_settings_stack.push()  # use this stack to track on_init/on_finalize settings
-            try:
-                # Convert args to kwargs
-                provided_positional_kwargs = _validate_args(func, args, kwargs, non_pipeline_inputs)
-
-                # When pipeline supports variable params, update pipeline component to support the inputs in **kwargs.
-                pipeline_parameters = {
-                    k: v for k, v in provided_positional_kwargs.items() if k not in non_pipeline_inputs
-                }
-                pipeline_builder._update_inputs(pipeline_parameters)
-
-                non_pipeline_params_dict = {
-                    k: v for k, v in provided_positional_kwargs.items() if k in non_pipeline_inputs
-                }
-
-                # TODO: cache built pipeline component
-                pipeline_component = pipeline_builder.build(
-                    user_provided_kwargs=provided_positional_kwargs,
-                    non_pipeline_inputs_dict=non_pipeline_params_dict,
-                    non_pipeline_inputs=non_pipeline_inputs,
-                )
-            finally:
-                # use `finally` to ensure pop operation from the stack
-                dsl_settings = _dsl_settings_stack.pop()
-
-            # update on_init/on_finalize settings if init/finalize job is set
-            if dsl_settings.init_job_set:
-                job_settings["on_init"] = dsl_settings.init_job_name(pipeline_component.jobs)
-            if dsl_settings.finalize_job_set:
-                job_settings["on_finalize"] = dsl_settings.finalize_job_name(pipeline_component.jobs)
-
-            # TODO: pass compute & default_compute separately?
-            common_init_args = {
-                "experiment_name": experiment_name,
-                "component": pipeline_component,
-                "inputs": pipeline_parameters,
-                "tags": tags,
-            }
-            if _is_inside_dsl_pipeline_func() or get_component:
-                # on_init/on_finalize is not supported for pipeline component
-                if job_settings.get("on_init") is not None or job_settings.get("on_finalize") is not None:
-                    raise UserErrorException("On_init/on_finalize is not supported for pipeline component.")
-                # Build pipeline node instead of pipeline job if inside dsl.
-                built_pipeline = Pipeline(_from_component_func=True, **common_init_args)
-                if job_settings:
-                    module_logger.warning(
-                        ("Job settings %s on pipeline function %r are ignored when using inside PipelineJob."),
-                        job_settings,
-                        func.__name__,
-                    )
-            else:
-                built_pipeline = PipelineJob(
-                    jobs=pipeline_component.jobs,
-                    compute=compute,
-                    settings=PipelineJobSettings(**job_settings),
-                    **common_init_args,
-                )
-
-            return built_pipeline
-
-        wrapper._is_dsl_func = True  # type: ignore[attr-defined]
-        wrapper._job_settings = job_settings  # type: ignore[attr-defined]
-        wrapper._pipeline_builder = pipeline_builder  # type: ignore[attr-defined]
-        return wrapper
-
-    # enable use decorator without "()" if all arguments are default values
-    if func is not None:
-        return pipeline_decorator(func)
-    return pipeline_decorator
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/__init__.py
deleted file mode 100644
index 87592ba0f0af..000000000000
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
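# A hedged sketch of how the vendored @pipeline decorator above is consumed by
# the data_index_* builders later in this diff (some_component and the input
# name are hypothetical, loaded elsewhere):
from azure.ai.ml.entities._inputs_outputs import Input

@pipeline(name="my_ingest", compute="serverless", get_component=True)
def my_ingest(input_data: Input):
    step = some_component(input_data=input_data)  # some_component: assumed component object
    return {"out": step.outputs.out}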
-# ---------------------------------------------------------
-"""DataIndex entities."""
-
-__path__ = __import__("pkgutil").extend_path(__path__, __name__)
-
-from azure.ai.generative.index._dataindex.entities._assets import Data
-from azure.ai.generative.index._dataindex.entities.data_index import CitationRegex, DataIndex, Embedding, IndexSource, IndexStore
-from azure.ai.generative.index._dataindex.entities._builders.data_index_func import index_data
-
-__all__ = [
-    "DataIndex",
-    "IndexSource",
-    "Data",
-    "CitationRegex",
-    "Embedding",
-    "IndexStore",
-    "index_data",
-]
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/__init__.py
deleted file mode 100644
index e016d3136023..000000000000
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-
-__path__ = __import__("pkgutil").extend_path(__path__, __name__)
-
-from azure.ai.generative.index._dataindex.entities._assets._artifacts import Data
-
-__all__ = [
-    "Data",
-]
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/_artifacts/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/_artifacts/__init__.py
deleted file mode 100644
index fa5f7425e90f..000000000000
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/_artifacts/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-
-__path__ = __import__("pkgutil").extend_path(__path__, __name__)
-
-from azure.ai.generative.index._dataindex.entities._assets._artifacts.data import Data
-
-__all__ = [
-    "Data",
-]
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/_artifacts/data.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/_artifacts/data.py
deleted file mode 100644
index 25f83ac660e1..000000000000
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/_artifacts/data.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
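# With the namespace packages above, callers imported the public entities from
# one place, e.g. (sketch):
from azure.ai.generative.index._dataindex.entities import DataIndex, IndexSource, Embedding, IndexStore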
-# ---------------------------------------------------------
-
-from azure.ai.ml.entities._assets._artifacts.data import Data
-
-
-@classmethod  # type: ignore[misc]
-# TODO: Bug 2874139
-def _resolve_cls_and_type(cls, data, params_override):
-    from azure.ai.ml.entities._data_import.data_import import DataImport
-    from azure.ai.generative.index._dataindex.entities.data_index import DataIndex
-
-    if "index" in data:
-        return DataIndex, None
-
-    if "source" in data:
-        return DataImport, None
-    return cls, None
-
-
-# Override the _resolve_cls_and_type function in the Data class to support serialization of DataIndex
-Data._resolve_cls_and_type = _resolve_cls_and_type
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_builders/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_builders/__init__.py
deleted file mode 100644
index 624f5ee88ecf..000000000000
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_builders/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-
-
-__path__ = __import__("pkgutil").extend_path(__path__, __name__)
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_builders/data_index_func.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_builders/data_index_func.py
deleted file mode 100644
index edfa2f18b737..000000000000
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_builders/data_index_func.py
+++ /dev/null
@@ -1,773 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
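# Effect of the monkey-patch above (dicts here are my own minimal examples):
# Data deserialization dispatches on which keys are present in the loaded dict.
Data._resolve_cls_and_type({"index": {}}, None)   # -> (DataIndex, None)
Data._resolve_cls_and_type({"source": {}}, None)  # -> (DataImport, None)
Data._resolve_cls_and_type({"path": "."}, None)   # -> (Data, None)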
-# ---------------------------------------------------------
-# pylint: disable=protected-access
-# pylint: disable=no-member
-# pylint: disable=unused-argument
-
-import json
-import re
-from typing import Any, Callable, Dict, Optional, Tuple, Union
-
-from azure.ai.ml._utils._experimental import experimental
-from azure.ai.ml.constants._common import AssetTypes, LegacyAssetTypes
-from azure.ai.ml.entities import PipelineJob
-from azure.ai.ml.entities._builders.base_node import pipeline_node_decorator
-from azure.ai.ml.entities._credentials import ManagedIdentityConfiguration, UserIdentityConfiguration
-from azure.ai.ml.entities._inputs_outputs import Input, Output
-from azure.ai.ml.entities._job.pipeline._component_translatable import ComponentTranslatableMixin
-from azure.ai.ml.entities._job.pipeline._io import NodeOutput, PipelineInput
-from azure.ai.ml.entities._workspace.connections.workspace_connection import WorkspaceConnection
-from azure.ai.ml.exceptions import ErrorCategory, ErrorTarget, ValidationErrorType, ValidationException
-from azure.ai.generative.index._dataindex._schema._data_index import DataIndexTypes
-from azure.ai.generative.index._dataindex.constants._component import LLMRAGComponentUri
-from azure.ai.generative.index._dataindex.entities.data_index import DataIndex
-
-SUPPORTED_INPUTS = [
-    LegacyAssetTypes.PATH,
-    AssetTypes.URI_FILE,
-    AssetTypes.URI_FOLDER,
-    AssetTypes.MLTABLE,
-]
-
-
-def _parse_input(input_value):
-    component_input, job_input = None, None
-    if isinstance(input_value, Input):
-        component_input = Input(**input_value._to_dict())
-        input_type = input_value.type
-        if input_type in SUPPORTED_INPUTS:
-            job_input = Input(**input_value._to_dict())
-    elif isinstance(input_value, dict):
-        # if user provided dict, we try to parse it to Input.
-        # for job input, only parse for path type
-        input_type = input_value.get("type", None)
-        if input_type in SUPPORTED_INPUTS:
-            job_input = Input(**input_value)
-        component_input = Input(**input_value)
-    elif isinstance(input_value, str):
-        # Input bindings
-        component_input = ComponentTranslatableMixin._to_input_builder_function(input_value)
-        job_input = input_value
-    elif isinstance(input_value, (PipelineInput, NodeOutput)):
-        # datatransfer node can accept PipelineInput/NodeOutput for export task.
-        if input_value._data is None or isinstance(input_value._data, Output):
-            data = Input(type=input_value.type, mode=input_value.mode)
-        else:
-            data = input_value._data
-        component_input, _ = _parse_input(data)
-        job_input = input_value
-    else:
-        msg = (
-            f"Unsupported input type: {type(input_value)}, only Input, dict, str, PipelineInput and NodeOutput are "
-            f"supported."
-        )
-        raise ValidationException(
-            message=msg,
-            no_personal_data_message=msg,
-            target=ErrorTarget.JOB,
-            error_type=ValidationErrorType.INVALID_VALUE,
-        )
-    return component_input, job_input
-
-
-def _parse_output(output_value):
-    component_output, job_output = None, None
-    if isinstance(output_value, Output):
-        component_output = Output(**output_value._to_dict())
-        job_output = Output(**output_value._to_dict())
-    elif not output_value:
-        # output value can be None or empty dictionary
-        # None output value will be packed into a JobOutput object with mode = ReadWriteMount & type = UriFolder
-        component_output = ComponentTranslatableMixin._to_output(output_value)
-        job_output = output_value
-    elif isinstance(output_value, dict):  # When output value is a non-empty dictionary
-        job_output = Output(**output_value)
-        component_output = Output(**output_value)
-    elif isinstance(output_value, str):  # When output is passed in from pipeline job yaml
-        job_output = output_value
-    else:
-        msg = f"Unsupported output type: {type(output_value)}, only Output and dict are supported."
-        raise ValidationException(
-            message=msg,
-            no_personal_data_message=msg,
-            target=ErrorTarget.JOB,
-            error_type=ValidationErrorType.INVALID_VALUE,
-        )
-    return component_output, job_output
-
-
-def _parse_inputs_outputs(io_dict: Dict, parse_func: Callable) -> Tuple[Dict, Dict]:
-    component_io_dict, job_io_dict = {}, {}
-    if io_dict:
-        for key, val in io_dict.items():
-            component_io, job_io = parse_func(val)
-            component_io_dict[key] = component_io
-            job_io_dict[key] = job_io
-    return component_io_dict, job_io_dict
-
-
-def _build_data_index(io_dict: Union[Dict, DataIndex]):
-    if io_dict is None:
-        return io_dict
-    if isinstance(io_dict, DataIndex):
-        component_io = io_dict
-    else:
-        if isinstance(io_dict, dict):
-            component_io = DataIndex(**io_dict)
-        else:
-            msg = "data_index only supports dict and DataIndex"
-            raise ValidationException(
-                message=msg,
-                no_personal_data_message=msg,
-                target=ErrorTarget.DATA,
-                error_category=ErrorCategory.USER_ERROR,
-                error_type=ValidationErrorType.INVALID_VALUE,
-            )
-
-    return component_io
-
-
-@experimental
-@pipeline_node_decorator
-def index_data(
-    *,
-    data_index: DataIndex,
-    description: Optional[str] = None,
-    tags: Optional[Dict] = None,
-    name: Optional[str] = None,
-    display_name: Optional[str] = None,
-    experiment_name: Optional[str] = None,
-    compute: Optional[str] = None,
-    serverless_instance_type: Optional[str] = None,
-    ml_client: Optional[Any] = None,
-    identity: Optional[Union[ManagedIdentityConfiguration, UserIdentityConfiguration]] = None,
-    input_data_override: Optional[Input] = None,
-    **kwargs,
-) -> PipelineJob:
-    """
-    Create a PipelineJob object which can be used inside dsl.pipeline.
-
-    :keyword data_index: The data index configuration.
-    :type data_index: DataIndex
-    :keyword description: Description of the job.
-    :type description: str
-    :keyword tags: Tag dictionary. Tags can be added, removed, and updated.
-    :type tags: dict[str, str]
-    :keyword display_name: Display name of the job.
-    :type display_name: str
-    :keyword experiment_name: Name of the experiment the job will be created under.
-    :type experiment_name: str
-    :keyword compute: The compute resource the job runs on.
-    :type compute: str
-    :keyword serverless_instance_type: The instance type to use for serverless compute.
-    :type serverless_instance_type: Optional[str]
-    :keyword ml_client: The ml client to use for the job.
-    :type ml_client: Any
-    :keyword identity: Identity configuration for the job.
-    :type identity: Optional[Union[ManagedIdentityConfiguration, UserIdentityConfiguration]]
-    :keyword input_data_override: Input data override for the job.
-        Used to pipe output of step into DataIndex Job in a pipeline.
-    :type input_data_override: Optional[Input]
-    :return: A PipelineJob object.
-    :rtype: ~azure.ai.ml.entities.PipelineJob.
-    """
-    data_index = _build_data_index(data_index)
-
-    if data_index.index.type == DataIndexTypes.FAISS:
-        configured_component = data_index_faiss(
-            ml_client,
-            data_index,
-            description,
-            tags,
-            name,
-            display_name,
-            experiment_name,
-            compute,
-            serverless_instance_type,
-            identity,
-            input_data_override,
-        )
-    elif data_index.index.type == DataIndexTypes.ACS:
-        if kwargs.get("incremental_update", False):
-            configured_component = data_index_incremental_update_acs(
-                ml_client,
-                data_index,
-                description,
-                tags,
-                name,
-                display_name,
-                experiment_name,
-                compute,
-                serverless_instance_type,
-                identity,
-                input_data_override,
-            )
-        else:
-            configured_component = data_index_acs(
-                ml_client,
-                data_index,
-                description,
-                tags,
-                name,
-                display_name,
-                experiment_name,
-                compute,
-                serverless_instance_type,
-                identity,
-                input_data_override,
-            )
-    else:
-        raise ValueError(f"Unsupported index type: {data_index.index.type}")
-
-    configured_component.properties["azureml.mlIndexAssetName"] = data_index.name
-    configured_component.properties["azureml.mlIndexAssetKind"] = data_index.index.type
-    configured_component.properties["azureml.mlIndexAssetSource"] = "Data Asset"
-
-    return configured_component
-
-
-def data_index_incremental_update_acs(
-    ml_client: Any,
-    data_index: DataIndex,
-    description: Optional[str] = None,
-    tags: Optional[Dict] = None,
-    name: Optional[str] = None,
-    display_name: Optional[str] = None,
-    experiment_name: Optional[str] = None,
-    compute: Optional[str] = None,
-    serverless_instance_type: Optional[str] = None,
-    identity: Optional[Union[ManagedIdentityConfiguration, UserIdentityConfiguration]] = None,
-    input_data_override: Optional[Input] = None,
-):
-    from azure.ai.generative.index._dataindex.data_index.models import build_model_protocol
-    from azure.ai.generative.index._dataindex.dsl._pipeline_decorator import pipeline
-
-    crack_and_chunk_and_embed_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_CRACK_AND_CHUNK_AND_EMBED)
-    update_acs_index_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_UPDATE_ACS_INDEX)
-    register_mlindex_asset_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_REGISTER_MLINDEX_ASSET)
-
-    @pipeline(
-        name=name if name else "data_index_incremental_update_acs",
-        description=description,
-        tags=tags,
-        display_name=display_name if display_name else "LLM - Data to ACS (Incremental Update)",
-        experiment_name=experiment_name,
-        compute=compute,
-        get_component=True,
-    )
-    def data_index_acs_pipeline(
-        input_data: Input,
-        embeddings_model: str,
-        acs_config: str,
-        acs_connection_id: str,
-        aoai_connection_id: str,
-        embeddings_container: Input,
-        chunk_size: int = 768,
-        chunk_overlap: Optional[int] = 0,
-        input_glob: Optional[str] = "**/*",
-        citation_url: Optional[str] = None,
-        citation_replacement_regex: Optional[str] = None,
-    ):
-        """
-        Generate embeddings for an `input_data` source and push them into an Azure Cognitive Search index.
-
-        :param input_data: The input data to be indexed.
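# A hedged end-to-end sketch of the index_data entry point defined above
# (connection names, paths, and deployment names are hypothetical):
index_job = index_data(
    data_index=DataIndex(
        name="product-docs",
        source=IndexSource(input_data=Data(type="uri_folder", path="./docs"), input_glob="**/*.md"),
        embedding=Embedding(
            model="azure_open_ai://deployment/ada/model/text-embedding-ada-002",
            connection="azureml:my_aoai_connection",
        ),
        index=IndexStore(type="acs", connection="azureml:my_acs_connection"),
    ),
    ml_client=ml_client,  # ml_client: assumed, an authenticated MLClient
)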
-        :type input_data: Input
-        :param embeddings_model: The embedding model to use when processing source data chunks.
-        :type embeddings_model: str
-        :param acs_config: The configuration for the Azure Cognitive Search index.
-        :type acs_config: str
-        :param acs_connection_id: The connection ID for the Azure Cognitive Search index.
-        :type acs_connection_id: str
-        :param chunk_size: The size of the chunks to break the input data into.
-        :type chunk_size: int
-        :param chunk_overlap: The number of tokens to overlap between chunks.
-        :type chunk_overlap: Optional[int]
-        :param input_glob: The glob pattern to use when searching for input data.
-        :type input_glob: Optional[str]
-        :param citation_url: The URL to use when generating citations for the input data.
-        :type citation_url: str
-        :param citation_replacement_regex: The regex to use when generating citations for the input data.
-        :type citation_replacement_regex: str
-        :param aoai_connection_id: The connection ID for the Azure Open AI service.
-        :type aoai_connection_id: str
-        :param embeddings_container: The container to use when caching embeddings.
-        :type embeddings_container: Input
-        :return: The URI of the generated Azure Cognitive Search index.
-        :rtype: str.
-        """
-        if input_glob is None:
-            input_glob = "**/*"
-        if chunk_overlap is None:
-            chunk_overlap = 0
-
-        crack_and_chunk_and_embed = crack_and_chunk_and_embed_component(
-            input_data=input_data,
-            input_glob=input_glob,
-            chunk_size=chunk_size,
-            chunk_overlap=chunk_overlap,
-            citation_url=citation_url,
-            citation_replacement_regex=citation_replacement_regex,
-            embeddings_container=embeddings_container,
-            embeddings_model=embeddings_model,
-            embeddings_connection_id=aoai_connection_id,
-        )
-        if compute is None or compute == "serverless":
-            use_automatic_compute(crack_and_chunk_and_embed, instance_type=serverless_instance_type)
-        if optional_pipeline_input_provided(embeddings_container):
-            crack_and_chunk_and_embed.outputs.embeddings = Output(
-                type="uri_folder", path=f"{embeddings_container.path}/{{name}}"
-            )
-        if identity:
-            crack_and_chunk_and_embed.identity = identity
-
-        update_acs_index = update_acs_index_component(
-            embeddings=crack_and_chunk_and_embed.outputs.embeddings, acs_config=acs_config
-        )
-        if compute is None or compute == "serverless":
-            use_automatic_compute(update_acs_index, instance_type=serverless_instance_type)
-        update_acs_index.environment_variables["AZUREML_WORKSPACE_CONNECTION_ID_ACS"] = acs_connection_id
-        if identity:
-            update_acs_index.identity = identity
-
-        register_mlindex_asset = register_mlindex_asset_component(
-            storage_uri=update_acs_index.outputs.index,
-            asset_name=data_index.name,
-        )
-        if compute is None or compute == "serverless":
-            use_automatic_compute(register_mlindex_asset, instance_type=serverless_instance_type)
-        if identity:
-            register_mlindex_asset.identity = identity
-        return {
-            "mlindex_asset_uri": update_acs_index.outputs.index,
-            "mlindex_asset_id": register_mlindex_asset.outputs.asset_id,
-        }
-
-    if input_data_override is not None:
-        input_data = input_data_override
-    else:
-        input_data = Input(type=data_index.source.input_data.type, path=data_index.source.input_data.path)
-
-    acs_config = {
-        "index_name": data_index.index.name if data_index.index.name is not None else data_index.name,
-        "full_sync": True,
-    }
-    if data_index.index.config is not None:
-        acs_config.update(data_index.index.config)
-
-    component = data_index_acs_pipeline(
-        input_data=input_data,
-        input_glob=data_index.source.input_glob,
-        chunk_size=data_index.source.chunk_size,  # type: ignore[arg-type]
-        chunk_overlap=data_index.source.chunk_overlap,
-        citation_url=data_index.source.citation_url,
-        citation_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
-        if data_index.source.citation_url_replacement_regex
-        else None,
-        embeddings_model=build_model_protocol(data_index.embedding.model),
-        aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection),
-        embeddings_container=Input(type=AssetTypes.URI_FOLDER, path=data_index.embedding.cache_path) if data_index.embedding.cache_path else None,
-        acs_config=json.dumps(acs_config),
-        acs_connection_id=_resolve_connection_id(ml_client, data_index.index.connection),
-    )
-    # Hack until full Component classes are implemented that can annotate the optional parameters properly
-    component.inputs["input_glob"]._meta.optional = True
-    component.inputs["chunk_size"]._meta.optional = True
-    component.inputs["chunk_overlap"]._meta.optional = True
-    component.inputs["citation_url"]._meta.optional = True
-    component.inputs["citation_replacement_regex"]._meta.optional = True
-    component.inputs["aoai_connection_id"]._meta.optional = True
-    component.inputs["embeddings_container"]._meta.optional = True
-
-    if data_index.path:
-        component.outputs.mlindex_asset_uri = Output(type=AssetTypes.URI_FOLDER, path=data_index.path)
-
-    return component
-
-
-def data_index_faiss(
-    ml_client: Any,
-    data_index: DataIndex,
-    description: Optional[str] = None,
-    tags: Optional[Dict] = None,
-    name: Optional[str] = None,
-    display_name: Optional[str] = None,
-    experiment_name: Optional[str] = None,
-    compute: Optional[str] = None,
-    serverless_instance_type: Optional[str] = None,
-    identity: Optional[Union[ManagedIdentityConfiguration, UserIdentityConfiguration]] = None,
-    input_data_override: Optional[Input] = None,
-):
-    from azure.ai.generative.index._dataindex.data_index.models import build_model_protocol
-    from azure.ai.generative.index._dataindex.dsl._pipeline_decorator import pipeline
-
-    crack_and_chunk_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_CRACK_AND_CHUNK)
-    generate_embeddings_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_GENERATE_EMBEDDINGS)
-    create_faiss_index_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_CREATE_FAISS_INDEX)
-    register_mlindex_asset_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_REGISTER_MLINDEX_ASSET)
-
-    @pipeline(
-        name=name if name else "data_index_faiss",
-        description=description,
-        tags=tags,
-        display_name=display_name if display_name else "LLM - Data to Faiss",
-        experiment_name=experiment_name,
-        compute=compute,
-        get_component=True,
-    )
-    def data_index_faiss_pipeline(
-        input_data: Input,
-        embeddings_model: str,
-        embeddings_container: Input,
-        chunk_size: int = 1024,
-        data_source_glob: str = None,  # type: ignore[assignment]
-        data_source_url: str = None,  # type: ignore[assignment]
-        document_path_replacement_regex: str = None,  # type: ignore[assignment]
-        aoai_connection_id: str = None,  # type: ignore[assignment]
-    ):
-        """
-        Generate embeddings for an `input_data` source and create a Faiss index from them.
-
-        :param input_data: The input data to be indexed.
-        :type input_data: Input
-        :param embeddings_model: The embedding model to use when processing source data chunks.
-        :type embeddings_model: str
-        :param chunk_size: The size of the chunks to break the input data into.
-        :type chunk_size: Optional[int]
-        :param data_source_glob: The glob pattern to use when searching for input data.
-        :type data_source_glob: str
-        :param data_source_url: The URL to use when generating citations for the input data.
-        :type data_source_url: str
-        :param document_path_replacement_regex: The regex to use when generating citations for the input data.
-        :type document_path_replacement_regex: str
-        :param aoai_connection_id: The connection ID for the Azure Open AI service.
-        :type aoai_connection_id: str
-        :param embeddings_container: The container to use when caching embeddings.
-        :type embeddings_container: Input
-        :return: The URI of the generated Faiss index.
-        :rtype: str.
-        """
-        if chunk_size is None:
-            chunk_size = 1024
-
-        crack_and_chunk = crack_and_chunk_component(
-            input_data=input_data,
-            input_glob=data_source_glob,
-            chunk_size=chunk_size,
-            data_source_url=data_source_url,
-            document_path_replacement_regex=document_path_replacement_regex,
-        )
-        if compute is None or compute == "serverless":
-            use_automatic_compute(crack_and_chunk, instance_type=serverless_instance_type)
-        if identity:
-            crack_and_chunk.identity = identity
-
-        generate_embeddings = generate_embeddings_component(
-            chunks_source=crack_and_chunk.outputs.output_chunks,
-            embeddings_container=embeddings_container,
-            embeddings_model=embeddings_model,
-        )
-        if compute is None or compute == "serverless":
-            use_automatic_compute(generate_embeddings, instance_type=serverless_instance_type)
-        if optional_pipeline_input_provided(aoai_connection_id):
-            generate_embeddings.environment_variables["AZUREML_WORKSPACE_CONNECTION_ID_AOAI"] = aoai_connection_id
-        if optional_pipeline_input_provided(embeddings_container):
-            generate_embeddings.outputs.embeddings = Output(
-                type="uri_folder", path=f"{embeddings_container.path}/{{name}}"
-            )
-        if identity:
-            generate_embeddings.identity = identity
-
-        create_faiss_index = create_faiss_index_component(embeddings=generate_embeddings.outputs.embeddings)
-        if compute is None or compute == "serverless":
-            use_automatic_compute(create_faiss_index, instance_type=serverless_instance_type)
-        if identity:
-            create_faiss_index.identity = identity
-
-        register_mlindex_asset = register_mlindex_asset_component(
-            storage_uri=create_faiss_index.outputs.index,
-            asset_name=data_index.name,
-        )
-        if compute is None or compute == "serverless":
-            use_automatic_compute(register_mlindex_asset, instance_type=serverless_instance_type)
-        if identity:
-            register_mlindex_asset.identity = identity
-        return {
-            "mlindex_asset_uri": create_faiss_index.outputs.index,
-            "mlindex_asset_id": register_mlindex_asset.outputs.asset_id,
-        }
-
-    if input_data_override is not None:
-        input_data = input_data_override
-    else:
-        input_data = Input(type=data_index.source.input_data.type, path=data_index.source.input_data.path)
-
-    component = data_index_faiss_pipeline(
-        input_data=input_data,
-        embeddings_model=build_model_protocol(data_index.embedding.model),
-        chunk_size=data_index.source.chunk_size,  # type: ignore[arg-type]
-        data_source_glob=data_index.source.input_glob,  # type: ignore[arg-type]
-        data_source_url=data_index.source.citation_url,  # type: ignore[arg-type]
-        document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())  # type: ignore[arg-type]
-        if data_index.source.citation_url_replacement_regex
-        else None,
-        aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection),
-        embeddings_container=Input(type=AssetTypes.URI_FOLDER, path=data_index.embedding.cache_path) if data_index.embedding.cache_path else None,
-    )
-    # Hack until full Component classes are implemented that can annotate the optional parameters properly
-    component.inputs["data_source_glob"]._meta.optional = True
-    component.inputs["data_source_url"]._meta.optional = True
-    component.inputs["document_path_replacement_regex"]._meta.optional = True
-    component.inputs["aoai_connection_id"]._meta.optional = True
-    component.inputs["embeddings_container"]._meta.optional = True
-    if data_index.path:
-        component.outputs.mlindex_asset_uri = Output(type=AssetTypes.URI_FOLDER, path=data_index.path)
-
-    return component
-
-
-def data_index_acs(
-    ml_client: Any,
-    data_index: DataIndex,
-    description: Optional[str] = None,
-    tags: Optional[Dict] = None,
-    name: Optional[str] = None,
-    display_name: Optional[str] = None,
-    experiment_name: Optional[str] = None,
-    compute: Optional[str] = None,
-    serverless_instance_type: Optional[str] = None,
-    identity: Optional[Union[ManagedIdentityConfiguration, UserIdentityConfiguration]] = None,
-    input_data_override: Optional[Input] = None,
-):
-    from azure.ai.generative.index._dataindex.data_index.models import build_model_protocol
-    from azure.ai.generative.index._dataindex.dsl._pipeline_decorator import pipeline
-
-    crack_and_chunk_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_CRACK_AND_CHUNK)
-    generate_embeddings_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_GENERATE_EMBEDDINGS)
-    update_acs_index_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_UPDATE_ACS_INDEX)
-    register_mlindex_asset_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_REGISTER_MLINDEX_ASSET)
-
-    @pipeline(
-        name=name if name else "data_index_acs",
-        description=description,
-        tags=tags,
-        display_name=display_name if display_name else "LLM - Data to ACS",
-        experiment_name=experiment_name,
-        compute=compute,
-        get_component=True,
-    )
-    def data_index_acs_pipeline(
-        input_data: Input,
-        embeddings_model: str,
-        acs_config: str,
-        acs_connection_id: str,
-        embeddings_container: Input,
-        chunk_size: int = 1024,
-        data_source_glob: str = None,  # type: ignore[assignment]
-        data_source_url: str = None,  # type: ignore[assignment]
-        document_path_replacement_regex: str = None,  # type: ignore[assignment]
-        aoai_connection_id: str = None,  # type: ignore[assignment]
-    ):
-        """
-        Generate embeddings for an `input_data` source and push them into an Azure Cognitive Search index.
-
-        :param input_data: The input data to be indexed.
-        :type input_data: Input
-        :param embeddings_model: The embedding model to use when processing source data chunks.
-        :type embeddings_model: str
-        :param acs_config: The configuration for the Azure Cognitive Search index.
-        :type acs_config: str
-        :param acs_connection_id: The connection ID for the Azure Cognitive Search index.
-        :type acs_connection_id: str
-        :param chunk_size: The size of the chunks to break the input data into.
-        :type chunk_size: Optional[int]
-        :param data_source_glob: The glob pattern to use when searching for input data.
-        :type data_source_glob: str
-        :param data_source_url: The URL to use when generating citations for the input data.
-        :type data_source_url: str
-        :param document_path_replacement_regex: The regex to use when generating citations for the input data.
-        :type document_path_replacement_regex: str
-        :param aoai_connection_id: The connection ID for the Azure Open AI service.
-        :type aoai_connection_id: str
-        :param embeddings_container: The container to use when caching embeddings.
-        :type embeddings_container: Input
-        :return: The URI of the generated Azure Cognitive Search index.
-        :rtype: str.
-        """
-        if chunk_size is None:
-            chunk_size = 1024
-
-        crack_and_chunk = crack_and_chunk_component(
-            input_data=input_data,
-            input_glob=data_source_glob,
-            chunk_size=chunk_size,
-            data_source_url=data_source_url,
-            document_path_replacement_regex=document_path_replacement_regex,
-        )
-        if compute is None or compute == "serverless":
-            use_automatic_compute(crack_and_chunk, instance_type=serverless_instance_type)
-        if identity:
-            crack_and_chunk.identity = identity
-
-        generate_embeddings = generate_embeddings_component(
-            chunks_source=crack_and_chunk.outputs.output_chunks,
-            embeddings_container=embeddings_container,
-            embeddings_model=embeddings_model,
-        )
-        if compute is None or compute == "serverless":
-            use_automatic_compute(generate_embeddings, instance_type=serverless_instance_type)
-        if optional_pipeline_input_provided(aoai_connection_id):
-            generate_embeddings.environment_variables["AZUREML_WORKSPACE_CONNECTION_ID_AOAI"] = aoai_connection_id
-        if optional_pipeline_input_provided(embeddings_container):
-            generate_embeddings.outputs.embeddings = Output(
-                type="uri_folder", path=f"{embeddings_container.path}/{{name}}"
-            )
-        if identity:
-            generate_embeddings.identity = identity
-
-        update_acs_index = update_acs_index_component(
-            embeddings=generate_embeddings.outputs.embeddings, acs_config=acs_config
-        )
-        if compute is None or compute == "serverless":
-            use_automatic_compute(update_acs_index, instance_type=serverless_instance_type)
-        update_acs_index.environment_variables["AZUREML_WORKSPACE_CONNECTION_ID_ACS"] = acs_connection_id
-        if identity:
-            update_acs_index.identity = identity
-
-        register_mlindex_asset = register_mlindex_asset_component(
-            storage_uri=update_acs_index.outputs.index,
-            asset_name=data_index.name,
-        )
-        if compute is None or compute == "serverless":
-            use_automatic_compute(register_mlindex_asset, instance_type=serverless_instance_type)
-        if identity:
-            register_mlindex_asset.identity = identity
-        return {
-            "mlindex_asset_uri": update_acs_index.outputs.index,
-            "mlindex_asset_id": register_mlindex_asset.outputs.asset_id,
-        }
-
-    if input_data_override is not None:
-        input_data = input_data_override
-    else:
-        input_data = Input(type=data_index.source.input_data.type, path=data_index.source.input_data.path)
-
-    acs_config = {
-        "index_name": data_index.index.name if data_index.index.name is not None else data_index.name,
-    }
-    if data_index.index.config is not None:
-        acs_config.update(data_index.index.config)
-
-    component = data_index_acs_pipeline(
-        input_data=input_data,
-        embeddings_model=build_model_protocol(data_index.embedding.model),
-        acs_config=json.dumps(acs_config),
-        acs_connection_id=_resolve_connection_id(ml_client, data_index.index.connection),
-        chunk_size=data_index.source.chunk_size,  # type: ignore[arg-type]
-        data_source_glob=data_index.source.input_glob,  # type: ignore[arg-type]
-        data_source_url=data_index.source.citation_url,  # type: ignore[arg-type]
-        document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())  # type: ignore[arg-type]
-        if data_index.source.citation_url_replacement_regex
-        else None,
aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection), - embeddings_container=Input(type=AssetTypes.URI_FOLDER, path=data_index.embedding.cache_path) if data_index.embedding.cache_path else None, - ) - # Hack until full Component classes are implemented that can annotate the optional parameters properly - component.inputs["data_source_glob"]._meta.optional = True - component.inputs["data_source_url"]._meta.optional = True - component.inputs["document_path_replacement_regex"]._meta.optional = True - component.inputs["aoai_connection_id"]._meta.optional = True - component.inputs["embeddings_container"]._meta.optional = True - - if data_index.path: - component.outputs.mlindex_asset_uri = Output(type=AssetTypes.URI_FOLDER, path=data_index.path) - - return component - - -def optional_pipeline_input_provided(input: Optional[PipelineInput]): - """ - Check whether an optional pipeline input is provided. - - :param input: The pipeline input to check. - :type input: Optional[PipelineInput] - :return: True if the input is not None and has a value, False otherwise. - :rtype: bool. - """ - return input is not None and input._data is not None - - -def use_automatic_compute(component, instance_count=1, instance_type=None): - """ - Configure input `component` to use automatic compute with `instance_count` and `instance_type`. - - This avoids the need to provision a compute cluster to run the component. - :param component: The component to configure. - :type component: Any - :param instance_count: The number of instances to use. - :type instance_count: int - :param instance_type: The type of instance to use. - :type instance_type: str - :return: The configured component. - :rtype: Any. - """ - component.set_resources( - instance_count=instance_count, - instance_type=instance_type, - properties={"compute_specification": {"automatic": True}}, - ) - return component - - -def get_component_obj(ml_client, component_uri): - from azure.ai.ml import MLClient - - if not isinstance(component_uri, str): - # Assume Component object - return component_uri - - matches = re.match( - r"azureml://registries/(?P<registry_name>.*)/components/(?P<component_name>.*)" - r"/(?P<identifier_type>.*)/(?P<identifier_name>.*)", - component_uri, - ) - if matches is None: - from azure.ai.ml import load_component - - # Assume local path to component - return load_component(source=component_uri) - - registry_name = matches.group("registry_name") - registry_client = MLClient( - subscription_id=ml_client.subscription_id, - resource_group_name=ml_client.resource_group_name, - credential=ml_client._credential, - registry_name=registry_name, - ) - component_obj = registry_client.components.get( - matches.group("component_name"), - **{matches.group("identifier_type").rstrip("s"): matches.group("identifier_name")}, - ) - return component_obj - - -def _resolve_connection_id(ml_client, connection: Optional[Union[str, WorkspaceConnection]] = None) -> str: - if connection is None: - return "" - - if isinstance(connection, str): - short_form = re.match(r"azureml:(?P<connection_name>[^/]*)", connection) - if short_form: - connection_name = short_form.group("connection_name") - else: - # TODO: Handle long form connection sub/rg/ws, ideally reuse logic implemented by connections code.
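            # Editorial note, for illustration only (hypothetical names): the short-form
            # pattern above matches "azureml:my-aoai", while the long-form pattern below matches resource IDs like
            # "/subscriptions/<sub>/resourceGroups/<rg>/providers/Microsoft.MachineLearningServices/workspaces/<ws>/connections/my-aoai";
            # both resolve to connection_name "my-aoai".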
- long_form = re.match(r"(azureml:/)?/.*/connections/(?P<connection_name>[^/]*)", connection) - connection_name = long_form.group("connection_name") if long_form else connection - - connection = ml_client.connections.get(connection_name) - elif hasattr(connection, "_workspace_connection"): - # Handle azure.ai.generative Connections - connection = connection._workspace_connection - - return connection.id diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/data_index.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/data_index.py deleted file mode 100644 index 3351c18af283..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/data_index.py +++ /dev/null @@ -1,278 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- -"""DataIndex entities.""" - -# pylint: disable=no-member - -from os import PathLike -from pathlib import Path -from typing import Dict, Optional, Union - -from azure.ai.generative.index._dataindex._schema._data_index import DataIndexTypes -from azure.ai.ml._utils._experimental import experimental -from azure.ai.ml.constants._common import BASE_PATH_CONTEXT_KEY, PARAMS_OVERRIDE_KEY -from azure.ai.ml.entities._assets import Data -from azure.ai.ml.entities._inputs_outputs.utils import _remove_empty_values -from azure.ai.ml.entities._mixins import DictMixin -from azure.ai.ml.entities._util import load_from_dict - - -@experimental -class CitationRegex(DictMixin): - """ - :keyword match_pattern: Regex to match citation in the citation_url + input file path. - e.g. '(.*)/articles/(.*)(\\.[^.]+)$'. - :type match_pattern: str - :keyword replacement_pattern: Replacement string for citation. e.g. '\\1/\\2'. - :type replacement_pattern: str - """ - - def __init__( - self, - *, - match_pattern: str, - replacement_pattern: str, - ) -> None: - """Initialize a CitationRegex object.""" - self.match_pattern = match_pattern - self.replacement_pattern = replacement_pattern - - def _to_dict(self) -> Dict: - """Convert the CitationRegex object to a dict. - :return: The dictionary representation of the class - :rtype: Dict - """ - keys = [ - "match_pattern", - "replacement_pattern", - ] - result = {key: getattr(self, key) for key in keys} - return _remove_empty_values(result) - - -@experimental -class IndexSource(DictMixin): - """Configuration for the source data to be processed and written to an index. - :keyword input_data: Input Data to index files from. MLTable type inputs will use `mode: eval_mount`. - :type input_data: Data - :keyword input_glob: Glob pattern to select which files under `input_data` are processed. - :type input_glob: str, optional - :keyword chunk_size: Maximum number of tokens to put in each chunk. - :type chunk_size: int, optional - :keyword chunk_overlap: Number of tokens to overlap between chunks. - :type chunk_overlap: int, optional - :keyword citation_url: Base URL to join with file paths to create full source file URL for chunk metadata. - :type citation_url: str, optional - :keyword citation_url_replacement_regex: Regex match and replacement patterns for citation url. Useful if the paths - in `input_data` don't match the desired citation format.
- :type citation_url_replacement_regex: CitationRegex, optional - :raises ~azure.ai.ml.exceptions.ValidationException: Raised if the IndexSource object cannot be validated. - Details will be provided in the error message. - """ - - def __init__( - self, - *, - input_data: Data, - input_glob: Optional[str] = None, - chunk_size: Optional[int] = None, - chunk_overlap: Optional[int] = None, - citation_url: Optional[str] = None, - citation_url_replacement_regex: Optional[CitationRegex] = None, - ) -> None: - """Initialize an IndexSource object.""" - self.input_data = input_data - self.input_glob = input_glob - self.chunk_size = chunk_size - self.chunk_overlap = chunk_overlap - self.citation_url = citation_url - self.citation_url_replacement_regex = citation_url_replacement_regex - - def _to_dict(self) -> Dict: - """Convert the IndexSource object to a dict. - :return: The dictionary representation of the class - :rtype: Dict - """ - keys = [ - "input_data", - "input_glob", - "chunk_size", - "chunk_overlap", - "citation_url", - "citation_url_replacement_regex", - ] - result = {key: getattr(self, key) for key in keys} - return _remove_empty_values(result) - - -@experimental -class Embedding(DictMixin): - """Configuration for the embedding model used to process source data chunks. - :keyword model: The model to use to embed data. E.g. 'hugging_face://model/sentence-transformers/all-mpnet-base-v2' - or 'azure_open_ai://deployment/{deployment_name}/model/{model_name}' - :type model: str - :keyword connection: Connection reference to use for embedding model information, - only needed for hosted embeddings models (such as Azure OpenAI). - :type connection: str, optional - :keyword cache_path: Folder containing previously generated embeddings. - Should be parent folder of the 'embeddings' output path used for this component. - Will compare input data to existing embeddings and only embed changed/new data, reusing existing chunks. - :type cache_path: str, optional - :raises ~azure.ai.ml.exceptions.ValidationException: Raised if the Embedding object cannot be validated. - Details will be provided in the error message. - """ - - def __init__( - self, - *, - model: str, - connection: Optional[str] = None, - cache_path: Optional[str] = None, - ) -> None: - """Initialize an Embedding object.""" - self.model = model - self.connection = connection - self.cache_path = cache_path - - def _to_dict(self) -> Dict: - """Convert the Embedding object to a dict. - :return: The dictionary representation of the class - :rtype: Dict - """ - keys = [ - "model", - "connection", - "cache_path", - ] - result = {key: getattr(self, key) for key in keys} - return _remove_empty_values(result) - - -@experimental -class IndexStore(DictMixin): - """Configuration for the destination index to write processed data to. - :keyword type: The type of index to write to. Currently supported types are 'acs' and 'faiss'. - :type type: str - :keyword name: Name of index to update/create, only needed for hosted indexes (such as Azure Cognitive Search). - :type name: str, optional - :keyword connection: Connection reference to use for index information, - only needed for hosted indexes (such as Azure Cognitive Search). - :type connection: str, optional - :keyword config: Configuration for the index. Primary use is to configure Azure Cognitive Search specific settings, - such as custom `field_mapping` for known field types.
- :type config: dict, optional - :raises ~azure.ai.ml.exceptions.ValidationException: Raised if the IndexStore object cannot be validated. - Details will be provided in the error message. - """ - - def __init__( - self, - *, - type: str = DataIndexTypes.FAISS, - name: Optional[str] = None, - connection: Optional[str] = None, - config: Optional[Dict] = None, - ) -> None: - """Initialize an IndexStore object.""" - self.type = type - self.name = name - self.connection = connection - self.config = config - - def _to_dict(self) -> Dict: - """Convert the IndexStore object to a dict. - :return: The dictionary representation of the class - :rtype: Dict - """ - keys = [ - "type", - "name", - "connection", - "config" - ] - result = {key: getattr(self, key) for key in keys} - return _remove_empty_values(result) - - -@experimental -class DataIndex(Data): - """Data asset created by a data index job. - :param name: Name of the asset. - :type name: str - :param path: The path to the asset being created by the data index job. - :type path: str - :param source: The source data to be indexed. - :type source: IndexSource - :param embedding: The embedding model to use when processing source data chunks. - :type embedding: Embedding - :param index: The destination index to write processed data to. - :type index: IndexStore - :param incremental_update: Whether to update the index incrementally or not. - :type incremental_update: bool - :param version: Version of the asset created by running this DataIndex Job. - :type version: str - :param description: Description of the resource. - :type description: str - :param tags: Tag dictionary. Tags can be added, removed, and updated. - :type tags: dict[str, str] - :param properties: The asset property dictionary. - :type properties: dict[str, str] - :param kwargs: A dictionary of additional configuration parameters.
- :type kwargs: dict - """ - - def __init__( - self, - *, - name: str, - source: IndexSource, - embedding: Embedding, - index: IndexStore, - incremental_update: bool = False, - path: Optional[str] = None, - version: Optional[str] = None, - description: Optional[str] = None, - tags: Optional[Dict] = None, - properties: Optional[Dict] = None, - **kwargs, - ) -> None: - """Initialize a DataIndex object.""" - super().__init__( - name=name, - version=version, - description=description, - tags=tags, - properties=properties, - path=path, - **kwargs, - ) - self.source = source - self.embedding = embedding - self.index = index - self.incremental_update = incremental_update - - @classmethod - def _load( - cls, - data: Optional[Dict] = None, - yaml_path: Optional[Union[PathLike, str]] = None, - params_override: Optional[list] = None, - **kwargs, - ) -> "DataIndex": - from azure.ai.generative.index._dataindex._schema._data_index import DataIndexSchema - - data = data or {} - params_override = params_override or [] - context = { - BASE_PATH_CONTEXT_KEY: Path(yaml_path).parent if yaml_path else Path("./"), - PARAMS_OVERRIDE_KEY: params_override, - } - return load_from_dict(DataIndexSchema, data, context, **kwargs) - - def _to_dict(self) -> Dict: - # pylint: disable=no-member - from azure.ai.generative.index._dataindex._schema._data_index import DataIndexSchema - - return DataIndexSchema(context={BASE_PATH_CONTEXT_KEY: "./"}).dump(self) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/operations/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/operations/__init__.py deleted file mode 100644 index d058f334e5ac..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/operations/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- -"""DataIndex operations.""" - -__path__ = __import__("pkgutil").extend_path(__path__, __name__) - -from azure.ai.generative.index._dataindex.operations._data_operations import DataOperations - -__all__ = [ - "DataOperations", -] diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/operations/_data_operations.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/operations/_data_operations.py deleted file mode 100644 index 4e7da155c15c..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/operations/_data_operations.py +++ /dev/null @@ -1,115 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. 
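# Editorial sketch (not part of the original files): a minimal example of how the
# DataIndex entities defined above compose into an index request. Every concrete
# value below (names, paths, deployment, connections) is a hypothetical placeholder.
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Data

from azure.ai.generative.index._dataindex._schema._data_index import DataIndexTypes
from azure.ai.generative.index._dataindex.entities.data_index import (
    DataIndex, Embedding, IndexSource, IndexStore,
)

example_index = DataIndex(
    name="products-index",  # hypothetical asset name
    source=IndexSource(
        input_data=Data(type=AssetTypes.URI_FOLDER, path="./product_docs"),  # hypothetical local folder
        input_glob="**/*.md",
        chunk_size=1024,
    ),
    embedding=Embedding(
        # Model URI follows the 'azure_open_ai://deployment/{deployment_name}/model/{model_name}'
        # protocol documented in the Embedding docstring above.
        model="azure_open_ai://deployment/ada-002/model/text-embedding-ada-002",
        connection="azureml:my-aoai-connection",  # hypothetical connection name
    ),
    index=IndexStore(type=DataIndexTypes.ACS, connection="azureml:my-acs-connection"),
)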
-# --------------------------------------------------------- - -# pylint: disable=protected-access -# pylint: disable=no-member - -from typing import Optional, Union - -from azure.ai.ml._telemetry import ActivityType, monitor_with_activity -from azure.ai.ml._utils._asset_utils import ( - _validate_auto_delete_setting_in_data_output, - _validate_workspace_managed_datastore, -) -from azure.ai.ml._utils._experimental import experimental -from azure.ai.ml.constants._common import ( - AssetTypes, - AzureMLResourceType, -) -from azure.ai.ml.entities import PipelineJob, PipelineJobSettings -from azure.ai.ml.entities._credentials import ManagedIdentityConfiguration, UserIdentityConfiguration -from azure.ai.ml.entities._inputs_outputs import Input -from azure.ai.ml.operations._data_operations import DataOperations, logger -from azure.ai.generative.index._dataindex.data_index import index_data as index_data_func -from azure.ai.generative.index._dataindex.entities.data_index import DataIndex - - -@monitor_with_activity(logger, "Data.IndexData", ActivityType.PUBLICAPI) -@experimental -def index_data( - self, - data_index: DataIndex, - identity: Optional[Union[ManagedIdentityConfiguration, UserIdentityConfiguration]] = None, - compute: str = "serverless", - serverless_instance_type: Optional[str] = None, - input_data_override: Optional[Input] = None, - submit_job: bool = True, - **kwargs, -) -> PipelineJob: - """ - Returns the data index job that is creating the data asset. - - :param data_index: DataIndex object. - :type data_index: azure.ai.generative.index._dataindex.entities.DataIndex - :param identity: Identity configuration for the job. - :type identity: Optional[Union[ManagedIdentityConfiguration, UserIdentityConfiguration]] - :param compute: The compute target to use for the job. Default: "serverless". - :type compute: str - :param serverless_instance_type: The instance type to use for serverless compute. - :type serverless_instance_type: Optional[str] - :param input_data_override: Input data override for the job. - Used to pipe output of step into DataIndex Job in a pipeline. - :type input_data_override: Optional[Input] - :param submit_job: Whether to submit the job to the service. Default: True. - :type submit_job: bool - :return: data index job object. - :rtype: ~azure.ai.ml.entities.PipelineJob. - """ - from azure.ai.ml import MLClient - - default_name = "data_index_" + data_index.name - experiment_name = kwargs.pop("experiment_name", None) or default_name - data_index.type = AssetTypes.URI_FOLDER - - # avoid specifying auto_delete_setting in job output now - _validate_auto_delete_setting_in_data_output(data_index.auto_delete_setting) - - # block customer specified path on managed datastore - data_index.path = _validate_workspace_managed_datastore(data_index.path) - - # TODO: This is import_data behavior, not sure if it should be default for index_data, or just be documented?
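    # Editorial note: the "${{name}}" checked below is AzureML's job-output name
    # placeholder, resolved at run time; e.g. a path like
    # "azureml://datastores/workspaceblobstore/paths/indexes/" (hypothetical) becomes
    # ".../indexes/${{name}}", so each run writes under its own name rather than
    # overwriting a fixed folder.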
- if "${{name}}" not in data_index.path and "{name}" not in data_index.path: - data_index.path = data_index.path.rstrip("/") + "/${{name}}" - - index_job = index_data_func( - description=data_index.description or kwargs.pop("description", None) or default_name, - name=data_index.name or kwargs.pop("name", None), - display_name=kwargs.pop("display_name", None) or default_name, - experiment_name=experiment_name, - compute=compute, - serverless_instance_type=serverless_instance_type, - data_index=data_index, - ml_client=MLClient( - subscription_id=self._subscription_id, - resource_group_name=self._resource_group_name, - workspace_name=self._workspace_name, - credential=self._service_client._config.credential, - ), - identity=identity, - input_data_override=input_data_override, - **kwargs, - ) - index_pipeline = PipelineJob( - description=index_job.description, - tags=index_job.tags, - name=index_job.name, - display_name=index_job.display_name, - experiment_name=experiment_name, - properties=index_job.properties or {}, - settings=PipelineJobSettings(force_rerun=True, default_compute=compute), - jobs={default_name: index_job}, - ) - index_pipeline.properties["azureml.mlIndexAssetName"] = data_index.name - index_pipeline.properties["azureml.mlIndexAssetKind"] = data_index.index.type - index_pipeline.properties["azureml.mlIndexAssetSource"] = kwargs.pop("mlindex_asset_source", "Data Asset") - - if submit_job: - return self._all_operations.all_operations[AzureMLResourceType.JOB].create_or_update( - job=index_pipeline, skip_validation=True, **kwargs - ) - - return index_pipeline - - -DataOperations.index_data = index_data diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_docstore.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_docstore.py deleted file mode 100644 index 3464e1aacbc7..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_docstore.py +++ /dev/null @@ -1,91 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- -"""DocumentStore.""" -from pathlib import Path -from typing import Dict, Optional, Union - -from azure.ai.generative.index._documents import Document, StaticDocument -from azure.ai.generative.index._utils.logging import get_logger - -logger = get_logger(__name__) - - -class FileBasedDocstore: - """Simple docstore which serializes to file and loads into memory.""" - - def __init__(self, _dict: Optional[Dict[str, Document]] = None): - """Initialize with dict.""" - self._dict = _dict if _dict is not None else {} - - def add(self, texts: Dict[str, Document]) -> None: - """ - Add texts to the in-memory dictionary. - - Args: - ---- - texts: dictionary of id -> document. - - Returns: - ------- - None - """ - overlapping = set(texts).intersection(self._dict) - if overlapping: - raise ValueError(f"Tried to add ids that already exist: {overlapping}") - self._dict = {**self._dict, **texts} - - def delete(self, ids: list) -> None: - """Delete IDs from the in-memory dictionary.""" - overlapping = set(ids).intersection(self._dict) - if not overlapping: - raise ValueError(f"Tried to delete ids that do not exist: {ids}") - for _id in ids: - self._dict.pop(_id) - - def search(self, search: str) -> Union[Document, str]: - """ - Search via direct lookup. - - Args: - ---- - search: id of a document to search for. - - Returns: - ------- - Document if found, else error message.
- """ - if search not in self._dict: - return f"ID {search} not found." - else: - return self._dict[search] - - def save(self, output_path: str): - """ - Save to JSONL file. - - Args: - ---- - output_path: folder to save doctore contents in. - """ - output_path_obj = Path(output_path) - output_path_obj.mkdir(parents=True, exist_ok=True) - - with (output_path_obj / "docs.jsonl").open("w", encoding="utf-8") as f: - for doc in self._dict.values(): - json_line = doc.dumps() - f.write(json_line + "\n") - - @classmethod - def load(cls, input_path: str) -> "FileBasedDocstore": - """Load from JSONL file.""" - from fsspec.core import url_to_fs - - fs, uri = url_to_fs(input_path) - - documents: Optional[Dict[str, Document]] = {} - with fs.open(f"{input_path.rstrip('/')}/docs.jsonl") as f: - for line in f: - document = StaticDocument.loads(line.strip()) - documents[document.document_id] = document # type: ignore[index] - return cls(documents) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/__init__.py index c3538f847f09..4c27e08e1a80 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/__init__.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/__init__.py @@ -16,8 +16,8 @@ crack_documents, files_to_document_source, ) -from azure.ai.generative.index._documents.document import Document, StaticDocument from azure.ai.generative.index._utils.logging import get_logger +from azure.ai.resources._index._documents.document import Document, StaticDocument logger = get_logger(__name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/chunking.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/chunking.py index 1edb6a1aeb42..f3360060e3e9 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/chunking.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/chunking.py @@ -10,11 +10,12 @@ from functools import lru_cache from typing import Any, Iterable, Iterator, List, Optional, Sequence -from azure.ai.generative.index._documents.document import Document, DocumentSource, StaticDocument -from azure.ai.generative.index._langchain.vendor.text_splitter import TextSplitter +from azure.ai.generative.index._documents.document import DocumentSource from azure.ai.generative.index._utils import merge_dicts from azure.ai.generative.index._utils.logging import get_logger, safe_mlflow_log_metric -from azure.ai.generative.index._utils.tokens import tiktoken_cache_dir, token_length_function +from azure.ai.resources._index._langchain.vendor.text_splitter import TextSplitter +from azure.ai.resources._index._utils.tokens import tiktoken_cache_dir, token_length_function +from azure.ai.resources._index._documents.document import Document, StaticDocument logger = get_logger(__name__) @@ -68,7 +69,7 @@ def get_langchain_splitter(file_extension: str, arguments: dict) -> TextSplitter # Handle non-natural language splitters if file_extension == ".py": - from azure.ai.generative.index._langchain.vendor.text_splitter import Language, RecursiveCharacterTextSplitter + from azure.ai.resources._index._langchain.vendor.text_splitter import Language, RecursiveCharacterTextSplitter with tiktoken_cache_dir(): return RecursiveCharacterTextSplitter.from_tiktoken_encoder( **{ @@ -84,7 +85,7 @@ def get_langchain_splitter(file_extension: str, arguments: dict) -> TextSplitter # If configured to use NLTK for splitting on 
sentence boundaries use that for non-code text formats if use_nltk: _init_nltk() - from azure.ai.generative.index._langchain.vendor.text_splitter import NLTKTextSplitter + from azure.ai.resources._index._langchain.vendor.text_splitter import NLTKTextSplitter return NLTKTextSplitter( length_function=token_length_function(), @@ -97,7 +98,7 @@ def get_langchain_splitter(file_extension: str, arguments: dict) -> TextSplitter # Finally use any text format specific splitters formats_to_treat_as_txt_once_loaded = [".pdf", ".ppt", ".pptx", ".doc", ".docx", ".xls", ".xlsx"] if file_extension == ".txt" or file_extension in formats_to_treat_as_txt_once_loaded: - from azure.ai.generative.index._langchain.vendor.text_splitter import TokenTextSplitter + from azure.ai.resources._index._langchain.vendor.text_splitter import TokenTextSplitter with tiktoken_cache_dir(): return TokenTextSplitter( @@ -106,7 +107,7 @@ def get_langchain_splitter(file_extension: str, arguments: dict) -> TextSplitter **{**arguments, "disallowed_special": (), "allowed_special": "all"} ) elif file_extension == ".html" or file_extension == ".htm": - from azure.ai.generative.index._langchain.vendor.text_splitter import TokenTextSplitter + from azure.ai.resources._index._langchain.vendor.text_splitter import TokenTextSplitter logger.info("Using HTML splitter.") with tiktoken_cache_dir(): @@ -117,7 +118,7 @@ def get_langchain_splitter(file_extension: str, arguments: dict) -> TextSplitter ) elif file_extension == ".md": if use_rcts: - from azure.ai.generative.index._langchain.vendor.text_splitter import MarkdownTextSplitter + from azure.ai.resources._index._langchain.vendor.text_splitter import MarkdownTextSplitter with tiktoken_cache_dir(): return MarkdownTextSplitter.from_tiktoken_encoder( @@ -265,7 +266,7 @@ class MarkdownHeaderSplitter(TextSplitter): def __init__(self, remove_hyperlinks: bool = True, remove_images: bool = True, **kwargs: Any): """Initialize Markdown Header Splitter.""" - from azure.ai.generative.index._langchain.vendor.text_splitter import TokenTextSplitter + from azure.ai.resources._index._langchain.vendor.text_splitter import TokenTextSplitter self._remove_hyperlinks = remove_hyperlinks self._remove_images = remove_images with tiktoken_cache_dir(): diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/cracking.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/cracking.py index edc020ed77b4..8674d5840dc8 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/cracking.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/cracking.py @@ -10,9 +10,9 @@ from typing import IO, Any, Callable, Iterator, List, Optional, Tuple, Type, Union from azure.ai.generative.index._documents.chunking import ChunkedDocument, DocumentSource -from azure.ai.generative.index._documents.document import Document, StaticDocument from azure.ai.generative.index._langchain.vendor.document_loaders.unstructured import UnstructuredFileIOLoader from azure.ai.generative.index._utils.logging import get_logger, safe_mlflow_log_metric +from azure.ai.resources._index._documents.document import Document, StaticDocument logger = get_logger(__name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/document.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/document.py index 3f08113fda9c..cfc247d4e63a 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/document.py +++ 
b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/document.py @@ -2,16 +2,11 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- """Document abstraction.""" -import json -from abc import ABC, abstractmethod +from abc import ABC from dataclasses import dataclass -from typing import Any, Optional, Union +from typing import Optional from pathlib import Path -import mmh3 -from azure.ai.generative.index._utils.tokens import token_length_function - - @dataclass class DocumentSource: """Document Source.""" @@ -34,126 +29,3 @@ def get_metadata(self) -> dict: "url": self.url, "mtime": self.mtime, } - - -class Document(ABC): - """Document.""" - - document_id: str - - def __init__(self, document_id: str): - """Initialize Document.""" - self.document_id = document_id - - @abstractmethod - def modified_time(self) -> Any: - """Get the modified time of the document.""" - pass - - @abstractmethod - def load_data(self) -> str: - """Load the data of the document.""" - pass - - @abstractmethod - def get_metadata(self) -> dict: - """Get the metadata of the document.""" - pass - - @abstractmethod - def set_metadata(self, metadata: dict): - """Set the metadata of the document.""" - pass - - @property - def page_content(self) -> str: - """Get the page content of the document.""" - return self.load_data() - - @property - def metadata(self) -> dict: - """Get the metadata of the document.""" - return self.get_metadata() - - @metadata.setter - def metadata(self, value: dict): - """Set the metadata of the document.""" - self.set_metadata(value) - - @abstractmethod - def dumps(self) -> str: - """Dump the document to a json string.""" - pass - - @classmethod - @abstractmethod - def loads(cls, data: str) -> "Document": - """Load the document from a json string.""" - pass - - -class StaticDocument(Document): - """Static Document holds data in-memory.""" - - data: str - _metadata: dict - - def __init__(self, data: str, metadata: dict, document_id: Optional[str] = None, mtime=None): - """Initialize StaticDocument.""" - if document_id is None: - filename = metadata.get("source", {}).get("filename", None) - if filename is not None: - document_id = f"{filename}{metadata.get('source', {}).get('chunk_id', '')}" - else: - document_id = str(mmh3.hash128(data)) - - super().__init__(document_id) - self.data = data - self._metadata = metadata - self.mtime = mtime - - def modified_time(self) -> Any: - """Get the modified time of the document.""" - return self.mtime - - def load_data(self) -> str: - """Load the data of the document.""" - return self.data - - def get_metadata(self) -> dict: - """Get the metadata of the document.""" - # if "stats" in self._metadata: - # if "source" not in self._metadata: - # self._metadata["source"] = {} - # self._metadata["source"]["stats"] = self._metadata["stats"] - # del self._metadata["stats"] - - self._metadata = {**self._metadata, "stats": self.document_stats()} - return self._metadata - - def set_metadata(self, metadata: dict): - """Set the metadata of the document.""" - self._metadata = metadata - - def document_stats(self) -> dict: - """Get the stats of the document.""" - return { - "tiktokens": token_length_function()(self.data), - "chars": len(self.data), - "lines": len(self.data.splitlines()), - } - - def __repr__(self): - """Get the representation of the document.""" - return f"StaticDocument(id={self.document_id}, mtime={self.mtime}, metadata={self._metadata})" - - def dumps(self) -> str: - """Dump the 
document to a json string.""" - return json.dumps({"content": self.data, "metadata": self._metadata, "document_id": self.document_id}) - - @classmethod - def loads(cls, data: str) -> "StaticDocument": - """Load the document from a json string.""" - data_dict = json.loads(data) - metadata = data_dict["metadata"] - return cls(data_dict["content"], metadata, data_dict.get("document_id", metadata.get("document_id", metadata.get("id", mmh3.hash128(data_dict["content"]))))) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_embeddings/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_embeddings/__init__.py index c1d8f0049ce8..cc8fe662a705 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_embeddings/__init__.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_embeddings/__init__.py @@ -21,14 +21,15 @@ import pyarrow.parquet as pq import yaml # type: ignore[import] from azure.core.credentials import TokenCredential -from azure.ai.generative.index._documents import Document, DocumentChunksIterator, DocumentSource, StaticDocument -from azure.ai.generative.index._embeddings.openai import OpenAIEmbedder +from azure.ai.generative.index._documents import DocumentChunksIterator, DocumentSource from azure.ai.generative.index._langchain.vendor.document_loaders.base import BaseLoader -from azure.ai.generative.index._langchain.vendor.embeddings.base import Embeddings as Embedder -from azure.ai.generative.index._langchain.vendor.schema.document import Document as LangChainDocument -from azure.ai.generative.index._models import init_open_ai_from_config, parse_model_uri from azure.ai.generative.index._utils.logging import get_logger, track_activity from azure.ai.generative.index._utils.tokens import tiktoken_cache_dir +from azure.ai.resources._index._documents import Document, StaticDocument +from azure.ai.resources._index._embeddings.openai import OpenAIEmbedder +from azure.ai.resources._index._langchain.vendor.embeddings.base import Embeddings as Embedder +from azure.ai.resources._index._langchain.vendor.schema.document import Document as LangChainDocument +from azure.ai.resources._index._models import init_open_ai_from_config, parse_model_uri logger = get_logger(__name__) @@ -54,7 +55,7 @@ def get_langchain_embeddings(embedding_kind: str, arguments: dict, credential: O ) return embedder elif embedding_kind == "hugging_face": - from azure.ai.generative.index._langchain.vendor.embeddings.huggingface import HuggingFaceEmbeddings + from azure.ai.resources._index._langchain.vendor.embeddings.huggingface import HuggingFaceEmbeddings args = copy.deepcopy(arguments) @@ -1004,7 +1005,7 @@ def add_doc(doc_id, emb_doc, documents): import_faiss_or_so_help_me = dependable_faiss_import elif engine.endswith("indexes.faiss.FaissAndDocStore"): from azure.ai.generative.index._docstore import FileBasedDocstore - from azure.ai.generative.index._indexes.faiss import FaissAndDocStore, import_faiss_or_so_help_me + from azure.ai.generative.index._indexes.faiss import FaissAndDocStore, import_faiss_or_so_help_me # type: ignore[no-redef] def add_doc(doc_id, emb_doc, documents): documents.append( diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_embeddings/openai.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_embeddings/openai.py deleted file mode 100644 index efb1b982e71e..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_embeddings/openai.py +++ /dev/null @@ -1,325 +0,0 @@ -# 
--------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- -"""OpenAI Embeddings generation and management tools.""" -import os -import time -from typing import Any, Dict, List, Optional - -from azure.ai.resources.constants._common import USER_AGENT_HEADER_KEY -from azure.ai.generative._user_agent import USER_AGENT -from azure.ai.generative.index._utils.logging import get_logger -from packaging import version - -logger = get_logger("embeddings.openai") - - -class OpenAIEmbedder: - """OpenAI Embedding client wrapper with retries.""" - - def __init__( - self, - api_base: str, - api_type: str, - api_version: Optional[str] = None, - api_key: Optional[str] = None, - azure_credential: Optional[Any] = None, - model: str = "text-embedding-ada-002", - deployment: Optional[str] = None, - batch_size: Optional[int] = None, - max_retries: Optional[int] = None, - embedding_ctx_length: Optional[int] = None, - show_progress_bar: bool = False, - openai_passthrough_args: Optional[dict] = None, - ): - """Initialize an OpenAI Embedding client.""" - self.api_base = api_base - self.api_type = api_type - self.api_key = api_key or os.getenv("AZURE_OPENAI_KEY") or "" - # TODO: If azure_credential set, check api_type is azure or azure_ad and setup auth accordingly - self.azure_credential = azure_credential - - if batch_size is None and "azure" in self.api_type: - batch_size = 16 - elif batch_size is None: - batch_size = 1000 - self.batch_size = int(batch_size) - self._dynamic_batch_size: Optional[int] = None - - if max_retries is None: - max_retries = 10 - self.max_retries = max_retries - - if model is None: - model = "text-embedding-ada-002" - self.model = model - - if "azure" in self.api_type and deployment is None: - raise ValueError("Azure OpenAI requires a deployment name.") - self.deployment = deployment - - if embedding_ctx_length is None: - embedding_ctx_length = 8191 - self.embedding_ctx_length = embedding_ctx_length - - self.show_progress_bar = show_progress_bar - self.openai_passthrough_args = openai_passthrough_args or {} - - try: - import openai - except ImportError as e: - raise ImportError("Please install openai via `pip install openai`") from e - - if version.parse(openai.version.VERSION) >= version.parse("1.0.0"): - self.openai_v1plus = True - self.api_version = api_version if api_version else "2023-05-15" - - if "azure" in self.api_type: - client = openai.AzureOpenAI( - api_key=self.api_key, - api_version=self.api_version, - azure_endpoint=self.api_base, - azure_deployment=self.deployment, - default_headers={USER_AGENT_HEADER_KEY: USER_AGENT}, - ) - else: - client = openai.OpenAI( - api_key=self.api_key, - base_url=self.api_base, - default_headers={USER_AGENT_HEADER_KEY: USER_AGENT}, - ) - - self.embedding_client = client.embeddings - - self._params = { - "model": self.model, - **self.openai_passthrough_args, - } - self._retry_exceptions = [ - openai._exceptions.APIStatusError, - openai._exceptions.APITimeoutError, - openai._exceptions.APIError, - openai._exceptions.APIConnectionError, - openai._exceptions.RateLimitError, - openai._exceptions.InternalServerError, - openai._exceptions.APIResponseValidationError, - ] - self._RateLimitError = openai._exceptions.RateLimitError - else: - self.openai_v1plus = False - self.api_version = api_version if api_version else "2023-03-15-preview" - self.embedding_client = openai.Embeddings - self._params = { - "model": self.model, - 
"api_base": self.api_base, - "api_type": self.api_type, - "api_version": self.api_version, - "api_key": self.api_key, - **self.openai_passthrough_args, - } - if self.deployment is not None: - self._params["engine"] = self.deployment - self._retry_exceptions = [ - openai.error.Timeout, - openai.error.APIError, - openai.error.APIConnectionError, - openai.error.RateLimitError, - openai.error.ServiceUnavailableError, - ] - self._RateLimitError = openai.error.RateLimitError - - self._statistics = { - "num_retries": 0, - "time_spent_sleeping": 0, - "num_tokens": 0, - } - - @property - def _openai_client_params(self) -> dict: - return self._params - - @property - def _retryable_openai_errors(self) -> List[Exception]: - return self._retry_exceptions - - def _dynamic_batch_size_embed_request(self, tokenized_texts: List[List[int]], **kwargs) -> dict: - try: - if self._dynamic_batch_size is None: - return self._embed_request(tokenized_texts=tokenized_texts, **kwargs) - else: - embedding_response: Dict[str, List] = {"data": []} - for i in range(0, len(tokenized_texts), self._dynamic_batch_size): - embedding_response["data"].extend( - self._embed_request( - tokenized_texts=tokenized_texts[i : i + self._dynamic_batch_size], **kwargs - )["data"] - ) - except Exception as e: - err_msg = str(e) - if "Too many inputs" not in err_msg: - raise - - import re - match = re.match(r".*The max number of inputs is ([0-9]+).*", err_msg) - if match and match.group(1): - try: - self._dynamic_batch_size = int(match.group(1)) - except Exception: - logger.error( - "Failed to parse max number of inputs from error message, falling back to batch_size=1." - ) - self._dynamic_batch_size = 1 - logger.warning(f"Reducing batch_size to {self._dynamic_batch_size} and retrying.") - embedding_response: Dict[str, List] = {"data": []} # type: ignore[no-redef] - for i in range(0, len(tokenized_texts), self._dynamic_batch_size): - embedding_response["data"].extend( - self._embed_request( - tokenized_texts=tokenized_texts[i : i + self._dynamic_batch_size], **kwargs - )["data"] - ) - else: - raise - - return embedding_response - - def _embed_request(self, tokenized_texts: List[List[int]], **kwargs) -> dict: - try: - total_delay = 0 - last_exception = None - for retry in range(self.max_retries): - logger.info(f"Attempt {retry} to embed {len(tokenized_texts)} documents.") - try: - response = self.embedding_client.create( - input=tokenized_texts, - **kwargs, - ) - if self.openai_v1plus: - response = {"object": "list", "data": [{"object": "embedding", "embedding": d.embedding} for d in response.data]} - return response - except Exception as e: - err_msg = str(e) - logger.warning(f"Error embedding: {err_msg}", exc_info=e) - last_exception = e - retrying = False - for retryable_error in self._retryable_openai_errors: - if isinstance(e, type(retryable_error)): - retrying = True - - # Retry with retry-after if found in RateLimitError - if isinstance(e, self._RateLimitError): - logger.warning(f"Retrying error type {type(e)}.") - response_headers = e.headers if hasattr(e, "headers") else {} - if "Retry-After" in response_headers: - delay = int(response_headers["Retry-After"]) - logger.warning(f"OpenAI throws RateLimitError with Retry-After {delay} seconds.") - else: - # Wait for 1 minute as suggested by openai https://help.openai.com/en/articles/6897202-ratelimiterror - logger.warning("Retry after 60 seconds.") - delay = 60 - total_delay += delay - logger.warning(f"Sleeping for {delay} seconds before retrying.") - time.sleep(delay) - break - - if 
not retrying: - break - finally: - self._statistics["num_retries"] += retry - self._statistics["time_spent_sleeping"] += total_delay - - err_msg = f"Failed to embed {len(tokenized_texts)} documents after {total_delay}s and {retry} retries. {last_exception}" - logger.error(err_msg) # TODO: Add custom dimensions - raise RuntimeError(err_msg) - - def _embed(self, texts: List[str]) -> List[List[float]]: - """Embed the given texts.""" - import numpy as np - import tiktoken - - try: - encoding = tiktoken.encoding_for_model(self.model) - except KeyError: - logger.warning("Warning: model not found. Using cl100k_base encoding.") - model = "cl100k_base" - encoding = tiktoken.get_encoding(model) - - tokenized_texts = [] - num_tokens = 0 - tokenized_texts_to_original_texts_indices = [] - for i, text in enumerate(texts): - if self.model.endswith("001"): - # Replace newlines, which can negatively affect performance. - # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 - text = text.replace("\n", " ") - - tokens = encoding.encode( - text, - # TODO: Does this need to be configurable? Our use cases treat all text as raw data. - disallowed_special=(), - ) - # Text longer than a model's context length can be split and the embeddings averaged to approximate full text - # See: https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb - for j in range(0, len(tokens), self.embedding_ctx_length): - tokenized_texts.append(tokens[j : j + self.embedding_ctx_length]) - num_tokens += len(tokenized_texts[-1]) - tokenized_texts_to_original_texts_indices.append(i) - - self._statistics["num_tokens"] += num_tokens - - if self.show_progress_bar: - try: - import tqdm - - _iter = tqdm.tqdm(range(0, len(tokenized_texts), self.batch_size)) - except ImportError: - _iter = range(0, len(tokenized_texts), self.batch_size) - else: - _iter = range(0, len(tokenized_texts), self.batch_size) - - batched_embeddings: List[List[float]] = [] - for i in _iter: - response = self._dynamic_batch_size_embed_request( - tokenized_texts=tokenized_texts[i : i + self.batch_size], - **self._openai_client_params, - ) - batched_embeddings.extend(r["embedding"] for r in response["data"]) - - embedding_results: List[List[List[float]]] = [[] for _ in range(len(texts))] - num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))] - for i in range(len(tokenized_texts_to_original_texts_indices)): - embedding_results[tokenized_texts_to_original_texts_indices[i]].append(batched_embeddings[i]) - num_tokens_in_batch[tokenized_texts_to_original_texts_indices[i]].append(len(tokenized_texts[i])) - - embeddings: List[List[float]] = [[] for _ in range(len(texts))] - for i in range(len(texts)): - _result = embedding_results[i] - if len(_result) == 0: - average = self._embed_request(tokenized_texts="", **self._openai_client_params)["data"][0]["embedding"] # type: ignore[arg-type] - else: - average = np.average(_result, axis=0, weights=num_tokens_in_batch[i]) - embeddings[i] = (average / np.linalg.norm(average)).tolist() - - return embeddings - - def embed_documents(self, documents: List[str]) -> List[List[float]]: - """Batch embed documents.""" - return self._embed(documents) - - def embed_query(self, query: str) -> List[float]: - """Embed a single query.""" - return self.embed_documents([query])[0] - - # # TODO: _aembed - # async def aembed_documents(self, documents: List[str]) -> List[List[float]]: - # """Batch embed documents.""" - # return await self._aembed(documents) - - # async def 
aembed_query(self, query: str) -> List[float]: - # """Embed a single query.""" - # embeddings = await self.aembed_documents([query]) - # return embeddings[0] - - @property - def statistics(self) -> Dict[str, Any]: - """Return statistics about the last embedding request.""" - return self._statistics diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_functions.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_functions.py index 0690940f7a3a..7b8f97ac5356 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_functions.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_functions.py @@ -36,4 +36,4 @@ def get_native_index_client_from_index( path: Optional[Union[str, Path]], credential: Optional[TokenCredential] = None, ): - return DataplaneMLIndex(path).as_native_index_client(credential=credential) + return DataplaneMLIndex(path).as_native_index_client(credential=credential) \ No newline at end of file diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/__init__.py deleted file mode 100644 index 624f5ee88ecf..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- - - -__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/azure_search.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/azure_search.py deleted file mode 100644 index 94f448c5763f..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/azure_search.py +++ /dev/null @@ -1,21 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- -"""Azure Cognitive Search based Vector Index.""" -from types import ModuleType - -from azure.ai.generative.index._utils.logging import get_logger, version - -logger = get_logger("indexes.azure_search") - - -def import_azure_search_or_so_help_me() -> ModuleType: - """Import azure-search-documents if available, otherwise raise error.""" - try: - import azure.search.documents as azure_search_documents - except ImportError as e: - raise ImportError( - "Could not import azure-search-documents python package. " - f"Please install it with `pip install azure-ai-generative[cognitive_search]=={version}`" - ) from e - return azure_search_documents diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/faiss.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/faiss.py deleted file mode 100644 index 301c1d7a9402..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/faiss.py +++ /dev/null @@ -1,187 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. 
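# Editorial note, inferred from the save()/load() methods further below: a persisted
# FaissAndDocStore folder holds three artifacts:
#   index.faiss           - the serialized faiss index
#   docstore/docs.jsonl   - one serialized Document per line (FileBasedDocstore)
#   index_to_doc_id.json  - mapping of faiss row number -> document_id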
-# --------------------------------------------------------- -"""Faiss based Vector Index using a file based DocumentStore.""" -import json -import os -from pathlib import Path -from types import ModuleType -from typing import Any, Callable, Dict, List, Tuple - -import numpy as np -from azure.ai.generative.index._docstore import FileBasedDocstore -from azure.ai.generative.index._documents import Document -from azure.ai.generative.index._utils.logging import get_logger - -logger = get_logger("indexes.faiss") - - -def import_faiss_or_so_help_me() -> ModuleType: - """Import faiss if available, otherwise raise error.""" - try: - if os.getenv("FAISS_NO_AVX2", "false").lower() == "true": - from faiss import swigfaiss as faiss - else: - import faiss - except ImportError as e: - raise ImportError( - "Could not import faiss python package. " - "Please install it with `pip install faiss-gpu` (for CUDA supported GPU) " - "or `pip install faiss-cpu` (depending on Python version)." - ) from e - return faiss - - -class FaissAndDocStore: - """Faiss based VectorStore using a file based DocumentStore.""" - - docstore: FileBasedDocstore - index: Any - query_embed: Callable[[str], List[float]] - index_to_doc_id: Dict[str, str] - - def __init__( - self, - query_embed: Callable[[str], List[float]], - index: Any, - docstore: FileBasedDocstore, - index_to_doc_id: Dict[str, str] - ): - """Initialize FaissAndDocStore.""" - self.query_embed = query_embed - self.index = index - self.docstore = docstore - self.index_to_doc_id = index_to_doc_id - - def similarity_search_with_score_by_vector( - self, - embedding: List[float], - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """ - Return docs most similar to the embedding vector. - - Args: - ---- - embedding: Embedding vector to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - kwargs: kwargs to be passed to similarity search. Can include: - score_threshold: Optional, a floating point value between 0 and 1 to - filter the resulting set of retrieved docs - - Returns: - ------- - List of documents most similar to the query text and L2 distance - in float for each. Lower score represents more similarity. - """ - vector = np.array([embedding], dtype=np.float32) - scores, indices = self.index.search(vector, k) - docs = [] - for j, i in enumerate(indices[0]): - if i == -1: - # This happens when not enough docs are returned. - continue - _id = self.index_to_doc_id[str(i)] - doc = self.docstore.search(_id) - if not isinstance(doc, Document): - raise ValueError(f"Could not find document for id {_id}, got {doc}") - docs.append((doc, scores[0][j])) - - score_threshold = kwargs.get("score_threshold") - if score_threshold is not None: - docs = [ - (doc, similarity) - for doc, similarity in docs - if similarity > score_threshold - ] - return docs[:k] - - def similarity_search_with_score(self, query: str, k: int = 8, **kwargs: Any) -> List[Tuple[Document, float]]: - """ - Return docs most similar to query. - - Args: - ---- - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 8. - - Returns: - ------- - List of documents most similar to the query text with - L2 distance in float. Lower score represents more similarity.
- """ - embedding = self.query_embed(query) - docs = self.similarity_search_with_score_by_vector(embedding, k, **kwargs) - return docs - - def similarity_search_by_vector(self, embedding: List[float], k: int = 8, **kwargs) -> List[Document]: - """ - Return docs most similar to embedding vector. - - Args: - ---- - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. - - Returns: - ------- - List of Documents most similar to the embedding. - """ - docs_and_scores = self.similarity_search_with_score_by_vector(embedding, k, **kwargs) - return [doc for doc, _ in docs_and_scores] - - def similarity_search(self, query: str, k: int = 8, **kwargs) -> List[Document]: - """ - Return docs most similar to query. - - Args: - ---- - query: Text to look up documents similar to. - k: Number of Documents to return. - - Returns: - ------- - List of Documents most similar to the query. - """ - docs_and_scores = self.similarity_search_with_score(query, k, **kwargs) - return [doc for doc, _ in docs_and_scores] - - def save(self, output_path: str): - """Write index and docstore to output_path.""" - output_path_obj = Path(output_path) - output_path_obj.mkdir(exist_ok=True, parents=True) - - faiss = import_faiss_or_so_help_me() - faiss.write_index(self.index, str(output_path_obj / "index.faiss")) - - self.docstore.save(str(output_path_obj / "docstore")) - - with (output_path_obj / "index_to_doc_id.json").open("w") as f: - json.dump(self.index_to_doc_id, f) - - def save_local(self, output_path: str): - """Same as save, alias to match langchain.vectorstores.FAISS.""" - return self.save(output_path) - - @classmethod - def load(cls, input_path: str, query_embed: Callable[[str], List[float]]) -> "FaissAndDocStore": - """Read index and docstore from input_path.""" - import tempfile - - from fsspec.core import url_to_fs - - logger.info(f"Loading FaissAndDocStore from: {input_path}") - fs, uri = url_to_fs(input_path) - - with tempfile.TemporaryDirectory() as tmpdir: - fs.download(f"{uri.rstrip('/')}/index.faiss", str(tmpdir)) - faiss = import_faiss_or_so_help_me() - index = faiss.read_index(f"{tmpdir.rstrip('/')}/index.faiss") - - with fs.open(f"{uri.rstrip('/')}/index_to_doc_id.json", "r") as f: - index_to_doc_id = json.load(f) - - docstore = FileBasedDocstore.load(f"{input_path.rstrip('/')}/docstore") - - return cls(query_embed, index, docstore, index_to_doc_id) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/acs.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/acs.py index 543ba1c211f2..7141bd822a55 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/acs.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/acs.py @@ -7,16 +7,16 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple from azure.ai.generative.index._utils.logging import get_logger -from azure.ai.generative.index._utils.requests import send_post_request +from azure.ai.resources._index._utils.requests import send_post_request try: from langchain.schema.document import Document from langchain.schema.embeddings import Embeddings from langchain.schema.vectorstore import VectorStore except ImportError: - from azure.ai.generative.index._langchain.vendor.embeddings.base import Embeddings - from azure.ai.generative.index._langchain.vendor.schema.document import Document - from azure.ai.generative.index._langchain.vendor.vectorstores.base import VectorStore + from azure.ai.resources._index._langchain.vendor.embeddings.base 
import Embeddings + from azure.ai.resources._index._langchain.vendor.schema.document import Document + from azure.ai.resources._index._langchain.vendor.vectorstores.base import VectorStore logger = get_logger("langchain.acs") diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/docstore.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/docstore.py index 5682b44555ae..6604dec8bd49 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/docstore.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/docstore.py @@ -4,9 +4,9 @@ """Langchain compatible Docstore which serializes to jsonl.""" from typing import Dict, Union -from azure.ai.generative.index._docstore import FileBasedDocstore from azure.ai.generative.index._embeddings import WrappedLangChainDocument -from azure.ai.generative.index._documents import Document +from azure.ai.resources._index._documents import Document +from azure.ai.resources._index._docstore import FileBasedDocstore from langchain.docstore.base import AddableMixin, Docstore from langchain.docstore.document import Document as LangChainDocument @@ -15,7 +15,7 @@ class FileBasedDocStore(Docstore, AddableMixin): """Simple docstore which serializes to file and loads into memory.""" def __init__(self, docstore: FileBasedDocstore): - """Initialize with azure.ai.generative.index._docstore.FileBasedDocstore.""" + """Initialize with azure.ai.resources._index._docstore.FileBasedDocstore.""" self.docstore = docstore def add(self, texts: Dict[str, LangChainDocument]) -> None: diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/faiss.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/faiss.py index d94590722a4e..dc8025e48a78 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/faiss.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/faiss.py @@ -2,9 +2,9 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- """Faiss based VectorStore using a file based DocumentStore.""" -from azure.ai.generative.index._indexes.faiss import FaissAndDocStore from azure.ai.generative.index._langchain.docstore import FileBasedDocStore from azure.ai.generative.index._utils.logging import get_logger +from azure.ai.resources._index._indexes.faiss import FaissAndDocStore from langchain.vectorstores import FAISS from langchain.vectorstores.base import VectorStore diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/openai.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/openai.py index ce4e9e340788..34bfddcfd02f 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/openai.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/openai.py @@ -8,7 +8,7 @@ def patch_openai_embedding_retries(logger, activity_logger, max_seconds_retrying """Patch the openai embedding to retry on failure."".""" from datetime import datetime - from azure.ai.generative.index._langchain.vendor.embeddings import openai as langchain_openai + from azure.ai.resources._index._langchain.vendor.embeddings import openai as langchain_openai from tenacity import ( retry, retry_if_exception_type, diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/docstore/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/docstore/__init__.py deleted file mode 100644 index 624f5ee88ecf..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/docstore/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- - - -__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/docstore/base.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/docstore/base.py deleted file mode 100644 index c054c26cfc69..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/docstore/base.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Interface to access to place that stores documents.""" -from abc import ABC, abstractmethod -from typing import Dict, Union - -from azure.ai.generative.index._langchain.vendor.schema.document import Document - - -class Docstore(ABC): - """Interface to access to place that stores documents.""" - - @abstractmethod - def search(self, search: str) -> Union[str, Document]: - """Search for document. - - If page exists, return the page summary, and a Document object. - If page does not exist, return similar entries. 
- """ - - -class AddableMixin(ABC): - """Mixin class that supports adding texts.""" - - @abstractmethod - def add(self, texts: Dict[str, Document]) -> None: - """Add more documents.""" diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/base.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/base.py index 5bb2836af8a8..7af9218aa109 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/base.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/base.py @@ -5,8 +5,8 @@ from abc import ABC, abstractmethod from typing import Iterator, List, Optional -from azure.ai.generative.index._langchain.vendor.schema.document import Document -from azure.ai.generative.index._langchain.vendor.text_splitter import RecursiveCharacterTextSplitter, TextSplitter +from azure.ai.resources._index._langchain.vendor.schema.document import Document +from azure.ai.resources._index._langchain.vendor.text_splitter import RecursiveCharacterTextSplitter, TextSplitter class BaseLoader(ABC): diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/unstructured.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/unstructured.py index bd507e0517b1..002047dae436 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/unstructured.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/unstructured.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from typing import IO, Any, Callable, Dict, List, Optional, Sequence, Union -from azure.ai.generative.index._langchain.vendor.schema.document import Document +from azure.ai.resources._index._langchain.vendor.schema.document import Document from azure.ai.generative.index._langchain.vendor.document_loaders.base import BaseLoader diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/__init__.py deleted file mode 100644 index 624f5ee88ecf..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- - - -__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/base.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/base.py deleted file mode 100644 index e9996c3c3f0f..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/base.py +++ /dev/null @@ -1,25 +0,0 @@ -# This file has been copied as is. 
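The `Docstore`/`AddableMixin` pair deleted above is a deliberately small interface: `search` returns either the stored `Document` or a "not found" string, and `add` takes a mapping of ids to documents. A minimal in-memory sketch of that contract (the `InMemoryDocstore` name is hypothetical, and rejecting duplicate ids is one reasonable policy mirroring langchain's own in-memory docstore; the SDK's real implementation is the file-backed `FileBasedDocstore`):

```python
from typing import Dict, Union

from azure.ai.resources._index._langchain.vendor.schema.document import Document


class InMemoryDocstore:
    """Hypothetical minimal implementation of Docstore + AddableMixin."""

    def __init__(self) -> None:
        self._docs: Dict[str, Document] = {}

    def add(self, texts: Dict[str, Document]) -> None:
        overlapping = set(texts).intersection(self._docs)
        if overlapping:
            raise ValueError(f"Tried to add ids that already exist: {overlapping}")
        self._docs.update(texts)

    def search(self, search: str) -> Union[str, Document]:
        # Return the Document if the id exists, otherwise a "not found"
        # message, matching the Docstore.search docstring above.
        return self._docs.get(search, f"ID {search} not found.")
```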
-# Last Sync: 2023-08-24 -# Commit: 3e5cda3405ec1aa369fe90253d88f3e26a03db10 -from abc import ABC, abstractmethod -from typing import List - - -class Embeddings(ABC): - """Interface for embedding models.""" - - @abstractmethod - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Embed search docs.""" - - @abstractmethod - def embed_query(self, text: str) -> List[float]: - """Embed query text.""" - - async def aembed_documents(self, texts: List[str]) -> List[List[float]]: - """Asynchronous Embed search docs.""" - raise NotImplementedError - - async def aembed_query(self, text: str) -> List[float]: - """Asynchronous Embed query text.""" - raise NotImplementedError diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/huggingface.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/huggingface.py deleted file mode 100644 index 9495b815e7cc..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/huggingface.py +++ /dev/null @@ -1,256 +0,0 @@ -# This file has been copied as is. -# Last Sync: 2023-08-24 -# Commit: 3e5cda3405ec1aa369fe90253d88f3e26a03db10 -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional - -from azure.ai.generative.index._langchain.vendor.embeddings.base import Embeddings - -DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" -DEFAULT_INSTRUCT_MODEL = "hkunlp/instructor-large" -DEFAULT_BGE_MODEL = "BAAI/bge-large-en" -DEFAULT_EMBED_INSTRUCTION = "Represent the document for retrieval: " -DEFAULT_QUERY_INSTRUCTION = ( - "Represent the question for retrieving supporting documents: " -) -DEFAULT_QUERY_BGE_INSTRUCTION_EN = ( - "Represent this question for searching relevant passages: " -) -DEFAULT_QUERY_BGE_INSTRUCTION_ZH = "为这个句子生成表示以用于检索相关文章:" - - -@dataclass -class HuggingFaceEmbeddings(Embeddings): - """HuggingFace sentence_transformers embedding models. - - To use, you should have the ``sentence_transformers`` python package installed. - - Example: - .. code-block:: python - - from langchain.embeddings import HuggingFaceEmbeddings - - model_name = "sentence-transformers/all-mpnet-base-v2" - model_kwargs = {'device': 'cpu'} - encode_kwargs = {'normalize_embeddings': False} - hf = HuggingFaceEmbeddings( - model_name=model_name, - model_kwargs=model_kwargs, - encode_kwargs=encode_kwargs - ) - """ - - client: Any = field(init=False) #: :meta private: - model_name: str = DEFAULT_MODEL_NAME - """Model name to use.""" - cache_folder: Optional[str] = None - """Path to store models. - Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable.""" - model_kwargs: Dict[str, Any] = field(default_factory=dict) - """Key word arguments to pass to the model.""" - encode_kwargs: Dict[str, Any] = field(default_factory=dict) - """Key word arguments to pass when calling the `encode` method of the model.""" - multi_process: bool = False - """Run encode() on multiple GPUs.""" - - def __post_init__(self, **kwargs: Any): - """Initialize the sentence_transformer.""" - try: - import sentence_transformers - - except ImportError as exc: - raise ImportError( - "Could not import sentence_transformers python package. " - "Please install it with `pip install sentence_transformers`." 
- ) from exc - - self.client = sentence_transformers.SentenceTransformer( - self.model_name, cache_folder=self.cache_folder, **self.model_kwargs - ) - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Compute doc embeddings using a HuggingFace transformer model. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - import sentence_transformers - - texts = list(map(lambda x: x.replace("\n", " "), texts)) - if self.multi_process: - pool = self.client.start_multi_process_pool() - embeddings = self.client.encode_multi_process(texts, pool) - sentence_transformers.SentenceTransformer.stop_multi_process_pool(pool) - else: - embeddings = self.client.encode(texts, **self.encode_kwargs) - - return embeddings.tolist() - - def embed_query(self, text: str) -> List[float]: - """Compute query embeddings using a HuggingFace transformer model. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - return self.embed_documents([text])[0] - - -@dataclass -class HuggingFaceInstructEmbeddings(Embeddings): - """Wrapper around sentence_transformers embedding models. - - To use, you should have the ``sentence_transformers`` - and ``InstructorEmbedding`` python packages installed. - - Example: - .. code-block:: python - - from langchain.embeddings import HuggingFaceInstructEmbeddings - - model_name = "hkunlp/instructor-large" - model_kwargs = {'device': 'cpu'} - encode_kwargs = {'normalize_embeddings': True} - hf = HuggingFaceInstructEmbeddings( - model_name=model_name, - model_kwargs=model_kwargs, - encode_kwargs=encode_kwargs - ) - """ - - client: Any = field(init=False) #: :meta private: - model_name: str = DEFAULT_INSTRUCT_MODEL - """Model name to use.""" - cache_folder: Optional[str] = None - """Path to store models. - Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable.""" - model_kwargs: Dict[str, Any] = field(default_factory=dict) - """Key word arguments to pass to the model.""" - encode_kwargs: Dict[str, Any] = field(default_factory=dict) - """Key word arguments to pass when calling the `encode` method of the model.""" - embed_instruction: str = DEFAULT_EMBED_INSTRUCTION - """Instruction to use for embedding documents.""" - query_instruction: str = DEFAULT_QUERY_INSTRUCTION - """Instruction to use for embedding query.""" - - def __post_init__(self, **kwargs: Any): - """Initialize the sentence_transformer.""" - try: - from InstructorEmbedding import INSTRUCTOR - - self.client = INSTRUCTOR( - self.model_name, cache_folder=self.cache_folder, **self.model_kwargs - ) - except ImportError as e: - raise ImportError("Dependencies for InstructorEmbedding not found.") from e - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Compute doc embeddings using a HuggingFace instruct model. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - instruction_pairs = [[self.embed_instruction, text] for text in texts] - embeddings = self.client.encode(instruction_pairs, **self.encode_kwargs) - return embeddings.tolist() - - def embed_query(self, text: str) -> List[float]: - """Compute query embeddings using a HuggingFace instruct model. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. 
- """ - instruction_pair = [self.query_instruction, text] - embedding = self.client.encode([instruction_pair], **self.encode_kwargs)[0] - return embedding.tolist() - - -@dataclass -class HuggingFaceBgeEmbeddings(Embeddings): - """HuggingFace BGE sentence_transformers embedding models. - - To use, you should have the ``sentence_transformers`` python package installed. - - Example: - .. code-block:: python - - from langchain.embeddings import HuggingFaceBgeEmbeddings - - model_name = "BAAI/bge-large-en" - model_kwargs = {'device': 'cpu'} - encode_kwargs = {'normalize_embeddings': True} - hf = HuggingFaceBgeEmbeddings( - model_name=model_name, - model_kwargs=model_kwargs, - encode_kwargs=encode_kwargs - ) - """ - - client: Any = field(init=False) #: :meta private: - model_name: str = DEFAULT_BGE_MODEL - """Model name to use.""" - cache_folder: Optional[str] = None - """Path to store models. - Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable.""" - model_kwargs: Dict[str, Any] = field(default_factory=dict) - """Key word arguments to pass to the model.""" - encode_kwargs: Dict[str, Any] = field(default_factory=dict) - """Key word arguments to pass when calling the `encode` method of the model.""" - query_instruction: str = DEFAULT_QUERY_BGE_INSTRUCTION_EN - """Instruction to use for embedding query.""" - - def __post_init__(self, **kwargs: Any): - """Initialize the sentence_transformer.""" - try: - import sentence_transformers - - except ImportError as exc: - raise ImportError( - "Could not import sentence_transformers python package. " - "Please install it with `pip install sentence_transformers`." - ) from exc - - self.client = sentence_transformers.SentenceTransformer( - self.model_name, cache_folder=self.cache_folder, **self.model_kwargs - ) - if "-zh" in self.model_name: - self.query_instruction = DEFAULT_QUERY_BGE_INSTRUCTION_ZH - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Compute doc embeddings using a HuggingFace transformer model. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - texts = [t.replace("\n", " ") for t in texts] - embeddings = self.client.encode(texts, **self.encode_kwargs) - return embeddings.tolist() - - def embed_query(self, text: str) -> List[float]: - """Compute query embeddings using a HuggingFace transformer model. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - text = text.replace("\n", " ") - embedding = self.client.encode( - self.query_instruction + text, **self.encode_kwargs - ) - return embedding.tolist() diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/openai.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/openai.py deleted file mode 100644 index 2d1446426017..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/openai.py +++ /dev/null @@ -1,439 +0,0 @@ -# This file has been slightly modified to not rely on Pydantic. 
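One detail of the `HuggingFaceBgeEmbeddings` class deleted above is easy to miss: the BGE query instruction is prepended only in `embed_query`, never to documents, and the Chinese instruction is swapped in automatically for `-zh` model names. A usage sketch of that convention with `sentence-transformers` directly, reusing the defaults from the deleted code (the model is downloaded on first use):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-large-en")
instruction = "Represent this question for searching relevant passages: "

# Documents are encoded as-is; only the query gets the instruction prefix.
doc_vectors = model.encode(
    ["faiss is a vector index", "the docstore holds the raw text"],
    normalize_embeddings=True,
)
query_vector = model.encode(instruction + "what is faiss?", normalize_embeddings=True)
print(doc_vectors.shape, query_vector.shape)
```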
-# Last Sync: 2023-08-24 -# Commit: 3e5cda3405ec1aa369fe90253d88f3e26a03db10 -from __future__ import annotations - -import logging -from dataclasses import dataclass, field -from typing import ( - Any, - Callable, - Dict, - List, - Literal, - Optional, - Sequence, - Set, - Tuple, - Union, -) - -import openai -import numpy as np -from tenacity import ( - AsyncRetrying, - before_sleep_log, - retry, - retry_if_exception_type, - stop_after_attempt, - wait_exponential, -) - -from azure.ai.generative.index._langchain.vendor.embeddings.base import Embeddings - -logger = logging.getLogger(__name__) - - -def _create_retry_decorator(embeddings: OpenAIEmbeddings) -> Callable[[Any], Any]: - import openai - - min_seconds = 4 - max_seconds = 10 - # Wait 2^x * 1 second between each retry starting with - # 4 seconds, then up to 10 seconds, then 10 seconds afterwards - return retry( - reraise=True, - stop=stop_after_attempt(embeddings.max_retries), - wait=wait_exponential(multiplier=1, min=min_seconds, max=max_seconds), - retry=( - retry_if_exception_type(openai.error.Timeout) - | retry_if_exception_type(openai.error.APIError) - | retry_if_exception_type(openai.error.APIConnectionError) - | retry_if_exception_type(openai.error.RateLimitError) - | retry_if_exception_type(openai.error.ServiceUnavailableError) - ), - before_sleep=before_sleep_log(logger, logging.WARNING), - ) - - -def _async_retry_decorator(embeddings: OpenAIEmbeddings) -> Any: - import openai - - min_seconds = 4 - max_seconds = 10 - # Wait 2^x * 1 second between each retry starting with - # 4 seconds, then up to 10 seconds, then 10 seconds afterwards - async_retrying = AsyncRetrying( - reraise=True, - stop=stop_after_attempt(embeddings.max_retries), - wait=wait_exponential(multiplier=1, min=min_seconds, max=max_seconds), - retry=( - retry_if_exception_type(openai.error.Timeout) - | retry_if_exception_type(openai.error.APIError) - | retry_if_exception_type(openai.error.APIConnectionError) - | retry_if_exception_type(openai.error.RateLimitError) - | retry_if_exception_type(openai.error.ServiceUnavailableError) - ), - before_sleep=before_sleep_log(logger, logging.WARNING), - ) - - def wrap(func: Callable) -> Callable: - async def wrapped_f(*args: Any, **kwargs: Any) -> Callable: - async for _ in async_retrying: - return await func(*args, **kwargs) - raise AssertionError("this is unreachable") - - return wrapped_f - - return wrap - - -# https://stackoverflow.com/questions/76469415/getting-embeddings-of-length-1-from-langchain-openaiembeddings -def _check_response(response: dict) -> dict: - if any(len(d["embedding"]) == 1 for d in response["data"]): - import openai - - raise openai.error.APIError("OpenAI API returned an empty embedding") - return response - - -def embed_with_retry(embeddings: OpenAIEmbeddings, **kwargs: Any) -> Any: - """Use tenacity to retry the embedding call.""" - retry_decorator = _create_retry_decorator(embeddings) - - @retry_decorator - def _embed_with_retry(**kwargs: Any) -> Any: - response = embeddings.client.create(**kwargs) - return _check_response(response) - - return _embed_with_retry(**kwargs) - - -async def async_embed_with_retry(embeddings: OpenAIEmbeddings, **kwargs: Any) -> Any: - """Use tenacity to retry the embedding call.""" - - @_async_retry_decorator(embeddings) - async def _async_embed_with_retry(**kwargs: Any) -> Any: - response = await embeddings.client.acreate(**kwargs) - return _check_response(response) - - return await _async_embed_with_retry(**kwargs) - - -@dataclass -class 
OpenAIEmbeddings(Embeddings): - """OpenAI embedding models. - - To use, you should have the ``openai`` python package installed, and the - environment variable ``OPENAI_API_KEY`` set with your API key or pass it - as a named parameter to the constructor. - - Example: - .. code-block:: python - - from langchain.embeddings import OpenAIEmbeddings - openai = OpenAIEmbeddings(openai_api_key="my-api-key") - - In order to use the library with Microsoft Azure endpoints, you need to set - the OPENAI_API_TYPE, OPENAI_API_BASE, OPENAI_API_KEY and OPENAI_API_VERSION. - The OPENAI_API_TYPE must be set to 'azure' and the others correspond to - the properties of your endpoint. - In addition, the deployment name must be passed as the model parameter. - - Example: - .. code-block:: python - - import os - os.environ["OPENAI_API_TYPE"] = "azure" - os.environ["OPENAI_API_BASE"] = "https:// Dict: - openai_args = { - "model": self.model, - "request_timeout": self.request_timeout, - "headers": self.headers, - "api_key": self.openai_api_key, - "organization": self.openai_organization, - "api_base": self.openai_api_base, - "api_type": self.openai_api_type, - "api_version": self.openai_api_version, - **self.model_kwargs, - } - if self.openai_api_type in ("azure", "azure_ad", "azuread"): - openai_args["engine"] = self.deployment - if self.openai_proxy: - try: - import openai - except ImportError: - raise ImportError( - "Could not import openai python package. " - "Please install it with `pip install openai`." - ) - - openai.proxy = { - "http": self.openai_proxy, - "https": self.openai_proxy, - } # type: ignore[assignment] # noqa: E501 - return openai_args - - # please refer to - # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb - def _get_len_safe_embeddings( - self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None - ) -> List[List[float]]: - embeddings: List[List[float]] = [[] for _ in range(len(texts))] - try: - import tiktoken - except ImportError: - raise ImportError( - "Could not import tiktoken python package. " - "This is needed in order to for OpenAIEmbeddings. " - "Please install it with `pip install tiktoken`." - ) - - tokens = [] - indices = [] - model_name = self.tiktoken_model_name or self.model - try: - encoding = tiktoken.encoding_for_model(model_name) - except KeyError: - logger.warning("Warning: model not found. Using cl100k_base encoding.") - model = "cl100k_base" - encoding = tiktoken.get_encoding(model) - for i, text in enumerate(texts): - if self.model.endswith("001"): - # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 - # replace newlines, which can negatively affect performance. 
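The length-safe scheme implemented here is the heart of this class: texts longer than the model context are tokenized with tiktoken, cut into disjoint context-sized windows, embedded window by window, and recombined as a token-count-weighted average that is then re-normalized. A condensed sketch of that scheme (`embed_window` is a hypothetical stand-in for the actual OpenAI call):

```python
import numpy as np
import tiktoken


def length_safe_embedding(text, embed_window, ctx_length=8191):
    # Tokenize, then cut into disjoint windows of at most ctx_length tokens.
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    windows = [tokens[i : i + ctx_length] for i in range(0, len(tokens), ctx_length)]

    # Embed each window, then average weighted by window length and normalize,
    # as _get_len_safe_embeddings does above.
    vectors = [np.asarray(embed_window(w)) for w in windows]
    weights = [len(w) for w in windows]
    average = np.average(vectors, axis=0, weights=weights)
    return (average / np.linalg.norm(average)).tolist()
```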
- text = text.replace("\n", " ") - token = encoding.encode( - text, - allowed_special=self.allowed_special, - disallowed_special=self.disallowed_special, - ) - for j in range(0, len(token), self.embedding_ctx_length): - tokens.append(token[j : j + self.embedding_ctx_length]) - indices.append(i) - - batched_embeddings: List[List[float]] = [] - _chunk_size = chunk_size or self.chunk_size - - if self.show_progress_bar: - try: - from tqdm.auto import tqdm - - _iter = tqdm(range(0, len(tokens), _chunk_size)) - except ImportError: - _iter = range(0, len(tokens), _chunk_size) - else: - _iter = range(0, len(tokens), _chunk_size) - - for i in _iter: - response = embed_with_retry( - self, - input=tokens[i : i + _chunk_size], - **self._invocation_params, - ) - batched_embeddings.extend(r["embedding"] for r in response["data"]) - - results: List[List[List[float]]] = [[] for _ in range(len(texts))] - num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))] - for i in range(len(indices)): - results[indices[i]].append(batched_embeddings[i]) - num_tokens_in_batch[indices[i]].append(len(tokens[i])) - - for i in range(len(texts)): - _result = results[i] - if len(_result) == 0: - average = embed_with_retry( - self, - input="", - **self._invocation_params, - )[ - "data" - ][0]["embedding"] - else: - average = np.average(_result, axis=0, weights=num_tokens_in_batch[i]) - embeddings[i] = (average / np.linalg.norm(average)).tolist() - - return embeddings - - # please refer to - # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb - async def _aget_len_safe_embeddings( - self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None - ) -> List[List[float]]: - embeddings: List[List[float]] = [[] for _ in range(len(texts))] - try: - import tiktoken - except ImportError: - raise ImportError( - "Could not import tiktoken python package. " - "This is needed in order to for OpenAIEmbeddings. " - "Please install it with `pip install tiktoken`." - ) - - tokens = [] - indices = [] - model_name = self.tiktoken_model_name or self.model - try: - encoding = tiktoken.encoding_for_model(model_name) - except KeyError: - logger.warning("Warning: model not found. Using cl100k_base encoding.") - model = "cl100k_base" - encoding = tiktoken.get_encoding(model) - for i, text in enumerate(texts): - if self.model.endswith("001"): - # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 - # replace newlines, which can negatively affect performance. 
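Each of these embedding requests goes through the tenacity-based policy defined in `_create_retry_decorator` above: exponential backoff clamped between 4 and 10 seconds, retrying only transient errors from the pre-1.0 `openai` package this file targets. A condensed, standalone sketch of that policy (retrying a subset of the error types listed above):

```python
import logging

import openai  # pre-1.0 openai package, as used by this vendored file
from tenacity import (
    before_sleep_log,
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

logger = logging.getLogger(__name__)


@retry(
    reraise=True,
    stop=stop_after_attempt(6),
    # Wait 2^x seconds between retries, clamped to the 4..10 second range.
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type(openai.error.RateLimitError)
    | retry_if_exception_type(openai.error.Timeout),
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def embed_batch(**kwargs):
    return openai.Embedding.create(**kwargs)
```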
- text = text.replace("\n", " ") - token = encoding.encode( - text, - allowed_special=self.allowed_special, - disallowed_special=self.disallowed_special, - ) - for j in range(0, len(token), self.embedding_ctx_length): - tokens.append(token[j : j + self.embedding_ctx_length]) - indices.append(i) - - batched_embeddings: List[List[float]] = [] - _chunk_size = chunk_size or self.chunk_size - for i in range(0, len(tokens), _chunk_size): - response = await async_embed_with_retry( - self, - input=tokens[i : i + _chunk_size], - **self._invocation_params, - ) - batched_embeddings.extend(r["embedding"] for r in response["data"]) - - results: List[List[List[float]]] = [[] for _ in range(len(texts))] - num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))] - for i in range(len(indices)): - results[indices[i]].append(batched_embeddings[i]) - num_tokens_in_batch[indices[i]].append(len(tokens[i])) - - for i in range(len(texts)): - _result = results[i] - if len(_result) == 0: - average = ( - await async_embed_with_retry( - self, - input="", - **self._invocation_params, - ) - )["data"][0]["embedding"] - else: - average = np.average(_result, axis=0, weights=num_tokens_in_batch[i]) - embeddings[i] = (average / np.linalg.norm(average)).tolist() - - return embeddings - - def embed_documents( - self, texts: List[str], chunk_size: Optional[int] = 0 - ) -> List[List[float]]: - """Call out to OpenAI's embedding endpoint for embedding search docs. - - Args: - texts: The list of texts to embed. - chunk_size: The chunk size of embeddings. If None, will use the chunk size - specified by the class. - - Returns: - List of embeddings, one for each text. - """ - # NOTE: to keep things simple, we assume the list may contain texts longer - # than the maximum context and use length-safe embedding function. - return self._get_len_safe_embeddings(texts, engine=self.deployment) - - async def aembed_documents( - self, texts: List[str], chunk_size: Optional[int] = 0 - ) -> List[List[float]]: - """Call out to OpenAI's embedding endpoint async for embedding search docs. - - Args: - texts: The list of texts to embed. - chunk_size: The chunk size of embeddings. If None, will use the chunk size - specified by the class. - - Returns: - List of embeddings, one for each text. - """ - # NOTE: to keep things simple, we assume the list may contain texts longer - # than the maximum context and use length-safe embedding function. - return await self._aget_len_safe_embeddings(texts, engine=self.deployment) - - def embed_query(self, text: str) -> List[float]: - """Call out to OpenAI's embedding endpoint for embedding query text. - - Args: - text: The text to embed. - - Returns: - Embedding for the text. - """ - return self.embed_documents([text])[0] - - async def aembed_query(self, text: str) -> List[float]: - """Call out to OpenAI's embedding endpoint async for embedding query text. - - Args: - text: The text to embed. - - Returns: - Embedding for the text. 
- """ - embeddings = await self.aembed_documents([text]) - return embeddings[0] diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/schema/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/schema/__init__.py deleted file mode 100644 index a72d8893762e..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/schema/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -# This class has been copied from 'langchain/langchain/schema.py -# Last Sync: 2023-09-05 -# Tag: v0.0.220 -from abc import ABC, abstractmethod -from typing import ( - List, -) - -from azure.ai.generative.index._langchain.vendor.schema.document import Document - - -class BaseRetriever(ABC): - """Base interface for retrievers.""" - - @abstractmethod - def get_relevant_documents(self, query: str) -> List[Document]: - """Get documents relevant for a query. - - Args: - query: string to find relevant documents for - - Returns: - List of relevant documents - """ - - @abstractmethod - async def aget_relevant_documents(self, query: str) -> List[Document]: - """Get documents relevant for a query. - - Args: - query: string to find relevant documents for - - Returns: - List of relevant documents - """ diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/schema/document.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/schema/document.py deleted file mode 100644 index ae3b99d56c4c..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/schema/document.py +++ /dev/null @@ -1,83 +0,0 @@ -# This file has been slightly modified to not rely on Pydantic for the Document class. -# Last Sync: 2023-08-24 -# Commit: 3e5cda3405ec1aa369fe90253d88f3e26a03db10 -from __future__ import annotations - -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from typing import Any, Sequence - - -@dataclass -class Document: - """Class for storing a piece of text and associated metadata.""" - - page_content: str - """String text.""" - metadata: dict = field(default_factory=dict) - """Arbitrary metadata about the page content (e.g., source, relationships to other - documents, etc.). - """ - - -class BaseDocumentTransformer(ABC): - """Abstract base class for document transformation systems. - - A document transformation system takes a sequence of Documents and returns a - sequence of transformed Documents. - - Example: - .. code-block:: python - - class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel): - embeddings: Embeddings - similarity_fn: Callable = cosine_similarity - similarity_threshold: float = 0.95 - - class Config: - arbitrary_types_allowed = True - - def transform_documents( - self, documents: Sequence[Document], **kwargs: Any - ) -> Sequence[Document]: - stateful_documents = get_stateful_documents(documents) - embedded_documents = _get_embeddings_from_stateful_docs( - self.embeddings, stateful_documents - ) - included_idxs = _filter_similar_embeddings( - embedded_documents, self.similarity_fn, self.similarity_threshold - ) - return [stateful_documents[i] for i in sorted(included_idxs)] - - async def atransform_documents( - self, documents: Sequence[Document], **kwargs: Any - ) -> Sequence[Document]: - raise NotImplementedError - - """ # noqa: E501 - - @abstractmethod - def transform_documents( - self, documents: Sequence[Document], **kwargs: Any - ) -> Sequence[Document]: - """Transform a list of documents. 
- - Args: - documents: A sequence of Documents to be transformed. - - Returns: - A list of transformed Documents. - """ - - @abstractmethod - async def atransform_documents( - self, documents: Sequence[Document], **kwargs: Any - ) -> Sequence[Document]: - """Asynchronously transform a list of documents. - - Args: - documents: A sequence of Documents to be transformed. - - Returns: - A list of transformed Documents. - """ diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/text_splitter.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/text_splitter.py deleted file mode 100644 index c9fb26ead05e..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/text_splitter.py +++ /dev/null @@ -1,1051 +0,0 @@ -# Not all of this file has been vendor, just the parts we use. -# Last Sync: 2023-08-24 -# Commit: 3e5cda3405ec1aa369fe90253d88f3e26a03db10 -"""**Text Splitters** are classes for splitting text. - - -**Class hierarchy:** - -.. code-block:: - - BaseDocumentTransformer --> TextSplitter --> TextSplitter # Example: CharacterTextSplitter - RecursiveCharacterTextSplitter --> TextSplitter - -Note: **MarkdownHeaderTextSplitter** does not derive from TextSplitter. - - -**Main helpers:** - -.. code-block:: - - Document, Tokenizer, Language, LineType, HeaderType - -""" -from __future__ import annotations - -import copy -import logging -import re -from abc import ABC, abstractmethod -from dataclasses import dataclass -from enum import Enum -from typing import ( - AbstractSet, - Any, - Callable, - Collection, - Dict, - Iterable, - List, - Literal, - Optional, - Sequence, - Tuple, - Type, - TypedDict, - TypeVar, - Union, - cast, -) - -from azure.ai.generative.index._langchain.vendor.schema.document import Document -from azure.ai.generative.index._langchain.vendor.schema.document import BaseDocumentTransformer - -logger = logging.getLogger(__name__) - -TS = TypeVar("TS", bound="TextSplitter") - - -def _split_text_with_regex( - text: str, separator: str, keep_separator: bool -) -> List[str]: - # Now that we have the separator, split the text - if separator: - if keep_separator: - # The parentheses in the pattern keep the delimiters in the result. - _splits = re.split(f"({separator})", text) - splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)] - if len(_splits) % 2 == 0: - splits += _splits[-1:] - splits = [_splits[0]] + splits - else: - splits = re.split(separator, text) - else: - splits = list(text) - return [s for s in splits if s != ""] - - -class TextSplitter(BaseDocumentTransformer, ABC): - """Interface for splitting text into chunks.""" - - def __init__( - self, - chunk_size: int = 4000, - chunk_overlap: int = 200, - length_function: Callable[[str], int] = len, - keep_separator: bool = False, - add_start_index: bool = False, - ) -> None: - """Create a new TextSplitter. - - Args: - chunk_size: Maximum size of chunks to return - chunk_overlap: Overlap in characters between chunks - length_function: Function that measures the length of given chunks - keep_separator: Whether to keep the separator in the chunks - add_start_index: If `True`, includes chunk's start index in metadata - """ - if chunk_overlap > chunk_size: - raise ValueError( - f"Got a larger chunk overlap ({chunk_overlap}) than chunk size " - f"({chunk_size}), should be smaller." 
- ) - self._chunk_size = chunk_size - self._chunk_overlap = chunk_overlap - self._length_function = length_function - self._keep_separator = keep_separator - self._add_start_index = add_start_index - - @abstractmethod - def split_text(self, text: str) -> List[str]: - """Split text into multiple components.""" - - def create_documents( - self, texts: List[str], metadatas: Optional[List[dict]] = None - ) -> List[Document]: - """Create documents from a list of texts.""" - _metadatas = metadatas or [{}] * len(texts) - documents = [] - for i, text in enumerate(texts): - index = -1 - for chunk in self.split_text(text): - metadata = copy.deepcopy(_metadatas[i]) - if self._add_start_index: - index = text.find(chunk, index + 1) - metadata["start_index"] = index - new_doc = Document(page_content=chunk, metadata=metadata) - documents.append(new_doc) - return documents - - def split_documents(self, documents: Iterable[Document]) -> List[Document]: - """Split documents.""" - texts, metadatas = [], [] - for doc in documents: - texts.append(doc.page_content) - metadatas.append(doc.metadata) - return self.create_documents(texts, metadatas=metadatas) - - def _join_docs(self, docs: List[str], separator: str) -> Optional[str]: - text = separator.join(docs) - text = text.strip() - if text == "": - return None - else: - return text - - def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]: - # We now want to combine these smaller pieces into medium size - # chunks to send to the LLM. - separator_len = self._length_function(separator) - - docs = [] - current_doc: List[str] = [] - total = 0 - for d in splits: - _len = self._length_function(d) - if ( - total + _len + (separator_len if len(current_doc) > 0 else 0) - > self._chunk_size - ): - if total > self._chunk_size: - logger.warning( - f"Created a chunk of size {total}, " - f"which is longer than the specified {self._chunk_size}" - ) - if len(current_doc) > 0: - doc = self._join_docs(current_doc, separator) - if doc is not None: - docs.append(doc) - # Keep on popping if: - # - we have a larger chunk than in the chunk overlap - # - or if we still have any chunks and the length is long - while total > self._chunk_overlap or ( - total + _len + (separator_len if len(current_doc) > 0 else 0) - > self._chunk_size - and total > 0 - ): - total -= self._length_function(current_doc[0]) + ( - separator_len if len(current_doc) > 1 else 0 - ) - current_doc = current_doc[1:] - current_doc.append(d) - total += _len + (separator_len if len(current_doc) > 1 else 0) - doc = self._join_docs(current_doc, separator) - if doc is not None: - docs.append(doc) - return docs - - @classmethod - def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter: - """Text splitter that uses HuggingFace tokenizer to count length.""" - try: - from transformers import PreTrainedTokenizerBase - - if not isinstance(tokenizer, PreTrainedTokenizerBase): - raise ValueError( - "Tokenizer received was not an instance of PreTrainedTokenizerBase" - ) - - def _huggingface_tokenizer_length(text: str) -> int: - return len(tokenizer.encode(text)) - - except ImportError: - raise ValueError( - "Could not import transformers python package. " - "Please install it with `pip install transformers`." 
- ) - return cls(length_function=_huggingface_tokenizer_length, **kwargs) - - @classmethod - def from_tiktoken_encoder( - cls: Type[TS], - encoding_name: str = "gpt2", - model_name: Optional[str] = None, - allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), - disallowed_special: Union[Literal["all"], Collection[str]] = "all", - **kwargs: Any, - ) -> TS: - """Text splitter that uses tiktoken encoder to count length.""" - try: - import tiktoken - except ImportError: - raise ImportError( - "Could not import tiktoken python package. " - "This is needed in order to calculate max_tokens_for_prompt. " - "Please install it with `pip install tiktoken`." - ) - - if model_name is not None: - enc = tiktoken.encoding_for_model(model_name) - else: - enc = tiktoken.get_encoding(encoding_name) - - def _tiktoken_encoder(text: str) -> int: - return len( - enc.encode( - text, - allowed_special=allowed_special, - disallowed_special=disallowed_special, - ) - ) - - if issubclass(cls, TokenTextSplitter): - extra_kwargs = { - "encoding_name": encoding_name, - "model_name": model_name, - "allowed_special": allowed_special, - "disallowed_special": disallowed_special, - } - kwargs = {**kwargs, **extra_kwargs} - - return cls(length_function=_tiktoken_encoder, **kwargs) - - def transform_documents( - self, documents: Sequence[Document], **kwargs: Any - ) -> Sequence[Document]: - """Transform sequence of documents by splitting them.""" - return self.split_documents(list(documents)) - - async def atransform_documents( - self, documents: Sequence[Document], **kwargs: Any - ) -> Sequence[Document]: - """Asynchronously transform a sequence of documents by splitting them.""" - raise NotImplementedError - - -class CharacterTextSplitter(TextSplitter): - """Splitting text that looks at characters.""" - - def __init__( - self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any - ) -> None: - """Create a new TextSplitter.""" - super().__init__(**kwargs) - self._separator = separator - self._is_separator_regex = is_separator_regex - - def split_text(self, text: str) -> List[str]: - """Split incoming text and return chunks.""" - # First we naively split the large input into a bunch of smaller ones. - separator = ( - self._separator if self._is_separator_regex else re.escape(self._separator) - ) - splits = _split_text_with_regex(text, separator, self._keep_separator) - _separator = "" if self._keep_separator else self._separator - return self._merge_splits(splits, _separator) - - -class LineType(TypedDict): - """Line type as typed dict.""" - - metadata: Dict[str, str] - content: str - - -class HeaderType(TypedDict): - """Header type as typed dict.""" - - level: int - name: str - data: str - - -class MarkdownHeaderTextSplitter: - """Splitting markdown files based on specified headers.""" - - def __init__( - self, headers_to_split_on: List[Tuple[str, str]], return_each_line: bool = False - ): - """Create a new MarkdownHeaderTextSplitter. 
- - Args: - headers_to_split_on: Headers we want to track - return_each_line: Return each line w/ associated headers - """ - # Output line-by-line or aggregated into chunks w/ common headers - self.return_each_line = return_each_line - # Given the headers we want to split on, - # (e.g., "#, ##, etc") order by length - self.headers_to_split_on = sorted( - headers_to_split_on, key=lambda split: len(split[0]), reverse=True - ) - - def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]: - """Combine lines with common metadata into chunks - Args: - lines: Line of text / associated header metadata - """ - aggregated_chunks: List[LineType] = [] - - for line in lines: - if ( - aggregated_chunks - and aggregated_chunks[-1]["metadata"] == line["metadata"] - ): - # If the last line in the aggregated list - # has the same metadata as the current line, - # append the current content to the last lines's content - aggregated_chunks[-1]["content"] += " \n" + line["content"] - else: - # Otherwise, append the current line to the aggregated list - aggregated_chunks.append(line) - - return [ - Document(page_content=chunk["content"], metadata=chunk["metadata"]) - for chunk in aggregated_chunks - ] - - def split_text(self, text: str) -> List[Document]: - """Split markdown file - Args: - text: Markdown file""" - - # Split the input text by newline character ("\n"). - lines = text.split("\n") - # Final output - lines_with_metadata: List[LineType] = [] - # Content and metadata of the chunk currently being processed - current_content: List[str] = [] - current_metadata: Dict[str, str] = {} - # Keep track of the nested header structure - # header_stack: List[Dict[str, Union[int, str]]] = [] - header_stack: List[HeaderType] = [] - initial_metadata: Dict[str, str] = {} - - for line in lines: - stripped_line = line.strip() - # Check each line against each of the header types (e.g., #, ##) - for sep, name in self.headers_to_split_on: - # Check if line starts with a header that we intend to split on - if stripped_line.startswith(sep) and ( - # Header with no text OR header is followed by space - # Both are valid conditions that sep is being used a header - len(stripped_line) == len(sep) - or stripped_line[len(sep)] == " " - ): - # Ensure we are tracking the header as metadata - if name is not None: - # Get the current header level - current_header_level = sep.count("#") - - # Pop out headers of lower or same level from the stack - while ( - header_stack - and header_stack[-1]["level"] >= current_header_level - ): - # We have encountered a new header - # at the same or higher level - popped_header = header_stack.pop() - # Clear the metadata for the - # popped header in initial_metadata - if popped_header["name"] in initial_metadata: - initial_metadata.pop(popped_header["name"]) - - # Push the current header to the stack - header: HeaderType = { - "level": current_header_level, - "name": name, - "data": stripped_line[len(sep) :].strip(), - } - header_stack.append(header) - # Update initial_metadata with the current header - initial_metadata[name] = header["data"] - - # Add the previous line to the lines_with_metadata - # only if current_content is not empty - if current_content: - lines_with_metadata.append( - { - "content": "\n".join(current_content), - "metadata": current_metadata.copy(), - } - ) - current_content.clear() - - break - else: - if stripped_line: - current_content.append(stripped_line) - elif current_content: - lines_with_metadata.append( - { - "content": "\n".join(current_content), - 
"metadata": current_metadata.copy(), - } - ) - current_content.clear() - - current_metadata = initial_metadata.copy() - - if current_content: - lines_with_metadata.append( - {"content": "\n".join(current_content), "metadata": current_metadata} - ) - - # lines_with_metadata has each line with associated header metadata - # aggregate these into chunks based on common metadata - if not self.return_each_line: - return self.aggregate_lines_to_chunks(lines_with_metadata) - else: - return [ - Document(page_content=chunk["content"], metadata=chunk["metadata"]) - for chunk in lines_with_metadata - ] - - -# should be in newer Python versions (3.10+) -# @dataclass(frozen=True, kw_only=True, slots=True) -@dataclass(frozen=True) -class Tokenizer: - chunk_overlap: int - tokens_per_chunk: int - decode: Callable[[list[int]], str] - encode: Callable[[str], List[int]] - - -def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]: - """Split incoming text and return chunks using tokenizer.""" - splits: List[str] = [] - input_ids = tokenizer.encode(text) - start_idx = 0 - cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) - chunk_ids = input_ids[start_idx:cur_idx] - while start_idx < len(input_ids): - splits.append(tokenizer.decode(chunk_ids)) - start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap - cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) - chunk_ids = input_ids[start_idx:cur_idx] - return splits - - -class TokenTextSplitter(TextSplitter): - """Splitting text to tokens using model tokenizer.""" - - def __init__( - self, - encoding_name: str = "gpt2", - model_name: Optional[str] = None, - allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), - disallowed_special: Union[Literal["all"], Collection[str]] = "all", - **kwargs: Any, - ) -> None: - """Create a new TextSplitter.""" - super().__init__(**kwargs) - try: - import tiktoken - except ImportError: - raise ImportError( - "Could not import tiktoken python package. " - "This is needed in order to for TokenTextSplitter. " - "Please install it with `pip install tiktoken`." - ) - - if model_name is not None: - enc = tiktoken.encoding_for_model(model_name) - else: - enc = tiktoken.get_encoding(encoding_name) - self._tokenizer = enc - self._allowed_special = allowed_special - self._disallowed_special = disallowed_special - - def split_text(self, text: str) -> List[str]: - def _encode(_text: str) -> List[int]: - return self._tokenizer.encode( - _text, - allowed_special=self._allowed_special, - disallowed_special=self._disallowed_special, - ) - - tokenizer = Tokenizer( - chunk_overlap=self._chunk_overlap, - tokens_per_chunk=self._chunk_size, - decode=self._tokenizer.decode, - encode=_encode, - ) - - return split_text_on_tokens(text=text, tokenizer=tokenizer) - - -class SentenceTransformersTokenTextSplitter(TextSplitter): - """Splitting text to tokens using sentence model tokenizer.""" - - def __init__( - self, - chunk_overlap: int = 50, - model_name: str = "sentence-transformers/all-mpnet-base-v2", - tokens_per_chunk: Optional[int] = None, - **kwargs: Any, - ) -> None: - """Create a new TextSplitter.""" - super().__init__(**kwargs, chunk_overlap=chunk_overlap) - - try: - from sentence_transformers import SentenceTransformer - except ImportError: - raise ImportError( - "Could not import sentence_transformer python package. " - "This is needed in order to for SentenceTransformersTokenTextSplitter. " - "Please install it with `pip install sentence-transformers`." 
- ) - - self.model_name = model_name - self._model = SentenceTransformer(self.model_name) - self.tokenizer = self._model.tokenizer - self._initialize_chunk_configuration(tokens_per_chunk=tokens_per_chunk) - - def _initialize_chunk_configuration( - self, *, tokens_per_chunk: Optional[int] - ) -> None: - self.maximum_tokens_per_chunk = cast(int, self._model.max_seq_length) - - if tokens_per_chunk is None: - self.tokens_per_chunk = self.maximum_tokens_per_chunk - else: - self.tokens_per_chunk = tokens_per_chunk - - if self.tokens_per_chunk > self.maximum_tokens_per_chunk: - raise ValueError( - f"The token limit of the models '{self.model_name}'" - f" is: {self.maximum_tokens_per_chunk}." - f" Argument tokens_per_chunk={self.tokens_per_chunk}" - f" > maximum token limit." - ) - - def split_text(self, text: str) -> List[str]: - def encode_strip_start_and_stop_token_ids(text: str) -> List[int]: - return self._encode(text)[1:-1] - - tokenizer = Tokenizer( - chunk_overlap=self._chunk_overlap, - tokens_per_chunk=self.tokens_per_chunk, - decode=self.tokenizer.decode, - encode=encode_strip_start_and_stop_token_ids, - ) - - return split_text_on_tokens(text=text, tokenizer=tokenizer) - - def count_tokens(self, *, text: str) -> int: - return len(self._encode(text)) - - _max_length_equal_32_bit_integer: int = 2**32 - - def _encode(self, text: str) -> List[int]: - token_ids_with_start_and_end_token_ids = self.tokenizer.encode( - text, - max_length=self._max_length_equal_32_bit_integer, - truncation="do_not_truncate", - ) - return token_ids_with_start_and_end_token_ids - - -class Language(str, Enum): - """Enum of the programming languages.""" - - CPP = "cpp" - GO = "go" - JAVA = "java" - JS = "js" - PHP = "php" - PROTO = "proto" - PYTHON = "python" - RST = "rst" - RUBY = "ruby" - RUST = "rust" - SCALA = "scala" - SWIFT = "swift" - MARKDOWN = "markdown" - LATEX = "latex" - HTML = "html" - SOL = "sol" - - -class RecursiveCharacterTextSplitter(TextSplitter): - """Splitting text by recursively look at characters. - - Recursively tries to split by different characters to find one - that works. - """ - - def __init__( - self, - separators: Optional[List[str]] = None, - keep_separator: bool = True, - is_separator_regex: bool = False, - **kwargs: Any, - ) -> None: - """Create a new TextSplitter.""" - super().__init__(keep_separator=keep_separator, **kwargs) - self._separators = separators or ["\n\n", "\n", " ", ""] - self._is_separator_regex = is_separator_regex - - def _split_text(self, text: str, separators: List[str]) -> List[str]: - """Split incoming text and return chunks.""" - final_chunks = [] - # Get appropriate separator to use - separator = separators[-1] - new_separators = [] - for i, _s in enumerate(separators): - _separator = _s if self._is_separator_regex else re.escape(_s) - if _s == "": - separator = _s - break - if re.search(_separator, text): - separator = _s - new_separators = separators[i + 1 :] - break - - _separator = separator if self._is_separator_regex else re.escape(separator) - splits = _split_text_with_regex(text, _separator, self._keep_separator) - - # Now go merging things, recursively splitting longer texts. 
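The recursion in `_split_text` bottoms out at the empty-string separator, so any input eventually fits within `chunk_size`. A short usage sketch against the upstream langchain release this file was vendored from (v0.0.x):

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=80, chunk_overlap=10)
chunks = splitter.split_text(
    "First paragraph, kept whole when it fits.\n\n"
    "Second paragraph, long enough that it is split on newlines or spaces "
    "before the splitter ever falls back to raw characters."
)
for chunk in chunks:
    print(len(chunk), repr(chunk))
```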
- _good_splits = [] - _separator = "" if self._keep_separator else separator - for s in splits: - if self._length_function(s) < self._chunk_size: - _good_splits.append(s) - else: - if _good_splits: - merged_text = self._merge_splits(_good_splits, _separator) - final_chunks.extend(merged_text) - _good_splits = [] - if not new_separators: - final_chunks.append(s) - else: - other_info = self._split_text(s, new_separators) - final_chunks.extend(other_info) - if _good_splits: - merged_text = self._merge_splits(_good_splits, _separator) - final_chunks.extend(merged_text) - return final_chunks - - def split_text(self, text: str) -> List[str]: - return self._split_text(text, self._separators) - - @classmethod - def from_language( - cls, language: Language, **kwargs: Any - ) -> RecursiveCharacterTextSplitter: - separators = cls.get_separators_for_language(language) - return cls(separators=separators, is_separator_regex=True, **kwargs) - - @staticmethod - def get_separators_for_language(language: Language) -> List[str]: - if language == Language.CPP: - return [ - # Split along class definitions - "\nclass ", - # Split along function definitions - "\nvoid ", - "\nint ", - "\nfloat ", - "\ndouble ", - # Split along control flow statements - "\nif ", - "\nfor ", - "\nwhile ", - "\nswitch ", - "\ncase ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.GO: - return [ - # Split along function definitions - "\nfunc ", - "\nvar ", - "\nconst ", - "\ntype ", - # Split along control flow statements - "\nif ", - "\nfor ", - "\nswitch ", - "\ncase ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.JAVA: - return [ - # Split along class definitions - "\nclass ", - # Split along method definitions - "\npublic ", - "\nprotected ", - "\nprivate ", - "\nstatic ", - # Split along control flow statements - "\nif ", - "\nfor ", - "\nwhile ", - "\nswitch ", - "\ncase ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.JS: - return [ - # Split along function definitions - "\nfunction ", - "\nconst ", - "\nlet ", - "\nvar ", - "\nclass ", - # Split along control flow statements - "\nif ", - "\nfor ", - "\nwhile ", - "\nswitch ", - "\ncase ", - "\ndefault ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.PHP: - return [ - # Split along function definitions - "\nfunction ", - # Split along class definitions - "\nclass ", - # Split along control flow statements - "\nif ", - "\nforeach ", - "\nwhile ", - "\ndo ", - "\nswitch ", - "\ncase ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.PROTO: - return [ - # Split along message definitions - "\nmessage ", - # Split along service definitions - "\nservice ", - # Split along enum definitions - "\nenum ", - # Split along option definitions - "\noption ", - # Split along import statements - "\nimport ", - # Split along syntax declarations - "\nsyntax ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.PYTHON: - return [ - # First, try to split along class definitions - "\nclass ", - "\ndef ", - "\n\tdef ", - # Now split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.RST: - return [ - # Split along section titles - "\n=+\n", - "\n-+\n", - "\n\\*+\n", - # Split along directive markers - "\n\n.. 
*\n\n", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.RUBY: - return [ - # Split along method definitions - "\ndef ", - "\nclass ", - # Split along control flow statements - "\nif ", - "\nunless ", - "\nwhile ", - "\nfor ", - "\ndo ", - "\nbegin ", - "\nrescue ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.RUST: - return [ - # Split along function definitions - "\nfn ", - "\nconst ", - "\nlet ", - # Split along control flow statements - "\nif ", - "\nwhile ", - "\nfor ", - "\nloop ", - "\nmatch ", - "\nconst ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.SCALA: - return [ - # Split along class definitions - "\nclass ", - "\nobject ", - # Split along method definitions - "\ndef ", - "\nval ", - "\nvar ", - # Split along control flow statements - "\nif ", - "\nfor ", - "\nwhile ", - "\nmatch ", - "\ncase ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.SWIFT: - return [ - # Split along function definitions - "\nfunc ", - # Split along class definitions - "\nclass ", - "\nstruct ", - "\nenum ", - # Split along control flow statements - "\nif ", - "\nfor ", - "\nwhile ", - "\ndo ", - "\nswitch ", - "\ncase ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.MARKDOWN: - return [ - # First, try to split along Markdown headings (starting with level 2) - "\n#{1,6} ", - # Note the alternative syntax for headings (below) is not handled here - # Heading level 2 - # --------------- - # End of code block - "```\n", - # Horizontal lines - "\n\\*\\*\\*+\n", - "\n---+\n", - "\n___+\n", - # Note that this splitter doesn't handle horizontal lines defined - # by *three or more* of ***, ---, or ___, but this is not handled - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.LATEX: - return [ - # First, try to split along Latex sections - "\n\\\\chapter{", - "\n\\\\section{", - "\n\\\\subsection{", - "\n\\\\subsubsection{", - # Now split by environments - "\n\\\\begin{enumerate}", - "\n\\\\begin{itemize}", - "\n\\\\begin{description}", - "\n\\\\begin{list}", - "\n\\\\begin{quote}", - "\n\\\\begin{quotation}", - "\n\\\\begin{verse}", - "\n\\\\begin{verbatim}", - # Now split by math environments - "\n\\\begin{align}", - "$$", - "$", - # Now split by the normal type of lines - " ", - "", - ] - elif language == Language.HTML: - return [ - # First, try to split along HTML tags - " None: - """Initialize the NLTK splitter.""" - super().__init__(**kwargs) - try: - from nltk.tokenize import sent_tokenize - - self._tokenizer = sent_tokenize - except ImportError: - raise ImportError( - "NLTK is not installed, please install it with `pip install nltk`." - ) - self._separator = separator - - def split_text(self, text: str) -> List[str]: - """Split incoming text and return chunks.""" - # First we naively split the large input into a bunch of smaller ones. 
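The sentence-level pre-split used by `NLTKTextSplitter` here is plain NLTK; a sketch of that step on its own (assumes `nltk` is installed; the `punkt` sentence model is fetched on first run):

```python
import nltk

nltk.download("punkt", quiet=True)
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(
    "FAISS stores the vectors. The docstore keeps the original text. "
    "The splitter then merges sentences back up to chunk_size."
)
print(sentences)
```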
- splits = self._tokenizer(text) - return self._merge_splits(splits, self._separator) - - -class MarkdownTextSplitter(RecursiveCharacterTextSplitter): - """Attempts to split the text along Markdown-formatted headings.""" - - def __init__(self, **kwargs: Any) -> None: - """Initialize a MarkdownTextSplitter.""" - separators = self.get_separators_for_language(Language.MARKDOWN) - super().__init__(separators=separators, **kwargs) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/utils/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/utils/__init__.py deleted file mode 100644 index 624f5ee88ecf..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/utils/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- - - -__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/utils/math.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/utils/math.py deleted file mode 100644 index 41e1b6a0bd00..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/utils/math.py +++ /dev/null @@ -1,61 +0,0 @@ -# This file has been copied as is. -# Last Sync: 2023-08-24 -# Commit: 3e5cda3405ec1aa369fe90253d88f3e26a03db10 -"""Math utils.""" -from typing import List, Optional, Tuple, Union - -import numpy as np - -Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray] - - -def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: - """Row-wise cosine similarity between two equal-width matrices.""" - if len(X) == 0 or len(Y) == 0: - return np.array([]) - X = np.array(X) - Y = np.array(Y) - if X.shape[1] != Y.shape[1]: - raise ValueError( - f"Number of columns in X and Y must be the same. X has shape {X.shape} " - f"and Y has shape {Y.shape}." - ) - - X_norm = np.linalg.norm(X, axis=1) - Y_norm = np.linalg.norm(Y, axis=1) - # Ignore divide by zero errors run time warnings as those are handled below. - with np.errstate(divide="ignore", invalid="ignore"): - similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm) - similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0 - return similarity - - -def cosine_similarity_top_k( - X: Matrix, - Y: Matrix, - top_k: Optional[int] = 5, - score_threshold: Optional[float] = None, -) -> Tuple[List[Tuple[int, int]], List[float]]: - """Row-wise cosine similarity with optional top-k and score threshold filtering. - - Args: - X: Matrix. - Y: Matrix, same width as X. - top_k: Max number of results to return. - score_threshold: Minimum cosine similarity of results. - - Returns: - Tuple of two lists. First contains two-tuples of indices (X_idx, Y_idx), - second contains corresponding cosine similarities. 
- """ - if len(X) == 0 or len(Y) == 0: - return [], [] - score_array = cosine_similarity(X, Y) - score_threshold = score_threshold or -1.0 - score_array[score_array < score_threshold] = 0 - top_k = min(top_k or len(score_array), np.count_nonzero(score_array)) - top_k_idxs = np.argpartition(score_array, -top_k, axis=None)[-top_k:] - top_k_idxs = top_k_idxs[np.argsort(score_array.ravel()[top_k_idxs])][::-1] - ret_idxs = np.unravel_index(top_k_idxs, score_array.shape) - scores = score_array.ravel()[top_k_idxs].tolist() - return list(zip(*ret_idxs)), scores # type: ignore diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/__init__.py deleted file mode 100644 index 624f5ee88ecf..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- - - -__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/base.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/base.py deleted file mode 100644 index 32ac4a7f6b8c..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/base.py +++ /dev/null @@ -1,431 +0,0 @@ -# This file has been slightly modified to not rely on Pydantic. -# Last Sync: 2023-09-05 -# Tag: v0.0.220 -"""Interface for vector stores.""" -from __future__ import annotations - -import asyncio -import warnings -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from functools import partial -from typing import ( - Any, - ClassVar, - Collection, - Dict, - Iterable, - List, - Optional, - Tuple, - Type, - TypeVar, -) - -from azure.ai.generative.index._langchain.vendor.schema.document import Document -from azure.ai.generative.index._langchain.vendor.embeddings.base import Embeddings -from azure.ai.generative.index._langchain.vendor.schema import BaseRetriever - -VST = TypeVar("VST", bound="VectorStore") - - -class VectorStore(ABC): - """Interface for vector stores.""" - - @abstractmethod - def add_texts( - self, - texts: Iterable[str], - metadatas: Optional[List[dict]] = None, - **kwargs: Any, - ) -> List[str]: - """Run more texts through the embeddings and add to the vectorstore. - - Args: - texts: Iterable of strings to add to the vectorstore. - metadatas: Optional list of metadatas associated with the texts. - kwargs: vectorstore specific parameters - - Returns: - List of ids from adding the texts into the vectorstore. - """ - - def delete(self, ids: List[str]) -> Optional[bool]: - """Delete by vector ID. - - Args: - ids: List of ids to delete. - - Returns: - Optional[bool]: True if deletion is successful, - False otherwise, None if not implemented. - """ - - raise NotImplementedError( - "delete_by_id method must be implemented by subclass." 
- ) - - async def aadd_texts( - self, - texts: Iterable[str], - metadatas: Optional[List[dict]] = None, - **kwargs: Any, - ) -> List[str]: - """Run more texts through the embeddings and add to the vectorstore.""" - raise NotImplementedError - - def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]: - """Run more documents through the embeddings and add to the vectorstore. - - Args: - documents (List[Document]: Documents to add to the vectorstore. - - Returns: - List[str]: List of IDs of the added texts. - """ - # TODO: Handle the case where the user doesn't provide ids on the Collection - texts = [doc.page_content for doc in documents] - metadatas = [doc.metadata for doc in documents] - return self.add_texts(texts, metadatas, **kwargs) - - async def aadd_documents( - self, documents: List[Document], **kwargs: Any - ) -> List[str]: - """Run more documents through the embeddings and add to the vectorstore. - - Args: - documents (List[Document]: Documents to add to the vectorstore. - - Returns: - List[str]: List of IDs of the added texts. - """ - texts = [doc.page_content for doc in documents] - metadatas = [doc.metadata for doc in documents] - return await self.aadd_texts(texts, metadatas, **kwargs) - - def search(self, query: str, search_type: str, **kwargs: Any) -> List[Document]: - """Return docs most similar to query using specified search type.""" - if search_type == "similarity": - return self.similarity_search(query, **kwargs) - elif search_type == "mmr": - return self.max_marginal_relevance_search(query, **kwargs) - else: - raise ValueError( - f"search_type of {search_type} not allowed. Expected " - "search_type to be 'similarity' or 'mmr'." - ) - - async def asearch( - self, query: str, search_type: str, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to query using specified search type.""" - if search_type == "similarity": - return await self.asimilarity_search(query, **kwargs) - elif search_type == "mmr": - return await self.amax_marginal_relevance_search(query, **kwargs) - else: - raise ValueError( - f"search_type of {search_type} not allowed. Expected " - "search_type to be 'similarity' or 'mmr'." - ) - - @abstractmethod - def similarity_search( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to query.""" - - def similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and relevance scores in the range [0, 1]. - - 0 is dissimilar, 1 is most similar. - - Args: - query: input text - k: Number of Documents to return. Defaults to 4. - **kwargs: kwargs to be passed to similarity search. 
Should include: - score_threshold: Optional, a floating point value between 0 to 1 to - filter the resulting set of retrieved docs - - Returns: - List of Tuples of (doc, similarity_score) - """ - docs_and_similarities = self._similarity_search_with_relevance_scores( - query, k=k, **kwargs - ) - if any( - similarity < 0.0 or similarity > 1.0 - for _, similarity in docs_and_similarities - ): - warnings.warn( - "Relevance scores must be between" - f" 0 and 1, got {docs_and_similarities}" - ) - - score_threshold = kwargs.get("score_threshold") - if score_threshold is not None: - docs_and_similarities = [ - (doc, similarity) - for doc, similarity in docs_and_similarities - if similarity >= score_threshold - ] - if len(docs_and_similarities) == 0: - warnings.warn( - "No relevant docs were retrieved using the relevance score" - f" threshold {score_threshold}" - ) - return docs_and_similarities - - def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and relevance scores, normalized on a scale from 0 to 1. - - 0 is dissimilar, 1 is most similar. - """ - raise NotImplementedError - - async def asimilarity_search_with_relevance_scores( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Tuple[Document, float]]: - """Return docs most similar to query.""" - - # This is a temporary workaround to make the similarity search - # asynchronous. The proper solution is to make the similarity search - # asynchronous in the vector store implementations. - func = partial(self.similarity_search_with_relevance_scores, query, k, **kwargs) - return await asyncio.get_event_loop().run_in_executor(None, func) - - async def asimilarity_search( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to query.""" - - # This is a temporary workaround to make the similarity search - # asynchronous. The proper solution is to make the similarity search - # asynchronous in the vector store implementations. - func = partial(self.similarity_search, query, k, **kwargs) - return await asyncio.get_event_loop().run_in_executor(None, func) - - def similarity_search_by_vector( - self, embedding: List[float], k: int = 4, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to embedding vector. - - Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - - Returns: - List of Documents most similar to the query vector. - """ - raise NotImplementedError - - async def asimilarity_search_by_vector( - self, embedding: List[float], k: int = 4, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to embedding vector.""" - - # This is a temporary workaround to make the similarity search - # asynchronous. The proper solution is to make the similarity search - # asynchronous in the vector store implementations. - func = partial(self.similarity_search_by_vector, embedding, k, **kwargs) - return await asyncio.get_event_loop().run_in_executor(None, func) - - def max_marginal_relevance_search( - self, - query: str, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - **kwargs: Any, - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. - - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. - - Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. 
- fetch_k: Number of Documents to fetch to pass to MMR algorithm. - lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. - Returns: - List of Documents selected by maximal marginal relevance. - """ - raise NotImplementedError - - async def amax_marginal_relevance_search( - self, - query: str, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - **kwargs: Any, - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance.""" - - # This is a temporary workaround to make the similarity search - # asynchronous. The proper solution is to make the similarity search - # asynchronous in the vector store implementations. - func = partial( - self.max_marginal_relevance_search, query, k, fetch_k, lambda_mult, **kwargs - ) - return await asyncio.get_event_loop().run_in_executor(None, func) - - def max_marginal_relevance_search_by_vector( - self, - embedding: List[float], - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - **kwargs: Any, - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. - - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. - - Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. - lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. - Returns: - List of Documents selected by maximal marginal relevance. - """ - raise NotImplementedError - - async def amax_marginal_relevance_search_by_vector( - self, - embedding: List[float], - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - **kwargs: Any, - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance.""" - raise NotImplementedError - - @classmethod - def from_documents( - cls: Type[VST], - documents: List[Document], - embedding: Embeddings, - **kwargs: Any, - ) -> VST: - """Return VectorStore initialized from documents and embeddings.""" - texts = [d.page_content for d in documents] - metadatas = [d.metadata for d in documents] - return cls.from_texts(texts, embedding, metadatas=metadatas, **kwargs) - - @classmethod - async def afrom_documents( - cls: Type[VST], - documents: List[Document], - embedding: Embeddings, - **kwargs: Any, - ) -> VST: - """Return VectorStore initialized from documents and embeddings.""" - texts = [d.page_content for d in documents] - metadatas = [d.metadata for d in documents] - return await cls.afrom_texts(texts, embedding, metadatas=metadatas, **kwargs) - - @classmethod - @abstractmethod - def from_texts( - cls: Type[VST], - texts: List[str], - embedding: Embeddings, - metadatas: Optional[List[dict]] = None, - **kwargs: Any, - ) -> VST: - """Return VectorStore initialized from texts and embeddings.""" - - @classmethod - async def afrom_texts( - cls: Type[VST], - texts: List[str], - embedding: Embeddings, - metadatas: Optional[List[dict]] = None, - **kwargs: Any, - ) -> VST: - """Return VectorStore initialized from texts and embeddings.""" - raise NotImplementedError - - def as_retriever(self, **kwargs: Any) -> VectorStoreRetriever: - return VectorStoreRetriever(vectorstore=self, **kwargs) - - -@dataclass -class VectorStoreRetriever(BaseRetriever): - 
vectorstore: VectorStore - search_type: str = "similarity" - search_kwargs: dict = field(default_factory=dict) - allowed_search_types: ClassVar[Collection[str]] = ( - "similarity", - "similarity_score_threshold", - "mmr", - ) - - def get_relevant_documents(self, query: str) -> List[Document]: - if self.search_type == "similarity": - docs = self.vectorstore.similarity_search(query, **self.search_kwargs) - elif self.search_type == "similarity_score_threshold": - docs_and_similarities = ( - self.vectorstore.similarity_search_with_relevance_scores( - query, **self.search_kwargs - ) - ) - docs = [doc for doc, _ in docs_and_similarities] - elif self.search_type == "mmr": - docs = self.vectorstore.max_marginal_relevance_search( - query, **self.search_kwargs - ) - else: - raise ValueError(f"search_type of {self.search_type} not allowed.") - return docs - - async def aget_relevant_documents(self, query: str) -> List[Document]: - if self.search_type == "similarity": - docs = await self.vectorstore.asimilarity_search( - query, **self.search_kwargs - ) - elif self.search_type == "similarity_score_threshold": - docs_and_similarities = ( - await self.vectorstore.asimilarity_search_with_relevance_scores( - query, **self.search_kwargs - ) - ) - docs = [doc for doc, _ in docs_and_similarities] - elif self.search_type == "mmr": - docs = await self.vectorstore.amax_marginal_relevance_search( - query, **self.search_kwargs - ) - else: - raise ValueError(f"search_type of {self.search_type} not allowed.") - return docs - - def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]: - """Add documents to vectorstore.""" - return self.vectorstore.add_documents(documents, **kwargs) - - async def aadd_documents( - self, documents: List[Document], **kwargs: Any - ) -> List[str]: - """Add documents to vectorstore.""" - return await self.vectorstore.aadd_documents(documents, **kwargs) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/faiss.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/faiss.py deleted file mode 100644 index 7d811ab56f2f..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/faiss.py +++ /dev/null @@ -1,631 +0,0 @@ -"""Wrapper around FAISS vector database.""" -from __future__ import annotations - -import math -import os -import pickle -import uuid -from pathlib import Path -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple - -import numpy as np - -from azure.ai.generative.index._langchain.vendor.docstore.base import AddableMixin, Docstore -from azure.ai.generative.index._langchain.vendor.schema.document import Document -from azure.ai.generative.index._langchain.vendor.embeddings.base import Embeddings -from azure.ai.generative.index._langchain.vendor.vectorstores.base import VectorStore -from azure.ai.generative.index._langchain.vendor.vectorstores.utils import maximal_marginal_relevance - - -def dependable_faiss_import(no_avx2: Optional[bool] = None) -> Any: - """ - Import faiss if available, otherwise raise error. - If FAISS_NO_AVX2 environment variable is set, it will be considered - to load FAISS with no AVX2 optimization. - - Args: - no_avx2: Load FAISS strictly with no AVX2 optimization - so that the vectorstore is portable and compatible with other devices. 
- """ - if no_avx2 is None and "FAISS_NO_AVX2" in os.environ: - no_avx2 = bool(os.getenv("FAISS_NO_AVX2")) - - try: - if no_avx2: - from faiss import swigfaiss as faiss - else: - import faiss - except ImportError: - raise ImportError( - "Could not import faiss python package. " - "Please install it with `pip install faiss` " - "or `pip install faiss-cpu` (depending on Python version)." - ) - return faiss - - -def _default_relevance_score_fn(score: float) -> float: - """Return a similarity score on a scale [0, 1].""" - # The 'correct' relevance function - # may differ depending on a few things, including: - # - the distance / similarity metric used by the VectorStore - # - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) - # - embedding dimensionality - # - etc. - # This function converts the euclidean norm of normalized embeddings - # (0 is most similar, sqrt(2) most dissimilar) - # to a similarity function (0 to 1) - return 1.0 - score / math.sqrt(2) - - -class FAISS(VectorStore): - """Wrapper around FAISS vector database. - - To use, you should have the ``faiss`` python package installed. - - Example: - .. code-block:: python - - from langchain import FAISS - faiss = FAISS(embedding_function, index, docstore, index_to_docstore_id) - - """ - - def __init__( - self, - embedding_function: Callable, - index: Any, - docstore: Docstore, - index_to_docstore_id: Dict[int, str], - relevance_score_fn: Optional[ - Callable[[float], float] - ] = _default_relevance_score_fn, - normalize_L2: bool = False, - ): - """Initialize with necessary components.""" - self.embedding_function = embedding_function - self.index = index - self.docstore = docstore - self.index_to_docstore_id = index_to_docstore_id - self.relevance_score_fn = relevance_score_fn - self._normalize_L2 = normalize_L2 - - def __add( - self, - texts: Iterable[str], - embeddings: Iterable[List[float]], - metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, - **kwargs: Any, - ) -> List[str]: - if not isinstance(self.docstore, AddableMixin): - raise ValueError( - "If trying to add texts, the underlying docstore should support " - f"adding items, which {self.docstore} does not" - ) - documents = [] - for i, text in enumerate(texts): - metadata = metadatas[i] if metadatas else {} - documents.append(Document(page_content=text, metadata=metadata)) - if ids is None: - ids = [str(uuid.uuid4()) for _ in texts] - # Add to the index, the index_to_id mapping, and the docstore. - starting_len = len(self.index_to_docstore_id) - faiss = dependable_faiss_import() - vector: np.ndarray = np.array(embeddings, dtype=np.float32) - if self._normalize_L2: - faiss.normalize_L2(vector) - self.index.add(vector) - # Get list of index, id, and docs. - full_info = [(starting_len + i, ids[i], doc) for i, doc in enumerate(documents)] - # Add information to docstore and index. - self.docstore.add({_id: doc for _, _id, doc in full_info}) - index_to_id = {index: _id for index, _id, _ in full_info} - self.index_to_docstore_id.update(index_to_id) - return [_id for _, _id, _ in full_info] - - def add_texts( - self, - texts: Iterable[str], - metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, - **kwargs: Any, - ) -> List[str]: - """Run more texts through the embeddings and add to the vectorstore. - - Args: - texts: Iterable of strings to add to the vectorstore. - metadatas: Optional list of metadatas associated with the texts. - ids: Optional list of unique IDs. 
- - Returns: - List of ids from adding the texts into the vectorstore. - """ - if not isinstance(self.docstore, AddableMixin): - raise ValueError( - "If trying to add texts, the underlying docstore should support " - f"adding items, which {self.docstore} does not" - ) - # Embed and create the documents. - embeddings = [self.embedding_function(text) for text in texts] - return self.__add(texts, embeddings, metadatas=metadatas, ids=ids, **kwargs) - - def add_embeddings( - self, - text_embeddings: Iterable[Tuple[str, List[float]]], - metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, - **kwargs: Any, - ) -> List[str]: - """Run more texts through the embeddings and add to the vectorstore. - - Args: - text_embeddings: Iterable pairs of string and embedding to - add to the vectorstore. - metadatas: Optional list of metadatas associated with the texts. - ids: Optional list of unique IDs. - - Returns: - List of ids from adding the texts into the vectorstore. - """ - if not isinstance(self.docstore, AddableMixin): - raise ValueError( - "If trying to add texts, the underlying docstore should support " - f"adding items, which {self.docstore} does not" - ) - # Embed and create the documents. - texts, embeddings = zip(*text_embeddings) - - return self.__add(texts, embeddings, metadatas=metadatas, ids=ids, **kwargs) - - def similarity_search_with_score_by_vector( - self, - embedding: List[float], - k: int = 4, - filter: Optional[Dict[str, Any]] = None, - fetch_k: int = 20, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs most similar to query. - - Args: - embedding: Embedding vector to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None. - fetch_k: (Optional[int]) Number of Documents to fetch before filtering. - Defaults to 20. - **kwargs: kwargs to be passed to similarity search. Can include: - score_threshold: Optional, a floating point value between 0 to 1 to - filter the resulting set of retrieved docs - - Returns: - List of documents most similar to the query text and L2 distance - in float for each. Lower score represents more similarity. - """ - faiss = dependable_faiss_import() - vector: np.ndarray = np.array([embedding], dtype=np.float32) - if self._normalize_L2: - faiss.normalize_L2(vector) - scores, indices = self.index.search(vector, k if filter is None else fetch_k) - docs = [] - for j, i in enumerate(indices[0]): - if i == -1: - # This happens when not enough docs are returned. - continue - _id = self.index_to_docstore_id[i] - doc = self.docstore.search(_id) - if not isinstance(doc, Document): - raise ValueError(f"Could not find document for id {_id}, got {doc}") - if filter is not None: - filter = { - key: [value] if not isinstance(value, list) else value - for key, value in filter.items() - } - if all(doc.metadata.get(key) in value for key, value in filter.items()): - docs.append((doc, scores[0][j])) - else: - docs.append((doc, scores[0][j])) - - score_threshold = kwargs.get("score_threshold") - if score_threshold is not None: - docs = [ - (doc, similarity) - for doc, similarity in docs - if similarity >= score_threshold - ] - return docs[:k] - - def similarity_search_with_score( - self, - query: str, - k: int = 4, - filter: Optional[Dict[str, Any]] = None, - fetch_k: int = 20, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs most similar to query. - - Args: - query: Text to look up documents similar to. 
- k: Number of Documents to return. Defaults to 4. - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. - fetch_k: (Optional[int]) Number of Documents to fetch before filtering. - Defaults to 20. - - Returns: - List of documents most similar to the query text with - L2 distance in float. Lower score represents more similarity. - """ - embedding = self.embedding_function(query) - docs = self.similarity_search_with_score_by_vector( - embedding, - k, - filter=filter, - fetch_k=fetch_k, - **kwargs, - ) - return docs - - def similarity_search_by_vector( - self, - embedding: List[float], - k: int = 4, - filter: Optional[Dict[str, Any]] = None, - fetch_k: int = 20, - **kwargs: Any, - ) -> List[Document]: - """Return docs most similar to embedding vector. - - Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. - fetch_k: (Optional[int]) Number of Documents to fetch before filtering. - Defaults to 20. - - Returns: - List of Documents most similar to the embedding. - """ - docs_and_scores = self.similarity_search_with_score_by_vector( - embedding, - k, - filter=filter, - fetch_k=fetch_k, - **kwargs, - ) - return [doc for doc, _ in docs_and_scores] - - def similarity_search( - self, - query: str, - k: int = 4, - filter: Optional[Dict[str, Any]] = None, - fetch_k: int = 20, - **kwargs: Any, - ) -> List[Document]: - """Return docs most similar to query. - - Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - filter: (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. - fetch_k: (Optional[int]) Number of Documents to fetch before filtering. - Defaults to 20. - - Returns: - List of Documents most similar to the query. - """ - docs_and_scores = self.similarity_search_with_score( - query, k, filter=filter, fetch_k=fetch_k, **kwargs - ) - return [doc for doc, _ in docs_and_scores] - - def max_marginal_relevance_search_with_score_by_vector( - self, - embedding: List[float], - *, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - filter: Optional[Dict[str, Any]] = None, - ) -> List[Tuple[Document, float]]: - """Return docs and their similarity scores selected using the maximal marginal - relevance. - - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. - - Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch before filtering to - pass to MMR algorithm. - lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. - Returns: - List of Documents and similarity scores selected by maximal marginal - relevance and score for each. - """ - scores, indices = self.index.search( - np.array([embedding], dtype=np.float32), - fetch_k if filter is None else fetch_k * 2, - ) - if filter is not None: - filtered_indices = [] - for i in indices[0]: - if i == -1: - # This happens when not enough docs are returned. 
- continue - _id = self.index_to_docstore_id[i] - doc = self.docstore.search(_id) - if not isinstance(doc, Document): - raise ValueError(f"Could not find document for id {_id}, got {doc}") - if all(doc.metadata.get(key) == value for key, value in filter.items()): - filtered_indices.append(i) - indices = np.array([filtered_indices]) - # -1 happens when not enough docs are returned. - embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1] - mmr_selected = maximal_marginal_relevance( - np.array([embedding], dtype=np.float32), - embeddings, - k=k, - lambda_mult=lambda_mult, - ) - selected_indices = [indices[0][i] for i in mmr_selected] - selected_scores = [scores[0][i] for i in mmr_selected] - docs_and_scores = [] - for i, score in zip(selected_indices, selected_scores): - if i == -1: - # This happens when not enough docs are returned. - continue - _id = self.index_to_docstore_id[i] - doc = self.docstore.search(_id) - if not isinstance(doc, Document): - raise ValueError(f"Could not find document for id {_id}, got {doc}") - docs_and_scores.append((doc, score)) - return docs_and_scores - - def max_marginal_relevance_search_by_vector( - self, - embedding: List[float], - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - filter: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. - - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. - - Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch before filtering to - pass to MMR algorithm. - lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. - Returns: - List of Documents selected by maximal marginal relevance. - """ - docs_and_scores = self.max_marginal_relevance_search_with_score_by_vector( - embedding, k=k, fetch_k=fetch_k, lambda_mult=lambda_mult, filter=filter - ) - return [doc for doc, _ in docs_and_scores] - - def max_marginal_relevance_search( - self, - query: str, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - filter: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. - - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. - - Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch before filtering (if needed) to - pass to MMR algorithm. - lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. - Returns: - List of Documents selected by maximal marginal relevance. - """ - embedding = self.embedding_function(query) - docs = self.max_marginal_relevance_search_by_vector( - embedding, - k=k, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - filter=filter, - **kwargs, - ) - return docs - - def merge_from(self, target: FAISS) -> None: - """Merge another FAISS object with the current one. - - Add the target FAISS to the current one. - - Args: - target: FAISS object you wish to merge into the current one - - Returns: - None. 
- """ - if not isinstance(self.docstore, AddableMixin): - raise ValueError("Cannot merge with this type of docstore") - # Numerical index for target docs are incremental on existing ones - starting_len = len(self.index_to_docstore_id) - - # Merge two IndexFlatL2 - self.index.merge_from(target.index) - - # Get id and docs from target FAISS object - full_info = [] - for i, target_id in target.index_to_docstore_id.items(): - doc = target.docstore.search(target_id) - if not isinstance(doc, Document): - raise ValueError("Document should be returned") - full_info.append((starting_len + i, target_id, doc)) - - # Add information to docstore and index_to_docstore_id. - self.docstore.add({_id: doc for _, _id, doc in full_info}) - index_to_id = {index: _id for index, _id, _ in full_info} - self.index_to_docstore_id.update(index_to_id) - - @classmethod - def from_texts( - cls, - texts: List[str], - embedding: Embeddings, - metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, - **kwargs: Any, - ) -> FAISS: - """Construct FAISS wrapper from raw documents. - - This is a user friendly interface that: - 1. Embeds documents. - 2. Creates an in memory docstore - 3. Initializes the FAISS database - - This is intended to be a quick way to get started. - - Example: - .. code-block:: python - - from langchain import FAISS - from langchain.embeddings import OpenAIEmbeddings - embeddings = OpenAIEmbeddings() - faiss = FAISS.from_texts(texts, embeddings) - """ - embeddings = embedding.embed_documents(texts) - return cls.__from( - texts, - embeddings, - embedding, - metadatas=metadatas, - ids=ids, - **kwargs, - ) - - @classmethod - def from_embeddings( - cls, - text_embeddings: List[Tuple[str, List[float]]], - embedding: Embeddings, - metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, - **kwargs: Any, - ) -> FAISS: - """Construct FAISS wrapper from raw documents. - - This is a user friendly interface that: - 1. Embeds documents. - 2. Creates an in memory docstore - 3. Initializes the FAISS database - - This is intended to be a quick way to get started. - - Example: - .. code-block:: python - - from langchain import FAISS - from langchain.embeddings import OpenAIEmbeddings - embeddings = OpenAIEmbeddings() - text_embeddings = embeddings.embed_documents(texts) - text_embedding_pairs = list(zip(texts, text_embeddings)) - faiss = FAISS.from_embeddings(text_embedding_pairs, embeddings) - """ - texts = [t[0] for t in text_embeddings] - embeddings = [t[1] for t in text_embeddings] - return cls.__from( - texts, - embeddings, - embedding, - metadatas=metadatas, - ids=ids, - **kwargs, - ) - - def save_local(self, folder_path: str, index_name: str = "index") -> None: - """Save FAISS index, docstore, and index_to_docstore_id to disk. - - Args: - folder_path: folder path to save index, docstore, - and index_to_docstore_id to. 
- index_name: for saving with a specific index file name - """ - path = Path(folder_path) - path.mkdir(exist_ok=True, parents=True) - - # save index separately since it is not picklable - faiss = dependable_faiss_import() - faiss.write_index( - self.index, str(path / "{index_name}.faiss".format(index_name=index_name)) - ) - - # save docstore and index_to_docstore_id - with open(path / "{index_name}.pkl".format(index_name=index_name), "wb") as f: - pickle.dump((self.docstore, self.index_to_docstore_id), f) - - @classmethod - def load_local( - cls, folder_path: str, embeddings: Embeddings, index_name: str = "index" - ) -> FAISS: - """Load FAISS index, docstore, and index_to_docstore_id from disk. - - Args: - folder_path: folder path to load index, docstore, - and index_to_docstore_id from. - embeddings: Embeddings to use when generating queries - index_name: for saving with a specific index file name - """ - path = Path(folder_path) - # load index separately since it is not picklable - faiss = dependable_faiss_import() - index = faiss.read_index( - str(path / "{index_name}.faiss".format(index_name=index_name)) - ) - - # load docstore and index_to_docstore_id - with open(path / "{index_name}.pkl".format(index_name=index_name), "rb") as f: - docstore, index_to_docstore_id = pickle.load(f) - return cls(embeddings.embed_query, index, docstore, index_to_docstore_id) - - def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - filter: Optional[Dict[str, Any]] = None, - fetch_k: int = 20, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and their similarity scores on a scale from 0 to 1.""" - if self.relevance_score_fn is None: - raise ValueError( - "normalize_score_fn must be provided to" - " FAISS constructor to normalize scores" - ) - docs_and_scores = self.similarity_search_with_score( - query, - k=k, - filter=filter, - fetch_k=fetch_k, - **kwargs, - ) - return [(doc, self.relevance_score_fn(score)) for doc, score in docs_and_scores] diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/utils.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/utils.py deleted file mode 100644 index c10bde09e79b..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/utils.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Utility functions for working with vectors and vectorstores.""" - -from typing import List - -import numpy as np - -from azure.ai.generative.index._langchain.vendor.utils.math import cosine_similarity - - -def maximal_marginal_relevance( - query_embedding: np.ndarray, - embedding_list: list, - lambda_mult: float = 0.5, - k: int = 4, -) -> List[int]: - """Calculate maximal marginal relevance.""" - if min(k, len(embedding_list)) <= 0: - return [] - if query_embedding.ndim == 1: - query_embedding = np.expand_dims(query_embedding, axis=0) - similarity_to_query = cosine_similarity(query_embedding, embedding_list)[0] - most_similar = int(np.argmax(similarity_to_query)) - idxs = [most_similar] - selected = np.array([embedding_list[most_similar]]) - while len(idxs) < min(k, len(embedding_list)): - best_score = -np.inf - idx_to_add = -1 - similarity_to_selected = cosine_similarity(embedding_list, selected) - for i, query_score in enumerate(similarity_to_query): - if i in idxs: - continue - redundant_score = max(similarity_to_selected[i]) - equation_score = ( - lambda_mult * query_score - (1 - lambda_mult) * redundant_score - ) - if 
equation_score > best_score: - best_score = equation_score - idx_to_add = i - idxs.append(idx_to_add) - selected = np.append(selected, [embedding_list[idx_to_add]], axis=0) - return idxs diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_mlindex.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_mlindex.py index 6b352ef70bdf..6ffd231a5e1a 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_mlindex.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_mlindex.py @@ -13,7 +13,7 @@ from azure.core.credentials import TokenCredential from azure.ai.generative.index._documents import Document, DocumentChunksIterator from azure.ai.generative.index._embeddings import EmbeddingsContainer -from azure.ai.generative.index._utils.connections import ( +from azure.ai.resources._index._utils.connections import ( BaseConnection, WorkspaceConnection, get_connection_by_id_v2, @@ -158,7 +158,7 @@ def as_langchain_vectorstore(self, credential: Optional[TokenCredential] = None) langchain_pkg_version = pkg_version.parse(langchain_version) if index_kind == "acs": - from azure.ai.generative.index._indexes.azure_search import import_azure_search_or_so_help_me + from azure.ai.resources._index._indexes.azure_search import import_azure_search_or_so_help_me import_azure_search_or_so_help_me() @@ -274,11 +274,11 @@ def as_langchain_vectorstore(self, credential: Optional[TokenCredential] = None) f"Failed to load FAISS Index using installed version of langchain, retrying with vendored FAISS VectorStore.\n{e}" ) - from azure.ai.generative.index._langchain.vendor.vectorstores.faiss import FAISS + from azure.ai.resources._index._langchain.vendor.vectorstores.faiss import FAISS store = FAISS.load_local(str(tmpdir), embeddings) elif engine.endswith("indexes.faiss.FaissAndDocStore"): - from azure.ai.generative.index._indexes.faiss import FaissAndDocStore + from azure.ai.resources._index._indexes.faiss import FaissAndDocStore error_fmt_str = """Failed to import langchain faiss bridge module with: {e}\n" This could be due to an incompatible change in langchain since this bridge was implemented. If you understand what has changed you could implement your own wrapper of azure.ai.tools.mlindex._indexes.faiss.FaissAndDocStore. @@ -381,7 +381,7 @@ def as_native_index_client(self, credential: Optional[TokenCredential] = None): """ Converts MLIndex config into a client for the underlying Index, may download files. - An azure.search.documents.SearchClient for acs indexes or an azure.ai.generative.index._indexes.indexFaissAndDocStore for faiss indexes. + An azure.search.documents.SearchClient for acs indexes or an azure.ai.resources._index._indexes.indexFaissAndDocStore for faiss indexes. 
""" index_kind = self.index_config.get("kind", None) if index_kind == "acs": @@ -396,7 +396,7 @@ def as_native_index_client(self, credential: Optional[TokenCredential] = None): api_version=self.index_config.get("api_version", "2023-07-01-preview"), ) elif index_kind == "faiss": - from azure.ai.generative.index._indexes.faiss import FaissAndDocStore + from azure.ai.resources._index._indexes.faiss import FaissAndDocStore embeddings = self.get_langchain_embeddings(credential=credential) @@ -457,7 +457,7 @@ def override_connections( else: self.embeddings_config["connection_type"] = "workspace_connection" if isinstance(embedding_connection, str): - from azure.ai.generative.index._utils.connections import get_connection_by_id_v2 + from azure.ai.resources._index._utils.connections import get_connection_by_id_v2 embedding_connection = get_connection_by_id_v2(embedding_connection, credential=credential) self.embeddings_config["connection"] = {"id": get_id_from_connection(embedding_connection)} if index_connection: @@ -466,7 +466,7 @@ def override_connections( else: self.index_config["connection_type"] = "workspace_connection" if isinstance(index_connection, str): - from azure.ai.generative.index._utils.connections import get_connection_by_id_v2 + from azure.ai.resources._index._utils.connections import get_connection_by_id_v2 index_connection = get_connection_by_id_v2(index_connection, credential=credential) self.index_config["connection"] = {"id": get_id_from_connection(index_connection)} self.save(just_config=True) # type: ignore[call-arg] @@ -619,7 +619,7 @@ def from_documents( if isinstance(embeddings_model, str): connection_args = {} if "open_ai" in embeddings_model: - from azure.ai.generative.index._utils.connections import get_connection_by_id_v2 + from azure.ai.resources._index._utils.connections import get_connection_by_id_v2 if embeddings_connection: if isinstance(embeddings_connection, str): @@ -725,7 +725,7 @@ def from_embeddings_container( ) elif index_type == "acs": from azure.ai.generative.index._tasks.update_acs import create_index_from_raw_embeddings - from azure.ai.generative.index._utils.connections import get_connection_by_id_v2 + from azure.ai.resources._index._utils.connections import get_connection_by_id_v2 if not index_connection: index_config = { diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_models.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_models.py deleted file mode 100644 index d3fe34dd68b4..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_models.py +++ /dev/null @@ -1,218 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. 
-# --------------------------------------------------------- -"""Language model classes.""" -import copy -import json -import os -from typing import Dict, Optional, Union - -from azure.core.credentials import TokenCredential -from azure.ai.generative.constants._common import USER_AGENT_HEADER_KEY -from azure.ai.generative.index._utils.connections import ( - connection_to_credential, - get_connection_by_id_v2, - get_connection_credential, -) -from azure.ai.generative.index._utils.logging import get_logger -from azure.ai.generative._user_agent import USER_AGENT - -try: - from azure.ai.resources.entities import BaseConnection -except Exception: - BaseConnection = None -try: - from azure.ai.ml.entities import WorkspaceConnection -except Exception: - WorkspaceConnection = None - -logger = get_logger(__name__) - - -def parse_model_uri(uri: str, **kwargs) -> dict: - """Parse a model URI into a dictionary of configuration parameters.""" - scheme, details = uri.split("://") - - def split_details(details): - details = details.split("/") - dets = {} - for i in range(0, len(details), 2): - dets[details[i]] = details[i + 1] - return dets - - config = {**kwargs} - if scheme == "azure_open_ai": - config = {**split_details(details), **config} - config["kind"] = "open_ai" - if "endpoint" in config: - if config["endpoint"] and (".openai." in config["endpoint"] or ".api.cognitive." in config["endpoint"] or ".cognitiveservices." in config["endpoint"]): - config["api_base"] = config["endpoint"].rstrip("/") - else: - config["api_base"] = f"https://{config['endpoint']}.openai.azure.com" - config["api_type"] = "azure" - config["api_version"] = kwargs.get("api_version") if kwargs.get("api_version") is not None else "2023-03-15-preview" - # Azure OpenAI has a batch_size limit of 16 - if "batch_size" not in config: - config["batch_size"] = "16" - elif scheme == "open_ai": - config["kind"] = "open_ai" - config = {**split_details(details), **config} - config["api_type"] = "open_ai" - elif scheme == "hugging_face": - config["kind"] = "hugging_face" - config["model"] = details.split("model/")[1] - elif scheme == "none": - config["kind"] = "none" - else: - raise ValueError(f"Unknown model kind: {scheme}") - - return config - - -def init_open_ai_from_config(config: dict, credential: Optional[TokenCredential]) -> Dict: - """Initialize an OpenAI model from a configuration dictionary.""" - import openai - - logger.debug("OpenAI arguments: \n") - logger.debug("\n".join(f"{k}={v}" if k != "key" and k != "api_key" else f"{k}=[REDACTED]" for k, v in config.items())) - - try: - if config.get("key") is not None: - config["api_key"] = config.get("key") - elif "connection_type" not in config: - if config.get("api_key") is None: - config["api_key"] = os.environ.get("OPENAI_API_KEY", None) - if config["api_key"] is None and "azure" in config["api_type"]: - from azure.identity import DefaultAzureCredential - - credential = DefaultAzureCredential(process_timeout=60) if credential is None else credential - config["api_key"] = credential.get_token("https://cognitiveservices.azure.com/.default").token - config["api_type"] = "azure_ad" - else: - if config["connection_type"] == "workspace_connection": - connection_id = config.get("connection", {}).get("id", "") - connection = get_connection_by_id_v2(connection_id, credential=credential) - # Only change base, version, and type in AOAI case - if hasattr(connection, "type"): - connection_obj: Union[WorkspaceConnection, BaseConnection] = connection - if connection_obj.type == "azure_open_ai": - 
config["api_base"] = connection_obj.target - connection_metadata = connection_obj.metadata - config["api_version"] = connection_obj.metadata.get("apiVersion", connection_metadata.get("ApiVersion", "2023-07-01-preview")) - config["api_type"] = connection_obj.metadata.get("apiType", connection_metadata.get("ApiType", "azure")).lower() - elif isinstance(connection, dict) and connection.get("properties", {}).get("category", None) == "AzureOpenAI": - config["api_base"] = connection.get("properties", {}).get("target") - connection_metadata = connection.get("properties", {}).get("metadata", {}) - config["api_version"] = connection_metadata.get("apiVersion", connection_metadata.get("ApiVersion", "2023-03-15-preview")) - config["api_type"] = connection_metadata.get("apiType", connection_metadata.get("ApiType", "azure")).lower() - - if config["api_type"] == "azure_ad" or config["api_type"] == "azuread": - from azure.identity import DefaultAzureCredential - - credential = DefaultAzureCredential(process_timeout=60) if credential is None else credential - else: - credential = connection_to_credential(connection) - else: - credential = get_connection_credential(config) - - if not hasattr(credential, "key"): - # Add hack to check for "BAKER-OPENAI-API-KEY" - if config.get("connection_type", "workspace_keyvault") == "workspace_keyvault": - new_args = copy.deepcopy(config) - new_args["connection"]["key"] = "BAKER-OPENAI-API-KEY" - credential = get_connection_credential(new_args) - - if hasattr(credential, "key"): - config["api_key"] = credential.key # type: ignore[union-attr] - else: - config["api_key"] = credential.get_token("https://cognitiveservices.azure.com/.default").token # type: ignore[union-attr] - config["api_type"] = "azure_ad" - except Exception as e: - if "OPENAI_API_KEY" in os.environ: - logger.warning(f"Failed to get credential for ACS with {e}, falling back to openai 0.x env vars.") - config["api_key"] = os.environ["OPENAI_API_KEY"] - config["api_type"] = os.environ.get("OPENAI_API_TYPE", "azure") - config["api_base"] = os.environ.get("OPENAI_API_BASE", openai.api_base if hasattr(openai, "api_base") else openai.base_url) - config["api_version"] = os.environ.get("OPENAI_API_VERSION", openai.api_version) - elif "AZURE_OPENAI_KEY" in os.environ: - logger.warning(f"Failed to get credential for ACS with {e}, falling back to openai 1.x env vars.") - config["api_key"] = os.environ["AZURE_OPENAI_KEY"] - config["api_type"] = os.environ.get("OPENAI_API_TYPE", "azure") - config["azure_endpoint"] = os.environ.get("AZURE_OPENAI_ENDPOINT") - config["api_version"] = os.environ.get("OPENAI_API_VERSION", openai.api_version) - else: - raise e - - if openai.api_type and "azure" in openai.api_type: - config["api_version"] = config.get("api_version", "2023-03-15-preview") - - return config - - -# TODO: Vendor langchain deps or move to langchain module. 
-def init_llm(model_config: dict, **kwargs): - """Initialize a language model from a model configuration.""" - from langchain.chat_models.azure_openai import AzureChatOpenAI - from langchain.chat_models.openai import ChatOpenAI - from langchain.llms import AzureOpenAI - - llm = None - logger.debug(f"model_config: {json.dumps(model_config, indent=2)}") - model_kwargs = { - "frequency_penalty": model_config.get("frequency_penalty", 0), - "presence_penalty": model_config.get("presence_penalty", 0), - } - if model_config.get("stop") is not None: - model_kwargs["stop"] = model_config.get("stop") - if model_config.get("kind") == "open_ai" and model_config.get("api_type") == "azure": - model_config = init_open_ai_from_config(model_config, credential=None) - if model_config["model"].startswith("gpt-3.5-turbo") or model_config["model"].startswith("gpt-35-turbo") or model_config["model"].startswith("gpt-4"): - logger.info(f"Initializing AzureChatOpenAI with model {model_config['model']} with kwargs: {model_kwargs}") - - llm = AzureChatOpenAI( - deployment_name=model_config["deployment"], - model=model_config["model"], - max_tokens=model_config.get("max_tokens"), - model_kwargs=model_kwargs, - openai_api_key=model_config.get("api_key"), - openai_api_base=model_config.get("api_base"), - openai_api_type=model_config.get("api_type"), - openai_api_version=model_config.get("api_version"), - max_retries=model_config.get("max_retries", 3), - default_headers={USER_AGENT_HEADER_KEY: USER_AGENT}, - **kwargs - ) # type: ignore - if model_config.get("temperature", None) is not None: - llm.temperature = model_config.get("temperature") - else: - logger.info(f"Initializing AzureOpenAI with model {model_config['model']} with kwargs: {model_kwargs}") - - llm = AzureOpenAI( - deployment_name=model_config["deployment"], - model=model_config["model"], - max_tokens=model_config.get("max_tokens"), - model_kwargs=model_kwargs, - openai_api_key=model_config.get("api_key"), - max_retries=model_config.get("max_retries", 3), - default_headers={USER_AGENT_HEADER_KEY: USER_AGENT}, - **kwargs - ) # type: ignore - if model_config.get("temperature", None) is not None: - llm.temperature = model_config.get("temperature") - elif model_config.get("kind") == "open_ai" and model_config.get("api_type") == "open_ai": - logger.info(f"Initializing OpenAI with model {model_config['model']} with kwargs: {model_kwargs}") - model_config = init_open_ai_from_config(model_config, credential=None) - llm = ChatOpenAI( - model=model_config["model"], - max_tokens=model_config.get("max_tokens"), - model_kwargs=model_kwargs, - openai_api_key=model_config.get("api_key"), - default_headers={USER_AGENT_HEADER_KEY: USER_AGENT}, - **kwargs - ) # type: ignore - if model_config.get("temperature", None) is not None: - llm.temperature = model_config.get("temperature") - else: - raise ValueError(f"Unsupported llm kind: {model_config.get('kind')}") - - return llm diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk.py index 85187a934d8b..7632b67d1d9b 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk.py @@ -16,7 +16,6 @@ from azure.ai.generative.index._documents import ( SUPPORTED_EXTENSIONS, ChunkedDocument, - Document, DocumentChunksIterator, DocumentSource, ) @@ -30,6 +29,7 @@ safe_mlflow_start_run, track_activity, ) +from 
azure.ai.resources._index._documents import Document logger = get_logger("crack_and_chunk") diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed.py index 45562e5aa8a6..bb0aab5fcee1 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed.py @@ -21,7 +21,7 @@ from azure.ai.generative.index._embeddings import DataEmbeddedDocument, EmbeddedDocumentSource, EmbeddingsContainer from azure.ai.generative.index._mlindex import MLIndex from azure.ai.generative.index._tasks.crack_and_chunk import custom_loading, get_activity_logging_filter, str2bool -from azure.ai.generative.index._documents.document import Document, DocumentSource +from azure.ai.generative.index._documents.document import DocumentSource from azure.ai.generative.index._utils.logging import ( _logger_factory, enable_appinsights_logging, @@ -30,6 +30,7 @@ safe_mlflow_start_run, track_activity, ) +from azure.ai.resources._index._documents import Document logger = get_logger("crack_and_chunk_and_embed") @@ -62,7 +63,7 @@ def crack_and_chunk_and_embed( if isinstance(embeddings_connection, str): connection_args["connection"] = {"id": embeddings_connection} else: - from azure.ai.generative.index._utils.connections import get_id_from_connection + from azure.ai.resources._index._utils.connections import get_id_from_connection connection_args["connection"] = {"id": get_id_from_connection(embeddings_connection)} diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed_and_index.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed_and_index.py index 7527a7a6d73e..d27b907fd395 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed_and_index.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed_and_index.py @@ -81,17 +81,17 @@ def crack_and_chunk_and_embed_and_index( if index_connection is not None: connection_args["connection_type"] = "workspace_connection" if isinstance(embeddings_connection, str): - from azure.ai.generative.index._utils.connections import get_connection_by_id_v2 + from azure.ai.resources._index._utils.connections import get_connection_by_id_v2 connection_args["connection"] = {"id": index_connection} connection = get_connection_by_id_v2(index_connection) else: - from azure.ai.generative.index._utils.connections import get_id_from_connection + from azure.ai.resources._index._utils.connections import get_id_from_connection connection_args["connection"] = {"id": get_id_from_connection(index_connection)} connection = index_connection - from azure.ai.generative.index._utils.connections import ( + from azure.ai.resources._index._utils.connections import ( get_metadata_from_connection, get_target_from_connection, ) @@ -107,7 +107,7 @@ def crack_and_chunk_and_embed_and_index( ) elif index_type == "faiss": logger.info(f"Creating Faiss index from embeddings_container with config {index_config}") - mlindex = embeddings_container.write_as_faiss_mlindex(output_path, engine="azure.ai.generative.index._indexes.faiss.FaissAndDocStore") + mlindex = embeddings_container.write_as_faiss_mlindex(output_path, engine="azure.ai.resources._index._indexes.faiss.FaissAndDocStore") else: raise ValueError(f"Unsupported 
index_type {index_type}") diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed.py index 44dad28aaae2..fde34ef29576 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed.py @@ -13,10 +13,6 @@ from typing import Iterator, List, Optional import pandas as pd -from azure.ai.generative.index._documents import ( - Document, - StaticDocument, -) from azure.ai.generative.index._embeddings import EmbeddingsContainer from azure.ai.generative.index._utils.logging import ( _logger_factory, @@ -27,6 +23,7 @@ safe_mlflow_start_run, track_activity, ) +from azure.ai.resources._index._documents import Document, StaticDocument logger = get_logger("embed") @@ -279,7 +276,7 @@ def main(args, logger, activity_logger): connection_args["connection"] = {"id": connection_id} else: if "open_ai" in args.embeddings_model: - from azure.ai.generative.index._utils.azureml import get_workspace_from_environment + from azure.ai.resources._index._utils.azureml import get_workspace_from_environment ws = get_workspace_from_environment() connection_args["connection_type"] = "workspace_keyvault" diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed_prs.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed_prs.py index 4c8a3db230ea..5f0c478967de 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed_prs.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed_prs.py @@ -11,7 +11,7 @@ import pandas as pd from azure.ai.generative.index._embeddings import EmbeddingsContainer from azure.ai.generative.index._tasks.embed import read_chunks_into_documents -from azure.ai.generative.index._utils.azureml import get_workspace_from_environment +from azure.ai.resources._index._utils.azureml import get_workspace_from_environment from azure.ai.generative.index._utils.logging import ( _logger_factory, enable_appinsights_logging, diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/generate_qa.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/generate_qa.py index eb14809d04d7..5a379a9f4b22 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/generate_qa.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/generate_qa.py @@ -13,7 +13,7 @@ import pandas as pd from azureml.core import Run from azure.ai.generative.index.data_generation.qa import QADataGenerator, GenerationResult, QAType -from azure.ai.generative.index._utils.connections import (get_connection_by_id_v2, +from azure.ai.resources._index._utils.connections import (get_connection_by_id_v2, get_connection_credential, connection_to_credential) from azure.ai.generative.index._utils.logging import (enable_appinsights_logging, @@ -26,7 +26,7 @@ def get_model_config(llm_config: Dict[str, Union[str, int]], openai_api_type: str, openai_api_version: str, activity_logger: Logger): """Get model_config from llm_config. llm_config format is used in Baker pipelines. 
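For orientation, the conversion `get_model_config` performs on the docstring's `llm_config` is essentially a `type` -> `kind` rename. A minimal sketch, with made-up field values (only the rename itself is taken from this diff):

```python
# Hypothetical llm_config in the Baker-pipeline shape; only the
# "type" -> "kind" rename mirrors what get_model_config does.
llm_config = {"type": "azure_open_ai", "model_name": "gpt-35-turbo"}

model_config = llm_config.copy()
model_config["kind"] = model_config.pop("type")  # same effect as assign + del in the function
print(model_config)  # {'model_name': 'gpt-35-turbo', 'kind': 'azure_open_ai'}
```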
- model_config format is accepted by `azure.ai.generative.index._models.init_llm()`.""" + model_config format is accepted by `azure.ai.resources._index._models.init_llm()`.""" model_config = llm_config.copy() model_config['kind'] = model_config['type'] del model_config['type'] diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/git_clone.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/git_clone.py index 8776f22c0a7d..67c3e8844010 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/git_clone.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/git_clone.py @@ -16,7 +16,7 @@ def main(args, logger, activity_logger): try: connection_id = os.environ.get('AZUREML_WORKSPACE_CONNECTION_ID_GIT') if connection_id is not None and connection_id != '': - from azure.ai.generative.index._utils.connections import get_connection_by_id_v2 + from azure.ai.resources._index._utils.connections import get_connection_by_id_v2 connection = get_connection_by_id_v2(connection_id) if args.git_repository != connection['properties']['target']: diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_acs.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_acs.py index 2cda41ab5cb5..286164d2f431 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_acs.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_acs.py @@ -15,7 +15,7 @@ from azure.ai.generative.index._embeddings import EmbeddingsContainer, ReferenceEmbeddedDocument from azure.ai.generative.index._mlindex import MLIndex -from azure.ai.generative.index._utils.connections import get_connection_credential +from azure.ai.resources._index._utils.connections import get_connection_credential from azure.ai.generative.index._utils.logging import ( _logger_factory, enable_appinsights_logging, @@ -486,7 +486,7 @@ def main(args, logger, activity_logger): if args.connection_id is not None: connection_args["connection_type"] = "workspace_connection" connection_args["connection"] = {"id": args.connection_id} - from azure.ai.generative.index._utils.connections import ( + from azure.ai.resources._index._utils.connections import ( get_connection_by_id_v2, get_metadata_from_connection, get_target_from_connection, diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_pinecone.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_pinecone.py index f8e7c24132ad..da6a8efa6411 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_pinecone.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_pinecone.py @@ -14,7 +14,7 @@ from azure.core.credentials import AzureKeyCredential, TokenCredential from azure.ai.generative.index._embeddings import EmbeddingsContainer, ReferenceEmbeddedDocument from azure.ai.generative.index._mlindex import MLIndex -from azure.ai.generative.index._utils.connections import get_connection_credential +from azure.ai.resources._index._utils.connections import get_connection_credential from azure.ai.generative.index._utils.logging import ( _logger_factory, get_logger, @@ -303,7 +303,7 @@ def main(args, logger, activity_logger): if args.connection_id is not None: connection_args["connection_type"] = "workspace_connection" connection_args["connection"] = {"id": args.connection_id} - from azure.ai.generative.index._utils.connections import ( + from azure.ai.resources._index._utils.connections import ( 
get_connection_by_id_v2, get_metadata_from_connection, ) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/azureml.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/azureml.py deleted file mode 100644 index f0958f52ca87..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/azureml.py +++ /dev/null @@ -1,44 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- -"""Functions for interacting with AzureML.""" -from typing import Dict, List - -from azure.ai.generative.index._utils.logging import get_logger - -logger = get_logger(__name__) - - -def get_workspace_from_environment(): - """Get the workspace from the run context if running in Azure, otherwise return None.""" - from azureml.core import Run - - run = Run.get_context() - if hasattr(run, "experiment"): - # We are running in Azure - return run.experiment.workspace - else: - return None - - -def get_secret_from_workspace(name: str, workspace=None) -> str: - """Get a secret from the workspace if running in Azure, otherwise get it from the environment.""" - secrets = get_secrets_from_workspace([name], workspace) - return secrets[name] - - -def get_secrets_from_workspace(names: List[str], workspace=None) -> Dict[str, str]: - """Get a secret from the workspace if running in Azure, otherwise get it from the environment.""" - import os - - ws = get_workspace_from_environment() if workspace is None else workspace - if ws: - keyvault = ws.get_default_keyvault() - secrets = keyvault.get_secrets(names) - logger.info("Run context and secrets retrieved", extra={"print": True}) - else: - secrets = {} - for name in names: - secrets[name] = os.environ.get(name, os.environ.get(name.replace("-", "_"))) - - return secrets diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/connections.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/connections.py deleted file mode 100644 index 8640f42fe130..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/connections.py +++ /dev/null @@ -1,321 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. 
-# --------------------------------------------------------- -"""MLIndex auth connection utilities.""" -import json -import os -import re -from typing import Any, Dict, Optional, Union - -from azure.ai.generative.index._utils.logging import get_logger -from azure.ai.generative.index._utils.requests import create_session_with_retry, send_post_request - -try: - from azure.ai.resources.entities import BaseConnection -except Exception: - BaseConnection = None -try: - from azure.ai.ml import MLClient - from azure.ai.ml.entities import WorkspaceConnection -except Exception: - MLClient = None - WorkspaceConnection = None -try: - from azure.core.credentials import TokenCredential -except Exception: - TokenCredential = object - -logger = get_logger("connections") - -def get_pinecone_environment(config, credential: Optional[TokenCredential] = None): - """Get the Pinecone project environment from a connection.""" - connection_type = config.get("connection_type", None) - if connection_type != "workspace_connection": - raise ValueError(f"Unsupported connection type for Pinecone index: {connection_type}") - - connection_id = config.get("connection", {}).get("id") - connection = get_connection_by_id_v2(connection_id, credential=credential) - return get_metadata_from_connection(connection)["environment"] - - -def get_connection_credential(config, credential: Optional[TokenCredential] = None): - """Get a credential for a connection.""" - try: - from azure.core.credentials import AzureKeyCredential - except ImportError as e: - raise ValueError( - "Could not import azure-core python package. " - "Please install it with `pip install azure-core`." - ) from e - try: - from azure.identity import DefaultAzureCredential - except ImportError as e: - raise ValueError( - "Could not import azure-identity python package. " - "Please install it with `pip install azure-identity`." - ) from e - - if config.get("connection_type", None) == "workspace_keyvault": - from azureml.core import Run, Workspace - run = Run.get_context() - if hasattr(run, "experiment"): - ws = run.experiment.workspace - else: - try: - ws = Workspace( - subscription_id=config.get("connection", {}).get("subscription"), - resource_group=config.get("connection", {}).get("resource_group"), - workspace_name=config.get("connection", {}).get("workspace") - ) - except Exception as e: - logger.warning(f"Could not get workspace '{config.get('connection', {}).get('workspace')}': {e}") - # Fall back to looking for key in environment. 
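The deleted `get_connection_credential` dispatches on `config["connection_type"]`. A sketch of the three explicit config shapes it accepts (all values here are hypothetical placeholders, not taken from this diff):

```python
# Shapes get_connection_credential dispatched on; values are made up.
workspace_connection_config = {
    "connection_type": "workspace_connection",
    "connection": {"id": "/subscriptions/.../connections/my-aoai"},
}
workspace_keyvault_config = {
    "connection_type": "workspace_keyvault",
    "connection": {
        "subscription": "<subscription-id>",
        "resource_group": "<resource-group>",
        "workspace": "<workspace-name>",
        "key": "OPENAI-API-KEY",  # secret name in the workspace's default Key Vault
    },
}
environment_config = {
    "connection_type": "environment",
    "connection": {"key": "OPENAI_API_KEY"},  # environment variable to read
}
```

Anything else falls through to the passed-in credential or `DefaultAzureCredential`.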
-                import os
-                key = os.environ.get(config.get("connection", {}).get("key"))
-                if key is None:
-                    raise ValueError(f"Could not get workspace '{config.get('connection', {}).get('workspace')}' and no key named '{config.get('connection', {}).get('key')}' in environment")
-                return AzureKeyCredential(key)
-
-        keyvault = ws.get_default_keyvault()
-        connection_credential = AzureKeyCredential(keyvault.get_secret(config.get("connection", {}).get("key")))
-    elif config.get("connection_type", None) == "workspace_connection":
-        connection_id = config.get("connection", {}).get("id")
-        connection = get_connection_by_id_v2(connection_id, credential=credential)
-        connection_credential = connection_to_credential(connection)
-    elif config.get("connection_type", None) == "environment":
-        import os
-        key = os.environ.get(config.get("connection", {}).get("key", "OPENAI_API_KEY"))
-        connection_credential = (credential if credential is not None else DefaultAzureCredential(process_timeout=60)) if key is None else AzureKeyCredential(key)
-    else:
-        connection_credential = credential if credential is not None else DefaultAzureCredential(process_timeout=60)
-
-    return connection_credential
-
-
-def workspace_connection_to_credential(connection: Union[dict, BaseConnection, WorkspaceConnection]):
-    """Get a credential for a workspace connection."""
-    return connection_to_credential(connection)
-
-
-def connection_to_credential(connection: Union[dict, BaseConnection, WorkspaceConnection]):
-    """Get a credential for a workspace connection."""
-    if isinstance(connection, dict):
-        props = connection["properties"]
-        auth_type = props.get("authType", props.get("AuthType"))
-        if auth_type == "ApiKey":
-            from azure.core.credentials import AzureKeyCredential
-            return AzureKeyCredential(props["credentials"]["key"])
-        elif auth_type == "PAT":
-            from azure.core.credentials import AccessToken
-            return AccessToken(props["credentials"]["pat"], props.get("expiresOn", None))
-        elif auth_type == "CustomKeys":
-            # OpenAI connections are made with CustomKeys auth, so we can try to access the key using known structure
-            from azure.core.credentials import AzureKeyCredential
-            if connection.get("metadata", {}).get("azureml.flow.connection_type", None) == "OpenAI":
-                # Try to get the key with api_key; if that fails, default to regular CustomKeys handling
-                try:
-                    key = props["credentials"]["keys"]["api_key"]
-                    return AzureKeyCredential(key)
-                except Exception as e:
-                    logger.warning(f"Could not get key using api_key, using default handling: {e}")
-            key_dict = props["credentials"]["keys"]
-            if len(key_dict.keys()) != 1:
-                raise ValueError(f"Only connections with a single key can be used. 
Number of keys present: {len(key_dict.keys())}") - return AzureKeyCredential(props["credentials"]["keys"][list(key_dict.keys())[0]]) - else: - raise ValueError(f"Unknown auth type '{auth_type}'") - elif isinstance(connection, WorkspaceConnection): - if connection.credentials.type.lower() == "api_key": - from azure.core.credentials import AzureKeyCredential - return AzureKeyCredential(connection.credentials.key) - elif connection.credentials.type.lower() == "pat": - from azure.core.credentials import AccessToken - return AccessToken(connection.credentials.pat, connection.credentials.expires_on) - elif connection.credentials.type.lower() == "custom_keys": - if connection._metadata.get("azureml.flow.connection_type", "").lower() == "openai": - from azure.core.credentials import AzureKeyCredential - try: - key = connection.credentials.keys.api_key - return AzureKeyCredential(key) - except Exception as e: - logger.warning(f"Could not get key using api_key, using default handling: {e}") - key_dict = connection.credentials.keys - if len(key_dict.keys()) != 1: - raise ValueError(f"Only connections with a single key can be used. Number of keys present: {len(key_dict.keys())}") - return AzureKeyCredential(connection.credentials.keys[list(key_dict.keys())[0]]) - else: - raise ValueError(f"Unknown auth type '{connection.credentials.type}' for connection '{connection.name}'") - else: - if connection.credentials.type.lower() == "api_key": - from azure.core.credentials import AzureKeyCredential - return AzureKeyCredential(connection.credentials.key) - else: - raise ValueError(f"Unknown auth type '{connection.credentials.type}' for connection '{connection.name}'") - - -def get_connection_by_id_v2(connection_id: str, credential: Optional[TokenCredential] = None, client: str = "sdk") -> Union[Dict[str, Dict[str, Dict[str, Any]]], WorkspaceConnection, BaseConnection]: - """ - Get a connection by id using azure.ai.ml or azure.ai.generative. - - If azure.ai.ml is installed, use that, otherwise use azure.ai.generative. - """ - uri_match = re.match(r"/subscriptions/(.*)/resourceGroups/(.*)/providers/Microsoft.MachineLearningServices/workspaces/(.*)/connections/(.*)", connection_id, flags=re.IGNORECASE) - - if uri_match is None: - logger.error(f"Invalid connection_id {connection_id}, expecting Azure Machine Learning resource ID") - raise ValueError(f"Invalid connection id {connection_id}") - - logger.info(f"Getting workspace connection: {uri_match.group(4)}") - - from azureml.dataprep.api._aml_auth._azureml_token_authentication import AzureMLTokenAuthentication - - if credential is None: - from azure.identity import DefaultAzureCredential - - if os.environ.get("AZUREML_RUN_ID", None) is not None: - credential = AzureMLTokenAuthentication._initialize_aml_token_auth() - else: - credential = credential if credential is not None else DefaultAzureCredential(process_timeout=60) - - logger.info(f"Using auth: {type(credential)}") - - if client == "sdk" and MLClient is not None: - logger.info("Getting workspace connection via MLClient") - ml_client = MLClient( - credential=credential, - subscription_id=uri_match.group(1), - resource_group_name=uri_match.group(2), - workspace_name=uri_match.group(3) - ) - - if os.environ.get("AZUREML_RUN_ID", None) is not None: - # In AzureML Run context, we need to use workspaces internal endpoint that will accept AzureMLToken auth. 
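Before making any client call, `get_connection_by_id_v2` validates the connection ID against an Azure Machine Learning resource-ID regex. A runnable sketch with a made-up ID (everything except the path segment names is hypothetical):

```python
import re

# Hypothetical connection ID; only the segment layout follows the
# pattern the function matches.
connection_id = (
    "/subscriptions/00000000-0000-0000-0000-000000000000"
    "/resourceGroups/my-rg"
    "/providers/Microsoft.MachineLearningServices"
    "/workspaces/my-workspace"
    "/connections/my-aoai-connection"
)
uri_match = re.match(
    r"/subscriptions/(.*)/resourceGroups/(.*)/providers"
    r"/Microsoft.MachineLearningServices/workspaces/(.*)/connections/(.*)",
    connection_id,
    flags=re.IGNORECASE,
)
assert uri_match is not None
subscription, resource_group, workspace, name = uri_match.groups()
print(name)  # my-aoai-connection
```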
- old_base_url = ml_client.connections._operation._client._base_url - ml_client.connections._operation._client._base_url = f"{os.environ.get('AZUREML_SERVICE_ENDPOINT')}/rp/workspaces" - - logger.info(f"Using ml_client base_url: {ml_client.connections._operation._client._base_url}") - - list_secrets_response = ml_client.connections._operation.list_secrets( - connection_name=uri_match.group(4), - resource_group_name=ml_client.resource_group_name, - workspace_name=ml_client.workspace_name, - ) - connection = WorkspaceConnection._from_rest_object(list_secrets_response) - logger.info(f"Got Connection: {connection.id}") - - if os.environ.get("AZUREML_RUN_ID", None) is not None: - ml_client.connections._operation._client._base_url = old_base_url - else: - logger.info("Getting workspace connection via REST as fallback") - return get_connection_by_id_v1(connection_id, credential) - - return connection - - -def get_id_from_connection(connection: Union[dict, WorkspaceConnection, BaseConnection]) -> str: - """Get a connection id from a connection.""" - if isinstance(connection, dict): - return connection["id"] - elif isinstance(connection, WorkspaceConnection): - return connection.id - elif isinstance(connection, BaseConnection): - return connection.id - else: - raise ValueError(f"Unknown connection type: {type(connection)}") - - -def get_target_from_connection(connection: Union[dict, WorkspaceConnection, BaseConnection]) -> str: - """Get a connection target from a connection.""" - if isinstance(connection, dict): - return connection["properties"]["target"] - elif isinstance(connection, WorkspaceConnection): - return connection.target - elif isinstance(connection, BaseConnection): - return connection.target - else: - raise ValueError(f"Unknown connection type: {type(connection)}") - - -def get_metadata_from_connection(connection: Union[dict, WorkspaceConnection, BaseConnection]) -> dict: - """Get a connection metadata from a connection.""" - if isinstance(connection, dict): - return connection["properties"]["metadata"] - elif isinstance(connection, WorkspaceConnection): - return connection.metadata - elif isinstance(connection, BaseConnection): - return connection.metadata - else: - raise ValueError(f"Unknown connection type: {type(connection)}") - - -def get_connection_by_name_v2(workspace, name: str) -> dict: - """Get a connection from a workspace.""" - if hasattr(workspace._auth, "get_token"): - bearer_token = workspace._auth.get_token("https://management.azure.com/.default").token - else: - bearer_token = workspace._auth.token - - endpoint = workspace.service_context._get_endpoint("api") - url = f"{endpoint}/rp/workspaces/subscriptions/{workspace.subscription_id}/resourcegroups/{workspace.resource_group}/providers/Microsoft.MachineLearningServices/workspaces/{workspace.name}/connections/{name}/listsecrets?api-version=2023-02-01-preview" - resp = send_post_request(url, { - "Authorization": f"Bearer {bearer_token}", - "content-type": "application/json" - }, {}) - - return resp.json() - - -def get_connection_by_id_v1(connection_id: str, credential: Optional[TokenCredential] = None) -> dict: - """Get a connection from a workspace.""" - uri_match = re.match(r"/subscriptions/(.*)/resourceGroups/(.*)/providers/Microsoft.MachineLearningServices/workspaces/(.*)/connections/(.*)", connection_id) - - if uri_match is None: - logger.error(f"Invalid connection_id {connection_id}, expecting Azure Machine Learning resource ID") - raise ValueError(f"Invalid connection id {connection_id}") - - from azureml.core 
import Run, Workspace - run = Run.get_context() - if hasattr(run, "experiment"): - ws = run.experiment.workspace - else: - try: - ws = Workspace( - subscription_id=uri_match.group(1), - resource_group=uri_match.group(2), - workspace_name=uri_match.group(3) - ) - except Exception as e: - logger.warning(f"Could not get workspace '{uri_match.group(3)}': {e}") - raise ValueError(f"Could not get workspace '{uri_match.group(3)}'") from e - - return get_connection_by_name_v2(ws, uri_match.group(4)) - - -def send_put_request(url, headers, payload): - """Send a PUT request.""" - with create_session_with_retry() as session: - response = session.put(url, data=json.dumps(payload), headers=headers) - # Raise an exception if the response contains an HTTP error status code - response.raise_for_status() - - return response.json() - - -def create_connection_v2(workspace, name, category: str, target: str, auth_type: str, credentials: dict, metadata: str): - """Create a connection in a workspace.""" - url = f"https://management.azure.com/subscriptions/{workspace.subscription_id}/resourcegroups/{workspace.resource_group}/providers/Microsoft.MachineLearningServices/workspaces/{workspace.name}/connections/{name}?api-version=2023-04-01-preview" - - resp = send_put_request(url, { - "Authorization": f"Bearer {workspace._auth.get_token('https://management.azure.com/.default').token}", - "content-type": "application/json" - }, { - "properties": { - "category": category, - "target": target, - "authType": auth_type, - "credentials": credentials, - "metadata": metadata - } - }) - - return resp diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/deployment.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/deployment.py deleted file mode 100644 index 47823f648e3d..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/deployment.py +++ /dev/null @@ -1,40 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- -"""Azure OpenAI deployment related utils.""" -import openai -from azure.core.credentials import AzureKeyCredential -from azure.ai.generative.index._utils.connections import ( - connection_to_credential, - get_metadata_from_connection, - get_target_from_connection, -) -from openai.api_resources.deployment import Deployment -from openai.util import convert_to_dict - - -def infer_deployment(aoai_connection, model_name): - """Infer deployment name in an AOAI connection, given model name.""" - if model_name is None or model_name == "": - raise ValueError("Parameter 'model_name' has no value. 
Deployment inferring cannot be performed.") - connection_metadata = get_metadata_from_connection(aoai_connection) - openai.api_type = connection_metadata.get("ApiType", connection_metadata.get("apiType", "azure")) - openai.api_version = connection_metadata.get( - "ApiVersion", connection_metadata.get("apiVersion", "2023-03-15-preview") - ) - api_base = get_target_from_connection(aoai_connection) - if hasattr(openai, "api_base"): - openai.api_base = api_base - else: - openai.base_url = api_base - credential = connection_to_credential(aoai_connection) - openai.api_key = credential.key if isinstance(credential, AzureKeyCredential) else credential.get_token().token - deployment_list = convert_to_dict( - Deployment.list(api_key=openai.api_key, api_base=api_base, api_type=openai.api_type) - ) - for deployment in deployment_list["data"]: - if deployment["model"] == model_name: - return deployment["id"] - raise Exception( - f"Deployment for model={model_name} not found in AOAI workspace. Please retry with correct model name or create a deployment." - ) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/git.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/git.py index df24cbff88df..aa27b50b259c 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/git.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/git.py @@ -8,7 +8,7 @@ import git -from azure.ai.generative.index._utils.azureml import get_secret_from_workspace +from azure.ai.resources._index._utils.azureml import get_secret_from_workspace from azure.ai.generative.index._utils.logging import get_logger logger = get_logger("utils.git") diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/requests.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/requests.py deleted file mode 100644 index 8e71dcb11e6f..000000000000 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/requests.py +++ /dev/null @@ -1,58 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- -"""Request utilities.""" -import json - - -def create_session_with_retry(retry=3): - """ - Create requests.session with retry. - - :type retry: int - rtype: Response - """ - import requests - from requests.adapters import HTTPAdapter - - retry_policy = _get_retry_policy(num_retry=retry) - - session = requests.Session() - session.mount("https://", HTTPAdapter(max_retries=retry_policy)) - session.mount("http://", HTTPAdapter(max_retries=retry_policy)) - return session - - -def _get_retry_policy(num_retry=3): - """ - Request retry policy with increasing backoff. - - :return: Returns the msrest or requests REST client retry policy. - :rtype: urllib3.Retry - """ - from urllib3 import Retry - - status_forcelist = [413, 429, 500, 502, 503, 504] - backoff_factor = 0.4 - retry_policy = Retry( - total=num_retry, - read=num_retry, - connect=num_retry, - backoff_factor=backoff_factor, - status_forcelist=status_forcelist, - # By default this is True. We set it to false to get the full error trace, including url and - # status code of the last retry. Otherwise, the error message is 'too many 500 error responses', - # which is not useful. 
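The deleted `requests.py` built sessions around an `urllib3.Retry` with `backoff_factor=0.4`. A rough sketch of the sleep schedule that factor implies (urllib3 sleeps `backoff_factor * 2**(n - 1)` before the n-th retry; exact behavior varies slightly across urllib3 versions):

```python
# Approximate backoff schedule for backoff_factor=0.4.
backoff_factor = 0.4
for n in range(1, 4):
    print(f"retry {n}: sleep ~{backoff_factor * 2 ** (n - 1):.1f}s")
# retry 1: sleep ~0.4s
# retry 2: sleep ~0.8s
# retry 3: sleep ~1.6s
```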
- raise_on_status=False - ) - return retry_policy - - -def send_post_request(url, headers, payload): - """Send a POST request.""" - with create_session_with_retry() as session: - response = session.post(url, data=json.dumps(payload), headers=headers) - # Raise an exception if the response contains an HTTP error status code - response.raise_for_status() - - return response diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_docstore.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_docstore.py index b3c532c44507..d66c9ed36b4a 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_docstore.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_docstore.py @@ -83,9 +83,9 @@ def load(cls, input_path: str) -> "FileBasedDocstore": fs, uri = url_to_fs(input_path) - documents = {} + documents: Optional[Dict[str, Document]] = {} with fs.open(f"{input_path.rstrip('/')}/docs.jsonl") as f: for line in f: document = StaticDocument.loads(line.strip()) - documents[document.document_id] = document + documents[document.document_id] = document # type: ignore[index] return cls(documents) diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_documents/document.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_documents/document.py index 5e9db92dd4a3..77289df1b785 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_documents/document.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_documents/document.py @@ -127,7 +127,7 @@ def dumps(self) -> str: return json.dumps({"content": self.data, "metadata": self._metadata, "document_id": self.document_id}) @classmethod - def loads(cls, data: str) -> "Document": + def loads(cls, data: str) -> "StaticDocument": """Load the document from a json string.""" data_dict = json.loads(data) metadata = data_dict["metadata"] diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/__init__.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/__init__.py index f59f8a1df16a..af0bd4fe84f6 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/__init__.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/__init__.py @@ -8,6 +8,7 @@ from collections import OrderedDict from typing import Callable, List, Optional, Union +import cloudpickle from azure.core.credentials import TokenCredential from azure.ai.resources._index._embeddings.openai import OpenAIEmbedder from azure.ai.resources._index._langchain.vendor.embeddings.base import Embeddings as Embedder @@ -218,9 +219,33 @@ def embeddings_container_local_path(self, value): """Set the path to the embeddings container.""" self._embeddings_container_path = value + def as_langchain_embeddings(self, credential: Optional[TokenCredential] = None) -> Embedder: + """Returns a langchain Embedder that can be used to embed text.""" + return get_langchain_embeddings(self.kind, self.arguments, credential=credential) + @staticmethod def from_uri(uri: str, credential: Optional[TokenCredential] = None, **kwargs) -> "EmbeddingsContainer": """Create an embeddings object from a URI.""" config = parse_model_uri(uri, **kwargs) kwargs["credential"] = credential - return EmbeddingsContainer(**{**config, **kwargs}) \ No newline at end of file + return EmbeddingsContainer(**{**config, **kwargs}) + + @staticmethod + def from_metadata(metadata: dict) -> "EmbeddingsContainer": + """Create an embeddings object from metadata.""" + schema_version = metadata.get("schema_version", "1") + if schema_version == 
"1": + embeddings = EmbeddingsContainer(metadata["kind"], **metadata["arguments"]) + return embeddings + elif schema_version == "2": + kind = metadata["kind"] + del metadata["kind"] + if kind == "custom": + metadata["embedding_fn"] = cloudpickle.loads( + gzip.decompress(metadata["pickled_embedding_fn"])) + del metadata["pickled_embedding_fn"] + + embeddings = EmbeddingsContainer(kind, **metadata) + return embeddings + else: + raise ValueError(f"Schema version {schema_version} is not supported") diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/openai.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/openai.py index d18acfc9cc4e..8f43bba263ef 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/openai.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/openai.py @@ -44,6 +44,7 @@ def __init__( elif batch_size is None: batch_size = 1000 self.batch_size = int(batch_size) + self._dynamic_batch_size: Optional[int] = None if max_retries is None: max_retries = 10 @@ -143,7 +144,16 @@ def _retryable_openai_errors(self) -> List[Exception]: def _dynamic_batch_size_embed_request(self, tokenized_texts: List[List[int]], **kwargs) -> dict: try: - return self._embed_request(tokenized_texts=tokenized_texts, **kwargs) + if self._dynamic_batch_size is None: + return self._embed_request(tokenized_texts=tokenized_texts, **kwargs) + else: + embedding_response: Dict[str, List] = {"data": []} + for i in range(0, len(tokenized_texts), self._dynamic_batch_size): + embedding_response["data"].extend( + self._embed_request( + tokenized_texts=tokenized_texts[i : i + self._dynamic_batch_size], **kwargs + )["data"] + ) except Exception as e: err_msg = str(e) if "Too many inputs" not in err_msg: @@ -153,14 +163,20 @@ def _dynamic_batch_size_embed_request(self, tokenized_texts: List[List[int]], ** match = re.match(r".*The max number of inputs is ([0-9]+).*", err_msg) if match and match.group(1): try: - self.batch_size = int(match.group(1)) + self._dynamic_batch_size = int(match.group(1)) except Exception: - logger.error("Failed to parse max number of inputs from error message, falling back to batch_size=1.") - self.batch_size = 1 - logger.warning(f"Reducing batch_size to {self.batch_size} and retrying.") - embedding_response: Dict[str, List] = {"data": []} - for i in range(0, len(tokenized_texts), self.batch_size): - embedding_response["data"].extend(self._embed_request(tokenized_texts=tokenized_texts[i : i + self.batch_size], **kwargs)["data"]) + logger.error( + "Failed to parse max number of inputs from error message, falling back to batch_size=1." 
+ ) + self._dynamic_batch_size = 1 + logger.warning(f"Reducing batch_size to {self._dynamic_batch_size} and retrying.") + embedding_response: Dict[str, List] = {"data": []} # type: ignore[no-redef] + for i in range(0, len(tokenized_texts), self._dynamic_batch_size): + embedding_response["data"].extend( + self._embed_request( + tokenized_texts=tokenized_texts[i : i + self._dynamic_batch_size], **kwargs + )["data"] + ) else: raise @@ -168,8 +184,6 @@ def _dynamic_batch_size_embed_request(self, tokenized_texts: List[List[int]], ** def _embed_request(self, tokenized_texts: List[List[int]], **kwargs) -> dict: try: - min_seconds = 4 - max_seconds = 10 total_delay = 0 last_exception = None for retry in range(self.max_retries): @@ -190,7 +204,6 @@ def _embed_request(self, tokenized_texts: List[List[int]], **kwargs) -> dict: for retryable_error in self._retryable_openai_errors: if isinstance(e, type(retryable_error)): retrying = True - import openai # Retry with retry-after if found in RateLimitError if isinstance(e, self._RateLimitError): @@ -203,10 +216,10 @@ def _embed_request(self, tokenized_texts: List[List[int]], **kwargs) -> dict: # Wait for 1 minute as suggested by openai https://help.openai.com/en/articles/6897202-ratelimiterror logger.warning("Retry after 60 seconds.") delay = 60 - total_delay += delay - logger.warning(f"Sleeping for {delay} seconds before retrying.") - time.sleep(delay) - break + total_delay += delay + logger.warning(f"Sleeping for {delay} seconds before retrying.") + time.sleep(delay) + break if not retrying: break @@ -241,8 +254,7 @@ def _embed(self, texts: List[str]) -> List[List[float]]: tokens = encoding.encode( text, - # TODO: Do these need to be configurable? Our use cases treat all text as raw data. - allowed_special="all", + # TODO: Does this need to be configurable? Our use cases treat all text as raw data. 
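The `_dynamic_batch_size_embed_request` change above keeps the user-configured `batch_size` intact and records the service-reported limit in `_dynamic_batch_size`, then re-chunks requests against it. A toy version of that re-chunking loop (function name and values are illustrative, not from the diff):

```python
from typing import List

def split_into_batches(items: List[int], batch_size: int) -> List[List[int]]:
    """Toy version of the re-chunking in _dynamic_batch_size_embed_request."""
    return [items[i : i + batch_size] for i in range(0, len(items), batch_size)]

# e.g. after the service reports "The max number of inputs is 16"
dynamic_batch_size = 16
print(len(split_into_batches(list(range(100)), dynamic_batch_size)))  # 7 batches
```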
disallowed_special=(), ) # Text longer than a models context length can be split and the embeddings averaged to approximate full text diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_indexes/faiss.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_indexes/faiss.py index cdcb1db6e9e2..049d81fa2ad3 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_indexes/faiss.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_indexes/faiss.py @@ -149,15 +149,15 @@ def similarity_search(self, query: str, k: int = 8, **kwargs) -> List[Document]: def save(self, output_path: Union[str, Path]): """Write index and docstore to output_path.""" - output_path = Path(output_path) - output_path.mkdir(exist_ok=True, parents=True) + output_path_obj = Path(output_path) + output_path_obj.mkdir(exist_ok=True, parents=True) faiss = import_faiss_or_so_help_me() - faiss.write_index(self.index, str(output_path / "index.faiss")) + faiss.write_index(self.index, str(output_path_obj / "index.faiss")) - self.docstore.save(str(output_path / "docstore")) + self.docstore.save(str(output_path_obj / "docstore")) - with (output_path / "index_to_doc_id.json").open("w") as f: + with (output_path_obj / "index_to_doc_id.json").open("w") as f: json.dump(self.index_to_doc_id, f) def save_local(self, output_path: str): diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_langchain/vendor/utils/__init__.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_langchain/vendor/utils/__init__.py deleted file mode 100644 index 624f5ee88ecf..000000000000 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_langchain/vendor/utils/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- - - -__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_langchain/vendor/utils/math.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_langchain/vendor/utils/math.py deleted file mode 100644 index 41e1b6a0bd00..000000000000 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_langchain/vendor/utils/math.py +++ /dev/null @@ -1,61 +0,0 @@ -# This file has been copied as is. -# Last Sync: 2023-08-24 -# Commit: 3e5cda3405ec1aa369fe90253d88f3e26a03db10 -"""Math utils.""" -from typing import List, Optional, Tuple, Union - -import numpy as np - -Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray] - - -def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: - """Row-wise cosine similarity between two equal-width matrices.""" - if len(X) == 0 or len(Y) == 0: - return np.array([]) - X = np.array(X) - Y = np.array(Y) - if X.shape[1] != Y.shape[1]: - raise ValueError( - f"Number of columns in X and Y must be the same. X has shape {X.shape} " - f"and Y has shape {Y.shape}." - ) - - X_norm = np.linalg.norm(X, axis=1) - Y_norm = np.linalg.norm(Y, axis=1) - # Ignore divide by zero errors run time warnings as those are handled below. 
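The vendored `math.py` being deleted here computes row-wise cosine similarity as `dot(X, Y.T)` normalized by the outer product of the row norms. A minimal numpy check of that definition (inputs are made up):

```python
import numpy as np

# Row-wise cosine similarity, as the vendored helper computed it.
X = np.array([[1.0, 0.0], [0.0, 1.0]])
Y = np.array([[1.0, 1.0]])
sim = np.dot(X, Y.T) / np.outer(
    np.linalg.norm(X, axis=1), np.linalg.norm(Y, axis=1)
)
print(sim)  # both rows are 1/sqrt(2) ~= 0.7071
```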
- with np.errstate(divide="ignore", invalid="ignore"): - similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm) - similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0 - return similarity - - -def cosine_similarity_top_k( - X: Matrix, - Y: Matrix, - top_k: Optional[int] = 5, - score_threshold: Optional[float] = None, -) -> Tuple[List[Tuple[int, int]], List[float]]: - """Row-wise cosine similarity with optional top-k and score threshold filtering. - - Args: - X: Matrix. - Y: Matrix, same width as X. - top_k: Max number of results to return. - score_threshold: Minimum cosine similarity of results. - - Returns: - Tuple of two lists. First contains two-tuples of indices (X_idx, Y_idx), - second contains corresponding cosine similarities. - """ - if len(X) == 0 or len(Y) == 0: - return [], [] - score_array = cosine_similarity(X, Y) - score_threshold = score_threshold or -1.0 - score_array[score_array < score_threshold] = 0 - top_k = min(top_k or len(score_array), np.count_nonzero(score_array)) - top_k_idxs = np.argpartition(score_array, -top_k, axis=None)[-top_k:] - top_k_idxs = top_k_idxs[np.argsort(score_array.ravel()[top_k_idxs])][::-1] - ret_idxs = np.unravel_index(top_k_idxs, score_array.shape) - scores = score_array.ravel()[top_k_idxs].tolist() - return list(zip(*ret_idxs)), scores # type: ignore diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_mlindex.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_mlindex.py index ff24452b8e57..4887fa70d87a 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_mlindex.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_mlindex.py @@ -11,7 +11,7 @@ from azure.ai.ml.entities import Data from azure.core.credentials import TokenCredential from azure.ai.resources._index._documents import Document -from azure.ai.resources._index._embeddings.EmbeddingsContainer import from_metadata +from azure.ai.resources._index._embeddings import EmbeddingsContainer from azure.ai.resources._index._utils.connections import ( get_connection_credential, get_connection_by_id_v2, @@ -142,7 +142,7 @@ def description(self, value: str): def get_langchain_embeddings(self, credential: Optional[TokenCredential] = None): """Get the LangChainEmbeddings from the MLIndex.""" - embeddings = from_metadata(self.embeddings_config.copy()) + embeddings = EmbeddingsContainer.from_metadata(self.embeddings_config.copy()) return embeddings.as_langchain_embeddings(credential=credential) @@ -233,7 +233,7 @@ def as_langchain_vectorstore(self, credential: Optional[TokenCredential] = None) if engine == "langchain.vectorstores.FAISS": from azure.ai.resources._index._langchain.vendor.vectorstores.faiss import FAISS - embeddings = from_metadata(self.embeddings_config.copy()).as_langchain_embeddings(credential=credential) + embeddings = EmbeddingsContainer.from_metadata(self.embeddings_config.copy()).as_langchain_embeddings(credential=credential) fs, uri = url_to_fs(self.base_uri) @@ -253,7 +253,7 @@ def as_langchain_vectorstore(self, credential: Optional[TokenCredential] = None) logger.warning(error_fmt_str.format(e=e)) azureml_faiss_as_langchain_faiss = None # type: ignore[assignment] - embeddings = from_metadata(self.embeddings_config.copy()).as_langchain_embeddings(credential=credential) + embeddings = EmbeddingsContainer.from_metadata(self.embeddings_config.copy()).as_langchain_embeddings(credential=credential) store: FaissAndDocStore = FaissAndDocStore.load(self.base_uri, embeddings.embed_query) # type: ignore[no-redef] if 
azureml_faiss_as_langchain_faiss is not None: diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_models.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_models.py index 5d0c912b4545..472648664231 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_models.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_models.py @@ -8,12 +8,14 @@ from typing import Optional from azure.core.credentials import TokenCredential +from azure.ai.resources.constants._common import USER_AGENT_HEADER_KEY from azure.ai.resources._index._utils.connections import ( connection_to_credential, get_connection_by_id_v2, get_connection_credential, ) from azure.ai.resources._index._utils.logging import get_logger +from azure.ai.resources._user_agent import USER_AGENT logger = get_logger(__name__) @@ -130,7 +132,75 @@ def init_open_ai_from_config(config: dict, credential: Optional[TokenCredential] else: raise e - if "azure" in openai.api_type: + if openai.api_type and "azure" in openai.api_type: config["api_version"] = config.get("api_version", "2023-03-15-preview") - return config \ No newline at end of file + return config + +# TODO: Vendor langchain deps or move to langchain module. +def init_llm(model_config: dict, **kwargs): + """Initialize a language model from a model configuration.""" + from langchain.chat_models.azure_openai import AzureChatOpenAI + from langchain.chat_models.openai import ChatOpenAI + from langchain.llms import AzureOpenAI + + llm = None + logger.debug(f"model_config: {json.dumps(model_config, indent=2)}") + model_kwargs = { + "frequency_penalty": model_config.get("frequency_penalty", 0), + "presence_penalty": model_config.get("presence_penalty", 0), + } + if model_config.get("stop") is not None: + model_kwargs["stop"] = model_config.get("stop") + if model_config.get("kind") == "open_ai" and model_config.get("api_type") == "azure": + model_config = init_open_ai_from_config(model_config, credential=None) + if model_config["model"].startswith("gpt-3.5-turbo") or model_config["model"].startswith("gpt-35-turbo") or model_config["model"].startswith("gpt-4"): + logger.info(f"Initializing AzureChatOpenAI with model {model_config['model']} with kwargs: {model_kwargs}") + + llm = AzureChatOpenAI( + deployment_name=model_config["deployment"], + model=model_config["model"], + max_tokens=model_config.get("max_tokens"), + model_kwargs=model_kwargs, + openai_api_key=model_config.get("api_key"), + openai_api_base=model_config.get("api_base"), + openai_api_type=model_config.get("api_type"), + openai_api_version=model_config.get("api_version"), + max_retries=model_config.get("max_retries", 3), + default_headers={USER_AGENT_HEADER_KEY: USER_AGENT}, + **kwargs + ) # type: ignore + if model_config.get("temperature", None) is not None: + llm.temperature = model_config.get("temperature") + else: + logger.info(f"Initializing AzureOpenAI with model {model_config['model']} with kwargs: {model_kwargs}") + + llm = AzureOpenAI( + deployment_name=model_config["deployment"], + model=model_config["model"], + max_tokens=model_config.get("max_tokens"), + model_kwargs=model_kwargs, + openai_api_key=model_config.get("api_key"), + max_retries=model_config.get("max_retries", 3), + default_headers={USER_AGENT_HEADER_KEY: USER_AGENT}, + **kwargs + ) # type: ignore + if model_config.get("temperature", None) is not None: + llm.temperature = model_config.get("temperature") + elif model_config.get("kind") == "open_ai" and model_config.get("api_type") == "open_ai": + logger.info(f"Initializing 
OpenAI with model {model_config['model']} with kwargs: {model_kwargs}") + model_config = init_open_ai_from_config(model_config, credential=None) + llm = ChatOpenAI( + model=model_config["model"], + max_tokens=model_config.get("max_tokens"), + model_kwargs=model_kwargs, + openai_api_key=model_config.get("api_key"), + default_headers={USER_AGENT_HEADER_KEY: USER_AGENT}, + **kwargs + ) # type: ignore + if model_config.get("temperature", None) is not None: + llm.temperature = model_config.get("temperature") + else: + raise ValueError(f"Unsupported llm kind: {model_config.get('kind')}") + + return llm \ No newline at end of file diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/connections.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/connections.py index 0224afff459f..93886f0a22dc 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/connections.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/connections.py @@ -27,6 +27,16 @@ logger = get_logger("connections") +def get_pinecone_environment(config, credential: Optional[TokenCredential] = None): + """Get the Pinecone project environment from a connection.""" + connection_type = config.get("connection_type", None) + if connection_type != "workspace_connection": + raise ValueError(f"Unsupported connection type for Pinecone index: {connection_type}") + + connection_id = config.get("connection", {}).get("id") + connection = get_connection_by_id_v2(connection_id, credential=credential) + return get_metadata_from_connection(connection)["environment"] + def get_connection_credential(config, credential: Optional[TokenCredential] = None): """Get a credential for a connection.""" diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/logging.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/logging.py index a47ff50e0013..da74a9abd2bf 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/logging.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/logging.py @@ -229,7 +229,7 @@ def _try_get_run_info(): info["location"] = location try: from azureml.core import Run - run: Run = Run.get_context() + run: Run = Run.get_context() # type: ignore[annotation-unchecked] if hasattr(run, "experiment"): info["parent_run_id"] = run.properties.get("azureml.pipelinerunid", "Unknown") info["mlIndexAssetKind"] = run.properties.get("azureml.mlIndexAssetKind", "Unknown") diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_utils/_ai_client_utils.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_utils/_ai_client_utils.py index d8330340d105..60288bf64701 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_utils/_ai_client_utils.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_utils/_ai_client_utils.py @@ -10,7 +10,6 @@ from azure.ai.ml._file_utils.file_utils import traverse_up_path_and_find_file from azure.ai.ml.exceptions import ErrorCategory, ErrorTarget, ValidationException -from azure.ai.ml.constants._common import LOCAL_PATH def find_config_file_path( path: Optional[Union[os.PathLike, str]] = None, diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/client/_ai_client.py b/sdk/ai/azure-ai-resources/azure/ai/resources/client/_ai_client.py index 08204339d176..36d224eeffbe 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/client/_ai_client.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/client/_ai_client.py @@ -361,7 +361,7 @@ def build_index_on_cloud( source=IndexSource( input_data=Data( 
type="uri_folder", - path="", + path=".", ), input_glob=input_glob, chunk_size=chunk_size, diff --git a/sdk/ai/azure-ai-resources/cspell.json b/sdk/ai/azure-ai-resources/cspell.json index e37ee76a4934..586a18e8c44d 100644 --- a/sdk/ai/azure-ai-resources/cspell.json +++ b/sdk/ai/azure-ai-resources/cspell.json @@ -1,3 +1,3 @@ { - "ignoreWords": ["redef"] + "ignoreWords": ["redef", "llms"] }
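With `init_llm` re-homed into `azure/ai/resources/_index/_models.py` above, callers would import it from the new path. A hedged usage sketch; the import path comes from this diff, but every config value below is a placeholder (a real config would normally be derived from a workspace connection):

```python
from azure.ai.resources._index._models import init_llm

# Placeholder Azure OpenAI config; keys match what init_llm reads.
model_config = {
    "kind": "open_ai",
    "api_type": "azure",
    "api_base": "https://<your-endpoint>.openai.azure.com",
    "api_key": "<api-key>",
    "api_version": "2023-03-15-preview",
    "model": "gpt-35-turbo",
    "deployment": "gpt-35-turbo",
    "temperature": 0.1,
    "max_tokens": 512,
}
llm = init_llm(model_config)  # returns an AzureChatOpenAI for gpt-35-turbo
```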