Skip to content

Revert "cleanup duplicated code gen ai sdk (#34048)" #34224

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ def build_index(
from azure.ai.generative.index._documents import DocumentChunksIterator, split_documents
from azure.ai.generative.index._embeddings import EmbeddingsContainer
from azure.ai.generative.index._tasks.update_acs import create_index_from_raw_embeddings
from azure.ai.generative.index._utils.connections import get_connection_by_id_v2
from azure.ai.generative.index._utils.logging import disable_mlflow
from azure.ai.resources._index._utils.connections import get_connection_by_id_v2
except ImportError as e:
print("In order to use build_index to build an Index locally, you must have azure-ai-generative[index] installed")
raise e
Expand Down Expand Up @@ -176,7 +176,7 @@ def _create_mlindex_from_existing_acs(
) -> Index:
try:
from azure.ai.generative.index._embeddings import EmbeddingsContainer
from azure.ai.resources._index._utils.connections import get_connection_by_id_v2
from azure.ai.generative.index._utils.connections import get_connection_by_id_v2
except ImportError as e:
print("In order to use build_index to build an Index locally, you must have azure-ai-generative[index] installed")
raise e
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
"""DataIndex configuration and operations."""

# Allow sibling distributions to contribute modules under this package name.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)

from azure.ai.generative.index._dataindex.entities import (
    CitationRegex,
    Data,
    DataIndex,
    Embedding,
    IndexSource,
    IndexStore,
    index_data,
)
from azure.ai.generative.index._dataindex.operations import DataOperations

# Public API of the _dataindex package.
__all__ = [
    "DataOperations",
    "DataIndex",
    "IndexSource",
    "Data",
    "CitationRegex",
    "Embedding",
    "IndexStore",
    "index_data",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------


# Declare this directory as part of a pkgutil-style namespace package so
# other distributions can contribute modules under the same package name.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
"""Marshmallow schemas for DataIndex configuration."""

# Allow sibling distributions to contribute modules under this package name.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)

from .data_index import CitationRegexSchema
from .data_index import DataIndexSchema
from .data_index import DataIndexTypes
from .data_index import EmbeddingSchema
from .data_index import IndexSourceSchema
from .data_index import IndexStoreSchema

# Public API of the schema subpackage.
__all__ = [
    "DataIndexSchema",
    "IndexSourceSchema",
    "CitationRegexSchema",
    "EmbeddingSchema",
    "IndexStoreSchema",
    "DataIndexTypes",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

# pylint: disable=unused-argument

from marshmallow import fields, post_load

from azure.ai.ml._schema.assets.data import DataSchema
from azure.ai.ml._schema.core.fields import ArmVersionedStr, LocalPathField, NestedField, StringTransformedEnum, UnionField
from azure.ai.ml._schema.core.schema import PatchedSchemaMeta
from azure.ai.ml._schema.job.input_output_entry import generate_datastore_property
from azure.ai.ml._utils._experimental import experimental
from azure.ai.ml.constants._common import AssetTypes, AzureMLResourceType, InputOutputModes


# FROM: azure.ai.ml._schema.job.input_output_entry
def generate_path_property(azureml_type, **kwargs):
    """Build a strict UnionField accepting the path forms a data asset may use.

    Accepted forms: an ARM versioned asset id, an http(s) URL, a wasb(s)
    URL, an explicit ``file:`` path, or a plain local path that does not
    start with one of those scheme prefixes.
    """
    # NOTE(review): variants are presumably tried in order by UnionField —
    # the catch-all local path comes last.
    path_variants = [
        ArmVersionedStr(azureml_type=azureml_type),
        fields.Str(metadata={"pattern": r"^(http(s)?):.*"}),
        fields.Str(metadata={"pattern": r"^(wasb(s)?):.*"}),
        LocalPathField(pattern=r"^file:.*"),
        LocalPathField(pattern=r"^(?!(azureml|http(s)?|wasb(s)?|file):).*"),
    ]
    return UnionField(path_variants, is_strict=True, **kwargs)


class DataIndexTypes:
    """Enumeration of the types of indexes which can be written to by DataIndex."""

    ACS = "acs"
    """Azure Cognitive Search index type."""
    FAISS = "faiss"
    """Faiss index type."""


class CitationRegexSchema(metaclass=PatchedSchemaMeta):
    """Schema for a regex match/replacement pair used to rewrite citation URLs.

    FIX: the example strings in the two descriptions were swapped — the
    match example showed a replacement string and vice versa.
    """

    # Regex applied to citation_url + input file path; capture groups feed replacement_pattern.
    match_pattern = fields.Str(
        required=True,
        metadata={"description": r"Regex to match citation in the citation_url + input file path. e.g. '(.*)/articles/(.*)(\.[^.]+)$'"},
    )
    # Replacement string referencing the groups captured by match_pattern.
    replacement_pattern = fields.Str(
        required=True,
        metadata={"description": "Replacement string for citation. e.g. '\\1/\\2'"},
    )

    @post_load
    def make(self, data, **kwargs):
        """Materialize the validated dict into a CitationRegex entity."""
        from azure.ai.generative.index._dataindex.entities.data_index import CitationRegex

        return CitationRegex(**data)


class InputDataSchema(metaclass=PatchedSchemaMeta):
    """Schema for a uri_file/uri_folder data input used as a DataIndex source."""

    # Mount/download mode; optional, service default applies when omitted.
    mode = StringTransformedEnum(
        allowed_values=[
            InputOutputModes.RO_MOUNT,
            InputOutputModes.RW_MOUNT,
            InputOutputModes.DOWNLOAD,
        ],
        required=False,
    )
    # Restricted to single-file or folder asset types.
    type = StringTransformedEnum(
        allowed_values=[
            AssetTypes.URI_FILE,
            AssetTypes.URI_FOLDER,
        ]
    )
    # Accepts ARM id, http(s)/wasb(s) URL, file: URI, or plain local path.
    path = generate_path_property(azureml_type=AzureMLResourceType.DATA)
    datastore = generate_datastore_property()

    @post_load
    def make(self, data, **kwargs):
        """Build an azure.ai.ml Data entity from the validated fields."""
        from azure.ai.ml.entities import Data

        return Data(**data)


class InputMLTableSchema(metaclass=PatchedSchemaMeta):
    """Schema for an MLTable data input used as a DataIndex source."""

    # MLTable inputs use eval-style modes only.
    mode = StringTransformedEnum(
        allowed_values=[
            InputOutputModes.EVAL_MOUNT,
            InputOutputModes.EVAL_DOWNLOAD,
        ],
        required=False,
    )
    type = StringTransformedEnum(allowed_values=[AssetTypes.MLTABLE])
    # Accepts ARM id, http(s)/wasb(s) URL, file: URI, or plain local path.
    path = generate_path_property(azureml_type=AzureMLResourceType.DATA)
    datastore = generate_datastore_property()

    @post_load
    def make(self, data, **kwargs):
        """Build an azure.ai.ml Data entity from the validated fields."""
        from azure.ai.ml.entities import Data

        return Data(**data)


class IndexSourceSchema(metaclass=PatchedSchemaMeta):
    """Schema describing where and how source documents are read for indexing."""

    # Either a plain uri_file/uri_folder input or an MLTable input.
    input_data = UnionField(
        [NestedField(InputDataSchema), NestedField(InputMLTableSchema)],
        required=True,
        allow_none=False,
        metadata={"description": "Input Data to index files from. MLTable type inputs will use `mode: eval_mount`."},
    )
    input_glob = fields.Str(
        required=False,
        metadata={
            "description": "Glob pattern to filter files from input_data. If not specified, all files will be indexed."
        },
    )
    # Chunking controls, both expressed in token counts.
    chunk_size = fields.Int(
        required=False,
        allow_none=False,
        metadata={"description": "Maximum number of tokens to put in each chunk."},
    )
    chunk_overlap = fields.Int(
        required=False,
        allow_none=False,
        metadata={"description": "Number of tokens to overlap between chunks."},
    )
    citation_url = fields.Str(
        required=False,
        metadata={"description": "Base URL to join with file paths to create full source file URL for chunk metadata."},
    )
    # Optional regex rewrite applied to the generated citation URLs.
    citation_url_replacement_regex = NestedField(
        CitationRegexSchema,
        required=False,
        metadata={
            "description": "Regex match and replacement patterns for citation url. Useful if the paths in `input_data` "
            "don't match the desired citation format."
        },
    )

    @post_load
    def make(self, data, **kwargs):
        """Build an IndexSource entity from the validated fields."""
        from azure.ai.generative.index._dataindex.entities.data_index import IndexSource

        return IndexSource(**data)


class EmbeddingSchema(metaclass=PatchedSchemaMeta):
    """Schema for the embedding model configuration of a DataIndex.

    FIX: removed the duplicated word in the cache_path description
    ("used for for this component").
    """

    model = fields.Str(
        required=True,
        allow_none=False,
        metadata={
            "description": "The model to use to embed data. E.g. 'hugging_face://model/sentence-transformers/"
            "all-mpnet-base-v2' or 'azure_open_ai://deployment/{{deployment_name}}/model/{{model_name}}'"
        },
    )
    connection = fields.Str(
        required=False,
        metadata={
            "description": "Connection reference to use for embedding model information, "
            "only needed for hosted embeddings models (such as Azure OpenAI)."
        },
    )
    # Datastore path pointing at previously generated embeddings, enabling incremental reuse.
    cache_path = generate_path_property(
        azureml_type=AzureMLResourceType.DATASTORE,
        required=False,
        metadata={
            "description": "Folder containing previously generated embeddings. "
            "Should be parent folder of the 'embeddings' output path used for this component. "
            "Will compare input data to existing embeddings and only embed changed/new data, "
            "reusing existing chunks."
        },
    )

    @post_load
    def make(self, data, **kwargs):
        """Build an Embedding entity from the validated fields."""
        from azure.ai.generative.index._dataindex.entities.data_index import Embedding

        return Embedding(**data)


class IndexStoreSchema(metaclass=PatchedSchemaMeta):
    """Schema describing the target index store a DataIndex writes to.

    FIX: the implicit string concatenation in the config description was
    missing a space, producing "settings.Such as".
    """

    type = StringTransformedEnum(
        allowed_values=[
            DataIndexTypes.ACS,
            DataIndexTypes.FAISS,
        ],
        metadata={"description": "The type of index to write to. Currently supported types are 'acs' and 'faiss'."},
    )
    name = fields.Str(
        required=False,
        metadata={"description": "Name of the index to write to. If not specified, a name will be generated."},
    )
    connection = fields.Str(
        required=False,
        metadata={
            "description": "Connection reference to use for index information, "
            "only needed for hosted indexes (such as Azure Cognitive Search)."
        },
    )
    # Free-form index configuration (e.g. ACS `field_mapping` overrides).
    config = fields.Dict(
        required=False,
        metadata={
            "description": "Configuration for the index. Primary use is to configure Azure Cognitive Search "
            "specific settings. Such as custom `field_mapping` for known field types."
        },
    )

    @post_load
    def make(self, data, **kwargs):
        """Build an IndexStore entity from the validated fields."""
        from azure.ai.generative.index._dataindex.entities.data_index import IndexStore

        return IndexStore(**data)


@experimental
class DataIndexSchema(DataSchema):
    """Schema for a DataIndex asset: extends DataSchema with source, embedding and index sections."""

    source = NestedField(IndexSourceSchema, required=True, allow_none=False)  # where documents come from
    embedding = NestedField(EmbeddingSchema, required=True, allow_none=False)  # how chunks are embedded
    index = NestedField(IndexStoreSchema, required=True, allow_none=False)  # where the index is written
    incremental_update = fields.Bool()

    @post_load
    def make(self, data, **kwargs):
        """Build a DataIndex entity from the validated fields."""
        from azure.ai.generative.index._dataindex.entities.data_index import DataIndex

        return DataIndex(**data)
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------


# Declare this directory as part of a pkgutil-style namespace package so
# other distributions can contribute modules under the same package name.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

class DataIndexComponentUri(object):
DATA_INDEX_COG_SEARCH = "azureml://registries/azureml/components/llm_ingest_dataset_to_acs_basic/labels/default"
DATA_INDEX_FAISS = "azureml://registries/azureml/components/llm_ingest_dataset_to_faiss_basic/labels/default"

@staticmethod
def with_registry(component_uri: str, registry_name: str) -> str:
return component_uri.replace("azureml://registries/azureml", f"azureml://registries/{registry_name}")


class LLMRAGComponentUri:
    """Default registry URIs for the individual LLM RAG pipeline components.

    Modernized: dropped the redundant Python-2-style ``(object)`` base.
    """

    LLM_RAG_CRACK_AND_CHUNK = "azureml://registries/azureml/components/llm_rag_crack_and_chunk/labels/default"
    LLM_RAG_GENERATE_EMBEDDINGS = "azureml://registries/azureml/components/llm_rag_generate_embeddings/labels/default"
    LLM_RAG_CRACK_AND_CHUNK_AND_EMBED = (
        "azureml://registries/azureml/components/llm_rag_crack_and_chunk_and_embed/labels/default"
    )
    LLM_RAG_UPDATE_ACS_INDEX = "azureml://registries/azureml/components/llm_rag_update_acs_index/labels/default"
    LLM_RAG_CREATE_FAISS_INDEX = "azureml://registries/azureml/components/llm_rag_create_faiss_index/labels/default"
    LLM_RAG_REGISTER_MLINDEX_ASSET = (
        "azureml://registries/azureml/components/llm_rag_register_mlindex_asset/labels/default"
    )
    LLM_RAG_VALIDATE_DEPLOYMENTS = "azureml://registries/azureml/components/llm_rag_validate_deployments/labels/default"
    LLM_RAG_CREATE_PROMPTFLOW = "azureml://registries/azureml/components/llm_rag_create_promptflow/labels/default"
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
"""DataIndex configuration and operations."""

from azure.ai.generative.index._dataindex.data_index.models import build_model_protocol
from azure.ai.generative.index._dataindex.entities.data_index import (
    CitationRegex,
    Data,
    DataIndex,
    Embedding,
    IndexSource,
    IndexStore,
)
from azure.ai.generative.index._dataindex.entities._builders.data_index_func import index_data

# Public API of the entities subpackage.
__all__ = [
    "DataIndex",
    "IndexSource",
    "Data",
    "CitationRegex",
    "Embedding",
    "IndexStore",
    "index_data",
    "build_model_protocol",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
"""DataIndex embedding model helpers."""
import re
from typing import Optional

# Long-form protocol: azure_open_ai://deployment/<deployment>/model/<model>
OPEN_AI_PROTOCOL_TEMPLATE = "azure_open_ai://deployment/{}/model/{}"
OPEN_AI_PROTOCOL_REGEX_PATTERN = OPEN_AI_PROTOCOL_TEMPLATE.format(".*", ".*")
# Short form omits the model segment: azure_open_ai://deployment/<deployment>
OPEN_AI_SHORT_FORM_PROTOCOL_TEMPLATE = "azure_open_ai://deployments?/{}"
# BUG FIX: this assignment previously reused the name OPEN_AI_PROTOCOL_REGEX_PATTERN,
# clobbering the long-form regex above; the short-form pattern now has its own name.
OPEN_AI_SHORT_FORM_PROTOCOL_REGEX_PATTERN = OPEN_AI_SHORT_FORM_PROTOCOL_TEMPLATE.format(".*")

HUGGINGFACE_PROTOCOL_TEMPLATE = "hugging_face://model/{}"
HUGGINGFACE_PROTOCOL_REGEX_PATTERN = HUGGINGFACE_PROTOCOL_TEMPLATE.format(".*")


def build_model_protocol(model: Optional[str] = None) -> Optional[str]:
    """Normalize a user-supplied model string to a protocol URI.

    Strings already in a recognized protocol form (azure_open_ai long or
    short form, hugging_face), as well as None/empty strings, are returned
    unchanged. Anything else is treated as an Azure OpenAI deployment name
    and wrapped into the long-form protocol, using the same value for both
    the deployment and model segments.
    """
    if not model or re.match(OPEN_AI_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
        return model
    # BUG FIX: previously matched against the raw template (containing a
    # literal "{}") instead of the expanded short-form regex.
    if re.match(OPEN_AI_SHORT_FORM_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
        return model
    if re.match(HUGGINGFACE_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
        return model

    return OPEN_AI_PROTOCOL_TEMPLATE.format(model, model)
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------


# Declare this directory as part of a pkgutil-style namespace package so
# other distributions can contribute modules under the same package name.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)
Loading