Skip to content

Revert "cleanup duplicated code gen ai sdk (#34048)" #34224

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ def build_index(
from azure.ai.generative.index._documents import DocumentChunksIterator, split_documents
from azure.ai.generative.index._embeddings import EmbeddingsContainer
from azure.ai.generative.index._tasks.update_acs import create_index_from_raw_embeddings
from azure.ai.generative.index._utils.connections import get_connection_by_id_v2
from azure.ai.generative.index._utils.logging import disable_mlflow
from azure.ai.resources._index._utils.connections import get_connection_by_id_v2
except ImportError as e:
print("In order to use build_index to build an Index locally, you must have azure-ai-generative[index] installed")
raise e
Expand Down Expand Up @@ -176,7 +176,7 @@ def _create_mlindex_from_existing_acs(
) -> Index:
try:
from azure.ai.generative.index._embeddings import EmbeddingsContainer
from azure.ai.resources._index._utils.connections import get_connection_by_id_v2
from azure.ai.generative.index._utils.connections import get_connection_by_id_v2
except ImportError as e:
print("In order to use build_index to build an Index locally, you must have azure-ai-generative[index] installed")
raise e
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
"""DataIndex configuration and operations."""

# Allow sibling distributions to contribute modules under this package name.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)

from azure.ai.generative.index._dataindex.entities import (
    CitationRegex,
    Data,
    DataIndex,
    Embedding,
    IndexSource,
    IndexStore,
    index_data,
)
from azure.ai.generative.index._dataindex.operations import DataOperations

# Public API of the _dataindex package.
__all__ = [
    "DataOperations",
    "DataIndex",
    "IndexSource",
    "Data",
    "CitationRegex",
    "Embedding",
    "IndexStore",
    "index_data",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------


# Declare this directory as part of a pkgutil-style namespace package so
# other distributions can contribute modules under the same package name.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
"""Marshmallow schemas for DataIndex configuration."""

# Allow sibling distributions to contribute modules under this package name.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)

from .data_index import CitationRegexSchema
from .data_index import DataIndexSchema
from .data_index import DataIndexTypes
from .data_index import EmbeddingSchema
from .data_index import IndexSourceSchema
from .data_index import IndexStoreSchema

# Public API of the schema subpackage.
__all__ = [
    "DataIndexSchema",
    "IndexSourceSchema",
    "CitationRegexSchema",
    "EmbeddingSchema",
    "IndexStoreSchema",
    "DataIndexTypes",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

# pylint: disable=unused-argument

from marshmallow import fields, post_load

from azure.ai.ml._schema.assets.data import DataSchema
from azure.ai.ml._schema.core.fields import ArmVersionedStr, LocalPathField, NestedField, StringTransformedEnum, UnionField
from azure.ai.ml._schema.core.schema import PatchedSchemaMeta
from azure.ai.ml._schema.job.input_output_entry import generate_datastore_property
from azure.ai.ml._utils._experimental import experimental
from azure.ai.ml.constants._common import AssetTypes, AzureMLResourceType, InputOutputModes


# FROM: azure.ai.ml._schema.job.input_output_entry
def generate_path_property(azureml_type, **kwargs):
    """Build a strict UnionField accepting the path forms a data asset may use.

    Accepted forms: an ARM versioned asset id, an http(s) URL, a wasb(s)
    URL, an explicit ``file:`` path, or a plain local path that does not
    start with one of those scheme prefixes.
    """
    # NOTE(review): variants are presumably tried in order by UnionField —
    # the catch-all local path comes last.
    path_variants = [
        ArmVersionedStr(azureml_type=azureml_type),
        fields.Str(metadata={"pattern": r"^(http(s)?):.*"}),
        fields.Str(metadata={"pattern": r"^(wasb(s)?):.*"}),
        LocalPathField(pattern=r"^file:.*"),
        LocalPathField(pattern=r"^(?!(azureml|http(s)?|wasb(s)?|file):).*"),
    ]
    return UnionField(path_variants, is_strict=True, **kwargs)


class DataIndexTypes:
    """Enumeration of the types of indexes which can be written to by DataIndex."""

    ACS = "acs"
    """Azure Cognitive Search index type."""
    FAISS = "faiss"
    """Faiss index type."""


class CitationRegexSchema(metaclass=PatchedSchemaMeta):
    """Schema for a regex match/replacement pair used to rewrite citation URLs.

    FIX: the example strings in the two descriptions were swapped — the
    match example showed a replacement string and vice versa.
    """

    # Regex applied to citation_url + input file path; capture groups feed replacement_pattern.
    match_pattern = fields.Str(
        required=True,
        metadata={"description": r"Regex to match citation in the citation_url + input file path. e.g. '(.*)/articles/(.*)(\.[^.]+)$'"},
    )
    # Replacement string referencing the groups captured by match_pattern.
    replacement_pattern = fields.Str(
        required=True,
        metadata={"description": "Replacement string for citation. e.g. '\\1/\\2'"},
    )

    @post_load
    def make(self, data, **kwargs):
        """Materialize the validated dict into a CitationRegex entity."""
        from azure.ai.generative.index._dataindex.entities.data_index import CitationRegex

        return CitationRegex(**data)


class InputDataSchema(metaclass=PatchedSchemaMeta):
    """Schema for a uri_file/uri_folder data input used as a DataIndex source."""

    # Mount/download mode; optional, service default applies when omitted.
    mode = StringTransformedEnum(
        allowed_values=[
            InputOutputModes.RO_MOUNT,
            InputOutputModes.RW_MOUNT,
            InputOutputModes.DOWNLOAD,
        ],
        required=False,
    )
    # Restricted to single-file or folder asset types.
    type = StringTransformedEnum(
        allowed_values=[
            AssetTypes.URI_FILE,
            AssetTypes.URI_FOLDER,
        ]
    )
    # Accepts ARM id, http(s)/wasb(s) URL, file: URI, or plain local path.
    path = generate_path_property(azureml_type=AzureMLResourceType.DATA)
    datastore = generate_datastore_property()

    @post_load
    def make(self, data, **kwargs):
        """Build an azure.ai.ml Data entity from the validated fields."""
        from azure.ai.ml.entities import Data

        return Data(**data)


class InputMLTableSchema(metaclass=PatchedSchemaMeta):
    """Schema for an MLTable data input used as a DataIndex source."""

    # MLTable inputs use eval-style modes only.
    mode = StringTransformedEnum(
        allowed_values=[
            InputOutputModes.EVAL_MOUNT,
            InputOutputModes.EVAL_DOWNLOAD,
        ],
        required=False,
    )
    type = StringTransformedEnum(allowed_values=[AssetTypes.MLTABLE])
    # Accepts ARM id, http(s)/wasb(s) URL, file: URI, or plain local path.
    path = generate_path_property(azureml_type=AzureMLResourceType.DATA)
    datastore = generate_datastore_property()

    @post_load
    def make(self, data, **kwargs):
        """Build an azure.ai.ml Data entity from the validated fields."""
        from azure.ai.ml.entities import Data

        return Data(**data)


class IndexSourceSchema(metaclass=PatchedSchemaMeta):
    """Schema describing where and how source documents are read for indexing."""

    # Either a plain uri_file/uri_folder input or an MLTable input.
    input_data = UnionField(
        [NestedField(InputDataSchema), NestedField(InputMLTableSchema)],
        required=True,
        allow_none=False,
        metadata={"description": "Input Data to index files from. MLTable type inputs will use `mode: eval_mount`."},
    )
    input_glob = fields.Str(
        required=False,
        metadata={
            "description": "Glob pattern to filter files from input_data. If not specified, all files will be indexed."
        },
    )
    # Chunking controls, both expressed in token counts.
    chunk_size = fields.Int(
        required=False,
        allow_none=False,
        metadata={"description": "Maximum number of tokens to put in each chunk."},
    )
    chunk_overlap = fields.Int(
        required=False,
        allow_none=False,
        metadata={"description": "Number of tokens to overlap between chunks."},
    )
    citation_url = fields.Str(
        required=False,
        metadata={"description": "Base URL to join with file paths to create full source file URL for chunk metadata."},
    )
    # Optional regex rewrite applied to the generated citation URLs.
    citation_url_replacement_regex = NestedField(
        CitationRegexSchema,
        required=False,
        metadata={
            "description": "Regex match and replacement patterns for citation url. Useful if the paths in `input_data` "
            "don't match the desired citation format."
        },
    )

    @post_load
    def make(self, data, **kwargs):
        """Build an IndexSource entity from the validated fields."""
        from azure.ai.generative.index._dataindex.entities.data_index import IndexSource

        return IndexSource(**data)


class EmbeddingSchema(metaclass=PatchedSchemaMeta):
    """Schema for the embedding model configuration of a DataIndex.

    FIX: removed the duplicated word in the cache_path description
    ("used for for this component").
    """

    model = fields.Str(
        required=True,
        allow_none=False,
        metadata={
            "description": "The model to use to embed data. E.g. 'hugging_face://model/sentence-transformers/"
            "all-mpnet-base-v2' or 'azure_open_ai://deployment/{{deployment_name}}/model/{{model_name}}'"
        },
    )
    connection = fields.Str(
        required=False,
        metadata={
            "description": "Connection reference to use for embedding model information, "
            "only needed for hosted embeddings models (such as Azure OpenAI)."
        },
    )
    # Datastore path pointing at previously generated embeddings, enabling incremental reuse.
    cache_path = generate_path_property(
        azureml_type=AzureMLResourceType.DATASTORE,
        required=False,
        metadata={
            "description": "Folder containing previously generated embeddings. "
            "Should be parent folder of the 'embeddings' output path used for this component. "
            "Will compare input data to existing embeddings and only embed changed/new data, "
            "reusing existing chunks."
        },
    )

    @post_load
    def make(self, data, **kwargs):
        """Build an Embedding entity from the validated fields."""
        from azure.ai.generative.index._dataindex.entities.data_index import Embedding

        return Embedding(**data)


class IndexStoreSchema(metaclass=PatchedSchemaMeta):
    """Schema describing the target index store a DataIndex writes to.

    FIX: the implicit string concatenation in the config description was
    missing a space, producing "settings.Such as".
    """

    type = StringTransformedEnum(
        allowed_values=[
            DataIndexTypes.ACS,
            DataIndexTypes.FAISS,
        ],
        metadata={"description": "The type of index to write to. Currently supported types are 'acs' and 'faiss'."},
    )
    name = fields.Str(
        required=False,
        metadata={"description": "Name of the index to write to. If not specified, a name will be generated."},
    )
    connection = fields.Str(
        required=False,
        metadata={
            "description": "Connection reference to use for index information, "
            "only needed for hosted indexes (such as Azure Cognitive Search)."
        },
    )
    # Free-form index configuration (e.g. ACS `field_mapping` overrides).
    config = fields.Dict(
        required=False,
        metadata={
            "description": "Configuration for the index. Primary use is to configure Azure Cognitive Search "
            "specific settings. Such as custom `field_mapping` for known field types."
        },
    )

    @post_load
    def make(self, data, **kwargs):
        """Build an IndexStore entity from the validated fields."""
        from azure.ai.generative.index._dataindex.entities.data_index import IndexStore

        return IndexStore(**data)


@experimental
class DataIndexSchema(DataSchema):
    """Schema for a DataIndex asset: extends DataSchema with source, embedding and index sections."""

    source = NestedField(IndexSourceSchema, required=True, allow_none=False)  # where documents come from
    embedding = NestedField(EmbeddingSchema, required=True, allow_none=False)  # how chunks are embedded
    index = NestedField(IndexStoreSchema, required=True, allow_none=False)  # where the index is written
    incremental_update = fields.Bool()

    @post_load
    def make(self, data, **kwargs):
        """Build a DataIndex entity from the validated fields."""
        from azure.ai.generative.index._dataindex.entities.data_index import DataIndex

        return DataIndex(**data)
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------


# Declare this directory as part of a pkgutil-style namespace package so
# other distributions can contribute modules under the same package name.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

class DataIndexComponentUri(object):
DATA_INDEX_COG_SEARCH = "azureml://registries/azureml/components/llm_ingest_dataset_to_acs_basic/labels/default"
DATA_INDEX_FAISS = "azureml://registries/azureml/components/llm_ingest_dataset_to_faiss_basic/labels/default"

@staticmethod
def with_registry(component_uri: str, registry_name: str) -> str:
return component_uri.replace("azureml://registries/azureml", f"azureml://registries/{registry_name}")


class LLMRAGComponentUri:
    """Default registry URIs for the individual LLM RAG pipeline components.

    Modernized: dropped the redundant Python-2-style ``(object)`` base.
    """

    LLM_RAG_CRACK_AND_CHUNK = "azureml://registries/azureml/components/llm_rag_crack_and_chunk/labels/default"
    LLM_RAG_GENERATE_EMBEDDINGS = "azureml://registries/azureml/components/llm_rag_generate_embeddings/labels/default"
    LLM_RAG_CRACK_AND_CHUNK_AND_EMBED = (
        "azureml://registries/azureml/components/llm_rag_crack_and_chunk_and_embed/labels/default"
    )
    LLM_RAG_UPDATE_ACS_INDEX = "azureml://registries/azureml/components/llm_rag_update_acs_index/labels/default"
    LLM_RAG_CREATE_FAISS_INDEX = "azureml://registries/azureml/components/llm_rag_create_faiss_index/labels/default"
    LLM_RAG_REGISTER_MLINDEX_ASSET = (
        "azureml://registries/azureml/components/llm_rag_register_mlindex_asset/labels/default"
    )
    LLM_RAG_VALIDATE_DEPLOYMENTS = "azureml://registries/azureml/components/llm_rag_validate_deployments/labels/default"
    LLM_RAG_CREATE_PROMPTFLOW = "azureml://registries/azureml/components/llm_rag_create_promptflow/labels/default"
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
"""DataIndex configuration and operations."""

from azure.ai.generative.index._dataindex.data_index.models import build_model_protocol
from azure.ai.generative.index._dataindex.entities.data_index import (
    CitationRegex,
    Data,
    DataIndex,
    Embedding,
    IndexSource,
    IndexStore,
)
from azure.ai.generative.index._dataindex.entities._builders.data_index_func import index_data

# Public API of the entities subpackage.
__all__ = [
    "DataIndex",
    "IndexSource",
    "Data",
    "CitationRegex",
    "Embedding",
    "IndexStore",
    "index_data",
    "build_model_protocol",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
"""DataIndex embedding model helpers."""
import re
from typing import Optional

# Long-form protocol: azure_open_ai://deployment/<deployment>/model/<model>
OPEN_AI_PROTOCOL_TEMPLATE = "azure_open_ai://deployment/{}/model/{}"
OPEN_AI_PROTOCOL_REGEX_PATTERN = OPEN_AI_PROTOCOL_TEMPLATE.format(".*", ".*")
# Short form omits the model segment: azure_open_ai://deployment/<deployment>
OPEN_AI_SHORT_FORM_PROTOCOL_TEMPLATE = "azure_open_ai://deployments?/{}"
# BUG FIX: this assignment previously reused the name OPEN_AI_PROTOCOL_REGEX_PATTERN,
# clobbering the long-form regex above; the short-form pattern now has its own name.
OPEN_AI_SHORT_FORM_PROTOCOL_REGEX_PATTERN = OPEN_AI_SHORT_FORM_PROTOCOL_TEMPLATE.format(".*")

HUGGINGFACE_PROTOCOL_TEMPLATE = "hugging_face://model/{}"
HUGGINGFACE_PROTOCOL_REGEX_PATTERN = HUGGINGFACE_PROTOCOL_TEMPLATE.format(".*")


def build_model_protocol(model: Optional[str] = None) -> Optional[str]:
    """Normalize a user-supplied model string to a protocol URI.

    Strings already in a recognized protocol form (azure_open_ai long or
    short form, hugging_face), as well as None/empty strings, are returned
    unchanged. Anything else is treated as an Azure OpenAI deployment name
    and wrapped into the long-form protocol, using the same value for both
    the deployment and model segments.
    """
    if not model or re.match(OPEN_AI_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
        return model
    # BUG FIX: previously matched against the raw template (containing a
    # literal "{}") instead of the expanded short-form regex.
    if re.match(OPEN_AI_SHORT_FORM_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
        return model
    if re.match(HUGGINGFACE_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
        return model

    return OPEN_AI_PROTOCOL_TEMPLATE.format(model, model)
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------


# Declare this directory as part of a pkgutil-style namespace package so
# other distributions can contribute modules under the same package name.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)
Loading