Skip to content

Commit 537647b

Browse files
authored
Revert "cleanup duplicated code gen ai sdk (#34048)" (#34224)
This reverts commit bb97437.
1 parent 95ad7b2 commit 537647b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

82 files changed

+6577
-218
lines changed

sdk/ai/azure-ai-generative/azure/ai/generative/index/_build_mlindex.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ def build_index(
3838
from azure.ai.generative.index._documents import DocumentChunksIterator, split_documents
3939
from azure.ai.generative.index._embeddings import EmbeddingsContainer
4040
from azure.ai.generative.index._tasks.update_acs import create_index_from_raw_embeddings
41+
from azure.ai.generative.index._utils.connections import get_connection_by_id_v2
4142
from azure.ai.generative.index._utils.logging import disable_mlflow
42-
from azure.ai.resources._index._utils.connections import get_connection_by_id_v2
4343
except ImportError as e:
4444
print("In order to use build_index to build an Index locally, you must have azure-ai-generative[index] installed")
4545
raise e
@@ -176,7 +176,7 @@ def _create_mlindex_from_existing_acs(
176176
) -> Index:
177177
try:
178178
from azure.ai.generative.index._embeddings import EmbeddingsContainer
179-
from azure.ai.resources._index._utils.connections import get_connection_by_id_v2
179+
from azure.ai.generative.index._utils.connections import get_connection_by_id_v2
180180
except ImportError as e:
181181
print("In order to use build_index to build an Index locally, you must have azure-ai-generative[index] installed")
182182
raise e
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
"""DataIndex configuration and operations."""

import pkgutil

# Declare this directory as a namespace package so other distributions
# can contribute modules under the same dotted path.
__path__ = pkgutil.extend_path(__path__, __name__)

from azure.ai.generative.index._dataindex.entities import Data, CitationRegex, DataIndex, Embedding, IndexSource, IndexStore, index_data
from azure.ai.generative.index._dataindex.operations import DataOperations

# Public surface re-exported from the entities and operations subpackages.
__all__ = [
    "DataOperations",
    "DataIndex",
    "IndexSource",
    "Data",
    "CitationRegex",
    "Embedding",
    "IndexStore",
    "index_data",
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

import pkgutil

# Make this package a namespace package so other distributions can
# contribute modules under the same dotted path.
__path__ = pkgutil.extend_path(__path__, __name__)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

import pkgutil

# Namespace-package declaration: lets sibling distributions extend this path.
__path__ = pkgutil.extend_path(__path__, __name__)

from .data_index import (
    CitationRegexSchema,
    DataIndexSchema,
    DataIndexTypes,
    EmbeddingSchema,
    IndexSourceSchema,
    IndexStoreSchema,
)

# Explicit public surface of the schemas subpackage.
__all__ = [
    "DataIndexSchema",
    "IndexSourceSchema",
    "CitationRegexSchema",
    "EmbeddingSchema",
    "IndexStoreSchema",
    "DataIndexTypes",
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
# ---------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# ---------------------------------------------------------
4+
5+
# pylint: disable=unused-argument
6+
7+
from marshmallow import fields, post_load
8+
9+
from azure.ai.ml._schema.assets.data import DataSchema
10+
from azure.ai.ml._schema.core.fields import ArmVersionedStr, LocalPathField, NestedField, StringTransformedEnum, UnionField
11+
from azure.ai.ml._schema.core.schema import PatchedSchemaMeta
12+
from azure.ai.ml._schema.job.input_output_entry import generate_datastore_property
13+
from azure.ai.ml._utils._experimental import experimental
14+
from azure.ai.ml.constants._common import AssetTypes, AzureMLResourceType, InputOutputModes
15+
16+
17+
# FROM: azure.ai.ml._schema.job.input_output_entry
def generate_path_property(azureml_type, **kwargs):
    """Build a strict ``UnionField`` accepting the supported path forms.

    Accepts an ARM versioned asset id of *azureml_type*, an http(s) or
    wasb(s) URL, an explicit ``file:`` local path, or a bare local path
    (anything not starting with a known scheme). Extra keyword arguments
    are forwarded to ``UnionField``.
    """
    accepted_forms = [
        ArmVersionedStr(azureml_type=azureml_type),
        fields.Str(metadata={"pattern": r"^(http(s)?):.*"}),
        fields.Str(metadata={"pattern": r"^(wasb(s)?):.*"}),
        LocalPathField(pattern=r"^file:.*"),
        LocalPathField(
            pattern=r"^(?!(azureml|http(s)?|wasb(s)?|file):).*",
        ),
    ]
    return UnionField(accepted_forms, is_strict=True, **kwargs)
32+
33+
34+
class DataIndexTypes:
    """Enumeration of the types of indexes which can be written to by DataIndex."""

    ACS = "acs"
    """Azure Cognitive Search index type."""
    FAISS = "faiss"
    """Faiss index type."""
41+
42+
43+
class CitationRegexSchema(metaclass=PatchedSchemaMeta):
    """Schema for the regex match/replace pair used to rewrite citation URLs."""

    # NOTE: the example strings in the two descriptions were previously swapped
    # (the replacement example documented match_pattern and vice versa).
    match_pattern = fields.Str(
        required=True,
        metadata={"description": r"Regex to match citation in the citation_url + input file path. e.g. '(.*)/articles/(.*)(\.[^.]+)$'"},
    )
    replacement_pattern = fields.Str(
        required=True,
        metadata={"description": "Replacement string for citation. e.g. '\\1/\\2'"},
    )

    @post_load
    def make(self, data, **kwargs):
        # Lazy import avoids a circular dependency between schemas and entities.
        from azure.ai.generative.index._dataindex.entities.data_index import CitationRegex

        return CitationRegex(**data)
58+
59+
60+
class InputDataSchema(metaclass=PatchedSchemaMeta):
    """Schema for a ``uri_file``/``uri_folder`` data input consumed by an index source."""

    # Mount/download mode is optional; the pipeline picks a default when absent.
    mode = StringTransformedEnum(
        allowed_values=[
            InputOutputModes.RO_MOUNT,
            InputOutputModes.RW_MOUNT,
            InputOutputModes.DOWNLOAD,
        ],
        required=False,
    )
    type = StringTransformedEnum(
        allowed_values=[
            AssetTypes.URI_FILE,
            AssetTypes.URI_FOLDER,
        ]
    )
    # Path may be an ARM id, http(s)/wasb(s) URL, or local path.
    path = generate_path_property(azureml_type=AzureMLResourceType.DATA)
    datastore = generate_datastore_property()

    @post_load
    def make(self, data, **kwargs):
        # Lazy import keeps schema import-time light.
        from azure.ai.ml.entities import Data

        return Data(**data)
83+
84+
85+
class InputMLTableSchema(metaclass=PatchedSchemaMeta):
    """Schema for an ``mltable`` data input consumed by an index source."""

    # MLTable inputs support only the eval_* modes.
    mode = StringTransformedEnum(
        allowed_values=[
            InputOutputModes.EVAL_MOUNT,
            InputOutputModes.EVAL_DOWNLOAD,
        ],
        required=False,
    )
    type = StringTransformedEnum(allowed_values=[AssetTypes.MLTABLE])
    # Path may be an ARM id, http(s)/wasb(s) URL, or local path.
    path = generate_path_property(azureml_type=AzureMLResourceType.DATA)
    datastore = generate_datastore_property()

    @post_load
    def make(self, data, **kwargs):
        # Lazy import keeps schema import-time light.
        from azure.ai.ml.entities import Data

        return Data(**data)
102+
103+
104+
class IndexSourceSchema(metaclass=PatchedSchemaMeta):
    """Schema for the source data to crack/chunk before embedding and indexing."""

    input_data = UnionField(
        [NestedField(InputDataSchema), NestedField(InputMLTableSchema)],
        required=True,
        allow_none=False,
        metadata={"description": "Input Data to index files from. MLTable type inputs will use `mode: eval_mount`."},
    )
    input_glob = fields.Str(
        required=False,
        metadata={
            "description": "Glob pattern to filter files from input_data. If not specified, all files will be indexed."
        },
    )
    chunk_size = fields.Int(
        required=False,
        allow_none=False,
        metadata={"description": "Maximum number of tokens to put in each chunk."},
    )
    chunk_overlap = fields.Int(
        required=False,
        allow_none=False,
        metadata={"description": "Number of tokens to overlap between chunks."},
    )
    citation_url = fields.Str(
        required=False,
        metadata={"description": "Base URL to join with file paths to create full source file URL for chunk metadata."},
    )
    citation_url_replacement_regex = NestedField(
        CitationRegexSchema,
        required=False,
        metadata={
            "description": "Regex match and replacement patterns for citation url. Useful if the paths in `input_data` "
            "don't match the desired citation format."
        },
    )

    @post_load
    def make(self, data, **kwargs):
        # Lazy import avoids a circular dependency between schemas and entities.
        from azure.ai.generative.index._dataindex.entities.data_index import IndexSource

        return IndexSource(**data)
145+
146+
147+
class EmbeddingSchema(metaclass=PatchedSchemaMeta):
    """Schema describing how source chunks are embedded: model, connection, and cache."""

    model = fields.Str(
        required=True,
        allow_none=False,
        metadata={
            "description": "The model to use to embed data. E.g. 'hugging_face://model/sentence-transformers/"
            "all-mpnet-base-v2' or 'azure_open_ai://deployment/{{deployment_name}}/model/{{model_name}}'"
        },
    )
    connection = fields.Str(
        required=False,
        metadata={
            "description": "Connection reference to use for embedding model information, "
            "only needed for hosted embeddings models (such as Azure OpenAI)."
        },
    )
    cache_path = generate_path_property(
        azureml_type=AzureMLResourceType.DATASTORE,
        required=False,
        metadata={
            "description": "Folder containing previously generated embeddings. "
            # Fixed doubled word: "used for for this component" -> "used for this component".
            "Should be parent folder of the 'embeddings' output path used for this component. "
            "Will compare input data to existing embeddings and only embed changed/new data, "
            "reusing existing chunks."
        },
    )

    @post_load
    def make(self, data, **kwargs):
        # Lazy import avoids a circular dependency between schemas and entities.
        from azure.ai.generative.index._dataindex.entities.data_index import Embedding

        return Embedding(**data)
179+
180+
181+
class IndexStoreSchema(metaclass=PatchedSchemaMeta):
    """Schema describing the target index that embeddings are written to."""

    type = StringTransformedEnum(
        allowed_values=[
            DataIndexTypes.ACS,
            DataIndexTypes.FAISS,
        ],
        metadata={"description": "The type of index to write to. Currently supported types are 'acs' and 'faiss'."},
    )
    name = fields.Str(
        required=False,
        metadata={"description": "Name of the index to write to. If not specified, a name will be generated."},
    )
    connection = fields.Str(
        required=False,
        metadata={
            "description": "Connection reference to use for index information, "
            "only needed for hosted indexes (such as Azure Cognitive Search)."
        },
    )
    config = fields.Dict(
        required=False,
        metadata={
            # Fixed missing space at the implicit string-concatenation boundary:
            # previously rendered as "...specific settings.Such as custom...".
            "description": "Configuration for the index. Primary use is to configure Azure Cognitive Search specific settings. "
            "Such as custom `field_mapping` for known field types."
        },
    )

    @post_load
    def make(self, data, **kwargs):
        # Lazy import avoids a circular dependency between schemas and entities.
        from azure.ai.generative.index._dataindex.entities.data_index import IndexStore

        return IndexStore(**data)
213+
214+
215+
@experimental
class DataIndexSchema(DataSchema):
    """Schema for a DataIndex asset: source to crack, embedding config, and target index."""

    source = NestedField(IndexSourceSchema, required=True, allow_none=False)
    embedding = NestedField(EmbeddingSchema, required=True, allow_none=False)
    index = NestedField(IndexStoreSchema, required=True, allow_none=False)
    # When set, only changed/new source data is re-embedded and re-indexed.
    incremental_update = fields.Bool()

    @post_load
    def make(self, data, **kwargs):
        # Lazy import avoids a circular dependency between schemas and entities.
        from azure.ai.generative.index._dataindex.entities.data_index import DataIndex

        return DataIndex(**data)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

import pkgutil

# Make this package a namespace package so other distributions can
# contribute modules under the same dotted path.
__path__ = pkgutil.extend_path(__path__, __name__)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# ---------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# ---------------------------------------------------------
4+
5+
class DataIndexComponentUri(object):
    """Registry URIs for the top-level data-index ingestion pipeline components."""

    DATA_INDEX_COG_SEARCH = "azureml://registries/azureml/components/llm_ingest_dataset_to_acs_basic/labels/default"
    DATA_INDEX_FAISS = "azureml://registries/azureml/components/llm_ingest_dataset_to_faiss_basic/labels/default"

    @staticmethod
    def with_registry(component_uri: str, registry_name: str) -> str:
        """Return *component_uri* retargeted from the default 'azureml' registry to *registry_name*."""
        default_prefix = "azureml://registries/azureml"
        target_prefix = f"azureml://registries/{registry_name}"
        return component_uri.replace(default_prefix, target_prefix)
12+
13+
14+
class LLMRAGComponentUri(object):
    """Registry URIs for the individual llm_rag_* pipeline components.

    All URIs point at the shared 'azureml' registry and resolve the
    'default' label to the latest published component version.
    """

    # Data preparation / embedding stages.
    LLM_RAG_CRACK_AND_CHUNK = "azureml://registries/azureml/components/llm_rag_crack_and_chunk/labels/default"
    LLM_RAG_GENERATE_EMBEDDINGS = "azureml://registries/azureml/components/llm_rag_generate_embeddings/labels/default"
    LLM_RAG_CRACK_AND_CHUNK_AND_EMBED = (
        "azureml://registries/azureml/components/llm_rag_crack_and_chunk_and_embed/labels/default"
    )
    # Index construction / registration stages.
    LLM_RAG_UPDATE_ACS_INDEX = "azureml://registries/azureml/components/llm_rag_update_acs_index/labels/default"
    LLM_RAG_CREATE_FAISS_INDEX = "azureml://registries/azureml/components/llm_rag_create_faiss_index/labels/default"
    LLM_RAG_REGISTER_MLINDEX_ASSET = (
        "azureml://registries/azureml/components/llm_rag_register_mlindex_asset/labels/default"
    )
    # Validation and downstream consumption.
    LLM_RAG_VALIDATE_DEPLOYMENTS = "azureml://registries/azureml/components/llm_rag_validate_deployments/labels/default"
    LLM_RAG_CREATE_PROMPTFLOW = "azureml://registries/azureml/components/llm_rag_create_promptflow/labels/default"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
"""DataIndex configuration and operations."""

# NOTE: import order is preserved deliberately to avoid circular-import issues
# between the models, entity, and builder modules.
from azure.ai.generative.index._dataindex.data_index.models import build_model_protocol
from azure.ai.generative.index._dataindex.entities.data_index import CitationRegex, Data, DataIndex, Embedding, IndexSource, IndexStore
from azure.ai.generative.index._dataindex.entities._builders.data_index_func import index_data

# Explicit public surface of the entities subpackage.
__all__ = [
    "DataIndex",
    "IndexSource",
    "Data",
    "CitationRegex",
    "Embedding",
    "IndexStore",
    "index_data",
    "build_model_protocol",
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
"""DataIndex embedding model helpers."""
import re
from typing import Optional

OPEN_AI_PROTOCOL_TEMPLATE = "azure_open_ai://deployment/{}/model/{}"
OPEN_AI_PROTOCOL_REGEX_PATTERN = OPEN_AI_PROTOCOL_TEMPLATE.format(".*", ".*")
OPEN_AI_SHORT_FORM_PROTOCOL_TEMPLATE = "azure_open_ai://deployments?/{}"
# BUG FIX: this constant previously re-assigned OPEN_AI_PROTOCOL_REGEX_PATTERN,
# clobbering the long-form pattern above with the short-form one.
OPEN_AI_SHORT_FORM_PROTOCOL_REGEX_PATTERN = OPEN_AI_SHORT_FORM_PROTOCOL_TEMPLATE.format(".*")

HUGGINGFACE_PROTOCOL_TEMPLATE = "hugging_face://model/{}"
HUGGINGFACE_PROTOCOL_REGEX_PATTERN = HUGGINGFACE_PROTOCOL_TEMPLATE.format(".*")


def build_model_protocol(model: Optional[str] = None) -> Optional[str]:
    """Normalize a user-supplied embedding model string to a protocol URI.

    Strings already in a recognized protocol form (azure_open_ai long or
    short form, hugging_face), and empty/None values, are returned
    unchanged. Any other bare model name is assumed to name both the
    Azure OpenAI deployment and model, and is wrapped in the long-form
    ``azure_open_ai://deployment/{name}/model/{name}`` protocol.

    :param model: Raw model string from user input, or None.
    :return: The normalized protocol URI, or the input unchanged.
    """
    if not model or re.match(OPEN_AI_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
        return model
    # BUG FIX: previously matched against the short-form *template* (which
    # contains a literal '{}') instead of the compiled regex, so this branch
    # could never match a real short-form URI.
    if re.match(OPEN_AI_SHORT_FORM_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
        return model
    if re.match(HUGGINGFACE_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
        return model

    return OPEN_AI_PROTOCOL_TEMPLATE.format(model, model)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

import pkgutil

# Make this package a namespace package so other distributions can
# contribute modules under the same dotted path.
__path__ = pkgutil.extend_path(__path__, __name__)

0 commit comments

Comments
 (0)