Skip to content

Commit 93a750d

Browse files
authored
[ENH] Multimodal Embedding Functions (chroma-core#1345)
## Description of changes

This PR introduces multi-modal embeddings into Chroma.

- Adds the generic `EmbeddingFunction`, which can take various data types. Existing functions take the `Documents` type.
- Adds `Images` as a type (numpy NDArray taking ints or floats).
- Adds `OpenCLIPEmbeddingFunction`, which is an `EmbeddingFunction[Union[Documents, Images]]`.

## Test

Integration tests pass. A new test for multimodal embedding functions: [chromadb/test/ef/test_multimodal_ef.py](https://github.com/chroma-core/chroma/blob/86a9e2620352ee0b2844bc3233f9e001cc4aa3d9/chromadb/test/ef/test_multimodal_ef.py)

## Documentation

See chroma-core#1294

## TODOs

- [x] Tests
- [x] ~Wiring through FastAPI~ Nothing to wire through
- [x] Documentation
- [x] Telemetry
- [ ] JavaScript
1 parent 5cce85b commit 93a750d

19 files changed

+1663
-269
lines changed

Diff for: chromadb/api/__init__.py

+36-9
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,15 @@
88
from chromadb.api.types import (
99
CollectionMetadata,
1010
Documents,
11+
Embeddable,
1112
EmbeddingFunction,
13+
DataLoader,
1214
Embeddings,
1315
IDs,
1416
Include,
17+
Loadable,
1518
Metadatas,
19+
URIs,
1620
Where,
1721
QueryResult,
1822
GetResult,
@@ -58,7 +62,10 @@ def create_collection(
5862
self,
5963
name: str,
6064
metadata: Optional[CollectionMetadata] = None,
61-
embedding_function: Optional[EmbeddingFunction] = ef.DefaultEmbeddingFunction(),
65+
embedding_function: Optional[
66+
EmbeddingFunction[Embeddable]
67+
] = ef.DefaultEmbeddingFunction(), # type: ignore
68+
data_loader: Optional[DataLoader[Loadable]] = None,
6269
get_or_create: bool = False,
6370
) -> Collection:
6471
"""Create a new collection with the given name and metadata.
@@ -90,9 +97,12 @@ def create_collection(
9097
@abstractmethod
9198
def get_collection(
9299
self,
93-
name: Optional[str] = None,
100+
name: str,
94101
id: Optional[UUID] = None,
95-
embedding_function: Optional[EmbeddingFunction] = ef.DefaultEmbeddingFunction(),
102+
embedding_function: Optional[
103+
EmbeddingFunction[Embeddable]
104+
] = ef.DefaultEmbeddingFunction(), # type: ignore
105+
data_loader: Optional[DataLoader[Loadable]] = None,
96106
) -> Collection:
97107
"""Get a collection with the given name.
98108
Args:
@@ -119,7 +129,10 @@ def get_or_create_collection(
119129
self,
120130
name: str,
121131
metadata: Optional[CollectionMetadata] = None,
122-
embedding_function: Optional[EmbeddingFunction] = ef.DefaultEmbeddingFunction(),
132+
embedding_function: Optional[
133+
EmbeddingFunction[Embeddable]
134+
] = ef.DefaultEmbeddingFunction(), # type: ignore
135+
data_loader: Optional[DataLoader[Loadable]] = None,
123136
) -> Collection:
124137
"""Get or create a collection with the given name and metadata.
125138
Args:
@@ -186,6 +199,7 @@ def _add(
186199
embeddings: Embeddings,
187200
metadatas: Optional[Metadatas] = None,
188201
documents: Optional[Documents] = None,
202+
uris: Optional[URIs] = None,
189203
) -> bool:
190204
"""[Internal] Add embeddings to a collection specified by UUID.
191205
If (some) ids already exist, only the new embeddings will be added.
@@ -196,6 +210,7 @@ def _add(
196210
embedding: The sequence of embeddings to add.
197211
metadata: The metadata to associate with the embeddings. Defaults to None.
198212
documents: The documents to associate with the embeddings. Defaults to None.
213+
uris: URIs of data sources for each embedding. Defaults to None.
199214
200215
Returns:
201216
True if the embeddings were added successfully.
@@ -210,6 +225,7 @@ def _update(
210225
embeddings: Optional[Embeddings] = None,
211226
metadatas: Optional[Metadatas] = None,
212227
documents: Optional[Documents] = None,
228+
uris: Optional[URIs] = None,
213229
) -> bool:
214230
"""[Internal] Update entries in a collection specified by UUID.
215231
@@ -219,7 +235,7 @@ def _update(
219235
embeddings: The sequence of embeddings to update. Defaults to None.
220236
metadatas: The metadata to associate with the embeddings. Defaults to None.
221237
documents: The documents to associate with the embeddings. Defaults to None.
222-
238+
uris: URIs of data sources for each embedding. Defaults to None.
223239
Returns:
224240
True if the embeddings were updated successfully.
225241
"""
@@ -233,6 +249,7 @@ def _upsert(
233249
embeddings: Embeddings,
234250
metadatas: Optional[Metadatas] = None,
235251
documents: Optional[Documents] = None,
252+
uris: Optional[URIs] = None,
236253
) -> bool:
237254
"""[Internal] Add or update entries in a collection specified by UUID.
238255
If an entry with the same id already exists, it will be updated,
@@ -244,6 +261,7 @@ def _upsert(
244261
embeddings: The sequence of embeddings to add
245262
metadatas: The metadata to associate with the embeddings. Defaults to None.
246263
documents: The documents to associate with the embeddings. Defaults to None.
264+
uris: URIs of data sources for each embedding. Defaults to None.
247265
"""
248266
pass
249267

@@ -486,7 +504,10 @@ def create_collection(
486504
self,
487505
name: str,
488506
metadata: Optional[CollectionMetadata] = None,
489-
embedding_function: Optional[EmbeddingFunction] = ef.DefaultEmbeddingFunction(),
507+
embedding_function: Optional[
508+
EmbeddingFunction[Embeddable]
509+
] = ef.DefaultEmbeddingFunction(), # type: ignore
510+
data_loader: Optional[DataLoader[Loadable]] = None,
490511
get_or_create: bool = False,
491512
tenant: str = DEFAULT_TENANT,
492513
database: str = DEFAULT_DATABASE,
@@ -497,9 +518,12 @@ def create_collection(
497518
@override
498519
def get_collection(
499520
self,
500-
name: Optional[str] = None,
521+
name: str,
501522
id: Optional[UUID] = None,
502-
embedding_function: Optional[EmbeddingFunction] = ef.DefaultEmbeddingFunction(),
523+
embedding_function: Optional[
524+
EmbeddingFunction[Embeddable]
525+
] = ef.DefaultEmbeddingFunction(), # type: ignore
526+
data_loader: Optional[DataLoader[Loadable]] = None,
503527
tenant: str = DEFAULT_TENANT,
504528
database: str = DEFAULT_DATABASE,
505529
) -> Collection:
@@ -511,7 +535,10 @@ def get_or_create_collection(
511535
self,
512536
name: str,
513537
metadata: Optional[CollectionMetadata] = None,
514-
embedding_function: Optional[EmbeddingFunction] = ef.DefaultEmbeddingFunction(),
538+
embedding_function: Optional[
539+
EmbeddingFunction[Embeddable]
540+
] = ef.DefaultEmbeddingFunction(), # type: ignore
541+
data_loader: Optional[DataLoader[Loadable]] = None,
515542
tenant: str = DEFAULT_TENANT,
516543
database: str = DEFAULT_DATABASE,
517544
) -> Collection:

Diff for: chromadb/api/client.py

+26-4
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,18 @@
66
from chromadb.api import AdminAPI, ClientAPI, ServerAPI
77
from chromadb.api.types import (
88
CollectionMetadata,
9+
DataLoader,
910
Documents,
11+
Embeddable,
1012
EmbeddingFunction,
1113
Embeddings,
1214
GetResult,
1315
IDs,
1416
Include,
17+
Loadable,
1518
Metadatas,
1619
QueryResult,
20+
URIs,
1721
)
1822
from chromadb.config import Settings, System
1923
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE
@@ -174,13 +178,17 @@ def create_collection(
174178
self,
175179
name: str,
176180
metadata: Optional[CollectionMetadata] = None,
177-
embedding_function: Optional[EmbeddingFunction] = ef.DefaultEmbeddingFunction(),
181+
embedding_function: Optional[
182+
EmbeddingFunction[Embeddable]
183+
] = ef.DefaultEmbeddingFunction(), # type: ignore
184+
data_loader: Optional[DataLoader[Loadable]] = None,
178185
get_or_create: bool = False,
179186
) -> Collection:
180187
return self._server.create_collection(
181188
name=name,
182189
metadata=metadata,
183190
embedding_function=embedding_function,
191+
data_loader=data_loader,
184192
tenant=self.tenant,
185193
database=self.database,
186194
get_or_create=get_or_create,
@@ -189,14 +197,18 @@ def create_collection(
189197
@override
190198
def get_collection(
191199
self,
192-
name: Optional[str] = None,
200+
name: str,
193201
id: Optional[UUID] = None,
194-
embedding_function: Optional[EmbeddingFunction] = ef.DefaultEmbeddingFunction(),
202+
embedding_function: Optional[
203+
EmbeddingFunction[Embeddable]
204+
] = ef.DefaultEmbeddingFunction(), # type: ignore
205+
data_loader: Optional[DataLoader[Loadable]] = None,
195206
) -> Collection:
196207
return self._server.get_collection(
197208
id=id,
198209
name=name,
199210
embedding_function=embedding_function,
211+
data_loader=data_loader,
200212
tenant=self.tenant,
201213
database=self.database,
202214
)
@@ -206,12 +218,16 @@ def get_or_create_collection(
206218
self,
207219
name: str,
208220
metadata: Optional[CollectionMetadata] = None,
209-
embedding_function: Optional[EmbeddingFunction] = ef.DefaultEmbeddingFunction(),
221+
embedding_function: Optional[
222+
EmbeddingFunction[Embeddable]
223+
] = ef.DefaultEmbeddingFunction(), # type: ignore
224+
data_loader: Optional[DataLoader[Loadable]] = None,
210225
) -> Collection:
211226
return self._server.get_or_create_collection(
212227
name=name,
213228
metadata=metadata,
214229
embedding_function=embedding_function,
230+
data_loader=data_loader,
215231
tenant=self.tenant,
216232
database=self.database,
217233
)
@@ -252,13 +268,15 @@ def _add(
252268
embeddings: Embeddings,
253269
metadatas: Optional[Metadatas] = None,
254270
documents: Optional[Documents] = None,
271+
uris: Optional[URIs] = None,
255272
) -> bool:
256273
return self._server._add(
257274
ids=ids,
258275
collection_id=collection_id,
259276
embeddings=embeddings,
260277
metadatas=metadatas,
261278
documents=documents,
279+
uris=uris,
262280
)
263281

264282
@override
@@ -269,13 +287,15 @@ def _update(
269287
embeddings: Optional[Embeddings] = None,
270288
metadatas: Optional[Metadatas] = None,
271289
documents: Optional[Documents] = None,
290+
uris: Optional[URIs] = None,
272291
) -> bool:
273292
return self._server._update(
274293
collection_id=collection_id,
275294
ids=ids,
276295
embeddings=embeddings,
277296
metadatas=metadatas,
278297
documents=documents,
298+
uris=uris,
279299
)
280300

281301
@override
@@ -286,13 +306,15 @@ def _upsert(
286306
embeddings: Embeddings,
287307
metadatas: Optional[Metadatas] = None,
288308
documents: Optional[Documents] = None,
309+
uris: Optional[URIs] = None,
289310
) -> bool:
290311
return self._server._upsert(
291312
collection_id=collection_id,
292313
ids=ids,
293314
embeddings=embeddings,
294315
metadatas=metadatas,
295316
documents=documents,
317+
uris=uris,
296318
)
297319

298320
@override

0 commit comments

Comments
 (0)