Skip to content

Commit 457c710

Browse files
authored
Update vector_index for export() to take list of VectorIndexDef. (#304)
1 parent e8a6290 commit 457c710

File tree

15 files changed

+69
-43
lines changed

15 files changed

+69
-43
lines changed

README.md

+4-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,10 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
8080
"doc_embeddings",
8181
cocoindex.storages.Postgres(),
8282
primary_key_fields=["filename", "location"],
83-
vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
83+
vector_indexes=[
84+
cocoindex.VectorIndexDef(
85+
field_name="embedding",
86+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
8487
```
8588

8689
It defines an index flow like this:

docs/docs/core/flow_def.mdx

+4-2
Original file line numberDiff line numberDiff line change
@@ -259,8 +259,10 @@ Export must happen at the top level of a flow, i.e. not within any child scopes
259259

260260
* `name`: the name to identify the export target.
261261
* `target_spec`: the storage spec as the export target.
262-
* `primary_key_fields` (optional): the fields to be used as primary key. Types of the fields must be supported as key fields. See [Key Types](data_types#key-types) for more details.
263-
* `vector_index` (optional): the fields to create vector index. Each item is a tuple of a field name and a similarity metric. See [Vector Type](data_types#vector-type) for more details about supported similarity metrics.
262+
* `primary_key_fields` (`Sequence[str]`): the fields to be used as primary key. Types of the fields must be supported as key fields. See [Key Types](data_types#key-types) for more details.
263+
* `vector_indexes` (`Sequence[VectorIndexDef]`, optional): the fields to create vector index. `VectorIndexDef` has the following fields:
264+
* `field_name`: the field to create vector index.
265+
* `metric`: the similarity metric to use. See [Vector Type](data_types#vector-type) for more details about supported similarity metrics.
264266
* `setup_by_user` (optional):
265267
whether the export target is setup by user.
266268
By default, CocoIndex is managing the target setup (surfaced by the `cocoindex setup` CLI subcommand), e.g. create related tables/collections/etc. with compatible schema, and update them upon change.

docs/docs/getting_started/quickstart.md

+4-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,10 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
9797
"doc_embeddings",
9898
cocoindex.storages.Postgres(),
9999
primary_key_fields=["filename", "location"],
100-
vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
100+
vector_indexes=[
101+
cocoindex.VectorIndexDef(
102+
field_name="embedding",
103+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
101104
```
102105

103106
Notes:

examples/code_embedding/main.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,10 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
4141
"code_embeddings",
4242
cocoindex.storages.Postgres(),
4343
primary_key_fields=["filename", "location"],
44-
vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
44+
vector_indexes=[
45+
cocoindex.VectorIndexDef(
46+
field_name="embedding",
47+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
4548

4649

4750
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(

examples/docs_to_kg/main.py

+7-9
Original file line numberDiff line numberDiff line change
@@ -93,15 +93,13 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
9393
),
9494
nodes={
9595
"Entity": cocoindex.storages.Neo4jRelationshipNodeSpec(
96-
index_options=cocoindex.IndexOptions(
97-
primary_key_fields=["value"],
98-
vector_index_defs=[
99-
cocoindex.VectorIndexDef(
100-
field_name="embedding",
101-
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
102-
),
103-
],
104-
),
96+
primary_key_fields=["value"],
97+
vector_indexes=[
98+
cocoindex.VectorIndexDef(
99+
field_name="embedding",
100+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
101+
),
102+
],
105103
),
106104
},
107105
),

examples/gdrive_text_embedding/main.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,10 @@ def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope:
3737
"doc_embeddings",
3838
cocoindex.storages.Postgres(),
3939
primary_key_fields=["filename", "location"],
40-
vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
40+
vector_indexes=[
41+
cocoindex.VectorIndexDef(
42+
field_name="embedding",
43+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
4144

4245
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
4346
name="SemanticsSearch",

examples/pdf_embedding/main.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,10 @@ def pdf_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoinde
6363
"doc_embeddings",
6464
cocoindex.storages.Postgres(),
6565
primary_key_fields=["id"],
66-
vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
66+
vector_indexes=[
67+
cocoindex.VectorIndexDef(
68+
field_name="embedding",
69+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
6770

6871
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
6972
name="SemanticsSearch",

examples/text_embedding/main.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,10 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
3535
"doc_embeddings",
3636
cocoindex.storages.Postgres(),
3737
primary_key_fields=["filename", "location"],
38-
vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
38+
vector_indexes=[
39+
cocoindex.VectorIndexDef(
40+
field_name="embedding",
41+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
3942

4043
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
4144
name="SemanticsSearch",

python/cocoindex/flow.py

+14-8
Original file line numberDiff line numberDiff line change
@@ -267,21 +267,27 @@ def collect(self, **kwargs):
267267
self._engine_data_collector, regular_kwargs, auto_uuid_field)
268268

269269
def export(self, name: str, target_spec: op.StorageSpec, /, *,
270-
primary_key_fields: Sequence[str] | None = None,
270+
primary_key_fields: Sequence[str],
271+
vector_indexes: Sequence[index.VectorIndexDef] = (),
271272
vector_index: Sequence[tuple[str, index.VectorSimilarityMetric]] = (),
272273
setup_by_user: bool = False):
273274
"""
274275
Export the collected data to the specified target.
276+
277+
`vector_index` is for backward compatibility only. Please use `vector_indexes` instead.
275278
"""
276-
index_options: dict[str, Any] = {}
277-
if primary_key_fields is not None:
278-
index_options["primary_key_fields"] = primary_key_fields
279-
index_options["vector_index_defs"] = [
280-
{"field_name": field_name, "metric": metric.value}
281-
for field_name, metric in vector_index]
279+
# For backward compatibility only.
280+
if len(vector_indexes) == 0 and len(vector_index) > 0:
281+
vector_indexes = [index.VectorIndexDef(field_name=field_name, metric=metric)
282+
for field_name, metric in vector_index]
283+
284+
index_options = index.IndexOptions(
285+
primary_key_fields=primary_key_fields,
286+
vector_indexes=vector_indexes,
287+
)
282288
self._flow_builder_state.engine_flow_builder.export(
283289
name, _spec_kind(target_spec), dump_engine_object(target_spec),
284-
index_options, self._engine_data_collector, setup_by_user)
290+
dump_engine_object(index_options), self._engine_data_collector, setup_by_user)
285291

286292

287293
_flow_name_builder = _NameBuilder()

python/cocoindex/index.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from enum import Enum
22
from dataclasses import dataclass
3-
3+
from typing import Sequence
44
class VectorSimilarityMetric(Enum):
55
COSINE_SIMILARITY = "CosineSimilarity"
66
L2_DISTANCE = "L2Distance"
@@ -19,5 +19,5 @@ class IndexOptions:
1919
"""
2020
Options for an index.
2121
"""
22-
primary_key_fields: list[str] | None = None
23-
vector_index_defs: list[VectorIndexDef] | None = None
22+
primary_key_fields: Sequence[str]
23+
vector_indexes: Sequence[VectorIndexDef] = ()

python/cocoindex/storages.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ class Neo4jRelationshipEndSpec:
3535
@dataclass
3636
class Neo4jRelationshipNodeSpec:
3737
"""Spec for a Neo4j node type."""
38-
key_field_name: str | None = None
39-
index_options: index.IndexOptions | None = None
38+
primary_key_fields: list[str]
39+
vector_indexes: list[index.VectorIndexDef] | None = None
4040
class Neo4jRelationship(op.StorageSpec):
4141
"""Graph storage powered by Neo4j."""
4242

src/base/spec.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ pub struct IndexOptions {
232232
#[serde(default, skip_serializing_if = "Option::is_none")]
233233
pub primary_key_fields: Option<Vec<FieldName>>,
234234
#[serde(default, skip_serializing_if = "Vec::is_empty")]
235-
pub vector_index_defs: Vec<VectorIndexDef>,
235+
pub vector_indexes: Vec<VectorIndexDef>,
236236
}
237237

238238
/// Store data to a given sink.

src/execution/query.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ impl SimpleSemanticsQueryHandler {
4040
.position(|export_op| export_op.name == target_name)
4141
.unwrap();
4242
let export_op = &flow.flow_instance.export_ops[export_op_idx];
43-
let vector_index_defs = &export_op.spec.index_options.vector_index_defs;
43+
let vector_indexes = &export_op.spec.index_options.vector_indexes;
4444
let execution_plan = flow.get_execution_plan().await?;
4545
let analyzed_export_op = &execution_plan.export_ops[export_op_idx];
4646
Ok(Self {
@@ -55,8 +55,8 @@ impl SimpleSemanticsQueryHandler {
5555
},
5656
query_transform_flow,
5757
default_similarity_metric,
58-
default_vector_field_name: if vector_index_defs.len() == 1 {
59-
Some(vector_index_defs[0].field_name.clone())
58+
default_vector_field_name: if vector_indexes.len() == 1 {
59+
Some(vector_indexes[0].field_name.clone())
6060
} else {
6161
None
6262
},

src/ops/storages/neo4j.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ pub struct RelationshipEndSpec {
3939

4040
#[derive(Debug, Deserialize)]
4141
pub struct RelationshipNodeSpec {
42+
#[serde(flatten)]
4243
index_options: spec::IndexOptions,
4344
}
4445

@@ -586,7 +587,7 @@ impl NodeLabelSetupState {
586587
key_constraint_name,
587588
vector_indexes: spec
588589
.index_options
589-
.vector_index_defs
590+
.vector_indexes
590591
.iter()
591592
.map(|v| -> Result<_> {
592593
Ok((
@@ -639,7 +640,7 @@ impl RelationshipSetupState {
639640
key_field_names,
640641
key_constraint_name: format!("r__{}__key", spec.rel_type),
641642
vector_indexes: index_options
642-
.vector_index_defs
643+
.vector_indexes
643644
.iter()
644645
.map(|v| -> Result<_> {
645646
Ok((

src/ops/storages/postgres.rs

+8-7
Original file line numberDiff line numberDiff line change
@@ -508,7 +508,7 @@ impl SetupState {
508508
.map(|f| (f.name.clone(), f.value_type.typ.without_attrs()))
509509
.collect(),
510510
vector_indexes: index_options
511-
.vector_index_defs
511+
.vector_indexes
512512
.iter()
513513
.map(|v| (to_vector_index_name(&table_id.table_name, v), v.clone()))
514514
.collect(),
@@ -611,11 +611,11 @@ impl SetupStatusCheck {
611611
.value_fields_schema
612612
.iter()
613613
.filter(|(field_name, schema)| {
614-
existing.possible_versions().any(|v| {
614+
!existing.current.as_ref().map_or(false, |v| {
615615
v.value_fields_schema
616616
.get(*field_name)
617617
.map(to_column_type_sql)
618-
!= Some(to_column_type_sql(schema))
618+
== Some(to_column_type_sql(schema))
619619
})
620620
})
621621
.map(|(k, v)| (k.clone(), v.clone()))
@@ -639,9 +639,10 @@ impl SetupStatusCheck {
639639
.vector_indexes
640640
.iter()
641641
.filter(|(name, def)| {
642-
existing
643-
.possible_versions()
644-
.any(|v| v.vector_indexes.get(*name) != Some(def))
642+
!existing
643+
.current
644+
.as_ref()
645+
.map_or(false, |v| v.vector_indexes.get(*name) != Some(def))
645646
})
646647
.map(|(k, v)| (k.clone(), v.clone()))
647648
.collect(),
@@ -879,7 +880,7 @@ impl setup::ResourceSetupStatusCheck for SetupStatusCheck {
879880
let sql = format!(
880881
"CREATE INDEX IF NOT EXISTS {} ON {} {}",
881882
index_name,
882-
index_spec.field_name,
883+
self.table_id.table_name,
883884
to_index_spec_sql(index_spec)
884885
);
885886
sqlx::query(&sql).execute(&db_pool).await?;

0 commit comments

Comments
 (0)