
Commit 8733bc1

Merge branch 'main' into qdrant
2 parents d814974 + c3a7e50


52 files changed (+1372, -549 lines)

Cargo.toml (+3)

@@ -87,4 +87,7 @@ hyper-rustls = { version = "0.27.5" }
 yup-oauth2 = "12.1.0"
 rustls = { version = "0.23.25" }
 http-body-util = "0.1.3"
+yaml-rust2 = "0.10.0"
+urlencoding = "2.1.3"
 qdrant-client = "1.13.0"
+

docs/docs/about/community.md (+1, -1)

@@ -7,7 +7,7 @@ description: Join the CocoIndex community
 
 Welcome with a huge coconut hug 🥥⋆。˚🤗.
 
-We are super excited for community contributions of all kinds - whether it's code improvements, documentation updates, issue reports, feature requests on [GitHub](https://github.com/cocoIndex/cocoindex), and discussions in our [Discord](https://discord.com/invite/zpA9S2DR7s).
+We are super excited for community contributions of all kinds - whether it's code improvements, documentation updates, issue reports, feature requests on [GitHub](https://github.com/cocoindex-io/cocoindex), and discussions in our [Discord](https://discord.com/invite/zpA9S2DR7s).
 
 We would love to fostering an inclusive, welcoming, and supportive environment. Contributing to CocoIndex should feel collaborative, friendly and enjoyable for everyone. Together, we can build better AI applications through robust data infrastructure.

docs/docs/about/contributing.md (+1, -1)

@@ -36,7 +36,7 @@ We love contributions from our community! This guide explains how to get involve
 
 To submit your code:
 
-1. Fork the [CocoIndex repository](https://github.com/cocoIndex/cocoindex)
+1. Fork the [CocoIndex repository](https://github.com/cocoindex-io/cocoindex)
 2. [Create a new branch](https://docs.github.com/en/desktop/making-changes-in-a-branch/managing-branches-in-github-desktop) on your fork
 3. Make your changes
 4. [Open a Pull Request (PR)](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork) when your work is ready for review

docs/docs/core/cli.mdx (+1)

@@ -65,6 +65,7 @@ The following subcommands are available:
 | `setup` | Check and apply setup changes for flows, including the internal and target storage (to export). |
 | `show` | Show the spec for a specific flow. |
 | `update` | Update the index defined by the flow. |
+| `evaluate` | Evaluate the flow and dump flow outputs to files. Instead of updating the index, it dumps what should be indexed to files. Mainly used for evaluation purpose. |
 
 Use `--help` to see the full list of subcommands, and `subcommand --help` to see the usage of a specific one.

docs/docs/core/flow_methods.mdx (+21, -1)

@@ -12,7 +12,7 @@ After a flow is defined as discussed in [Flow Definition](/docs/core/flow_def),
 
 ## update
 
-The `update()` method will update will update the index defined by the flow.
+The `update()` method will update the index defined by the flow.
 
 Once the function returns, the indice is fresh up to the moment when the function is called.
 
@@ -23,5 +23,25 @@ Once the function returns, the indice is fresh up to the moment when the functio
 flow.update()
 ```
 
+</TabItem>
+</Tabs>
+
+## evaluate_and_dump
+
+The `evaluate_and_dump()` method evaluates the flow and dump flow outputs to files.
+
+It takes a `EvaluateAndDumpOptions` dataclass as input to configure, with the following fields:
+
+* `output_dir` (type: `str`, required): The directory to dump the result to.
+* `use_cache` (type: `bool`, default: `True`): Use already-cached intermediate data if available.
+  Note that we only reuse existing cached data without updating the cache even if it's turned on.
+
+<Tabs>
+<TabItem value="python" label="Python" default>
+
+```python
+flow.evaluate_and_dump(EvaluateAndDumpOptions(output_dir="./eval_output"))
+```
+
 </TabItem>
 </Tabs>
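
A quick usage sketch of the options documented above: `my_flow` stands in for any flow already defined via `@cocoindex.flow_def`, and `EvaluateAndDumpOptions` is re-exported from the top-level `cocoindex` package per the `__init__.py` change further down in this commit.

```python
import cocoindex

# my_flow: a cocoindex flow defined elsewhere via @cocoindex.flow_def.
# Dump what would be indexed into ./eval_output, recomputing everything
# instead of reusing cached intermediate data.
my_flow.evaluate_and_dump(
    cocoindex.EvaluateAndDumpOptions(output_dir="./eval_output", use_cache=False))
```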

docs/docs/getting_started/quickstart.md (+1, -1)

@@ -217,6 +217,6 @@ It will ask you to enter a query and it will return the top 10 results.
 Next, you may want to:
 
 * Learn about [CocoIndex Basics](../core/basics.md).
-* Learn about other examples in the [examples](https://github.com/cocoIndex/cocoindex/tree/main/examples) directory.
+* Learn about other examples in the [examples](https://github.com/cocoindex-io/cocoindex/tree/main/examples) directory.
   * The `text_embedding` example is this quickstart with some polishing (loading environment variables from `.env` file, extract pieces shared by the indexing flow and query handler into a function).
   * Pick other examples to learn upon your interest.

docs/docs/ops/functions.md (+11)

@@ -49,6 +49,17 @@ Return type: `vector[float32; N]`, where `N` is determined by the model
 * `output_type` (type: `type`, required): The type of the output. e.g. a dataclass type name. See [Data Types](/docs/core/data_types) for all supported data types. The LLM will output values that match the schema of the type.
 * `instruction` (type: `str`, optional): Additional instruction for the LLM.
 
+:::tip Clear type definitions
+
+Definitions of the `output_type` is fed into LLM as guidance to generate the output.
+To improve the quality of the extracted information, giving clear definitions for your dataclasses is especially important, e.g.
+
+* Provide readable field names for your dataclasses.
+* Provide reasonable docstrings for your dataclasses.
+* For any optional fields, clearly annotate that they are optional, by `SomeType | None` or `typing.Optional[SomeType]`.
+
+:::
+
 Input data:
 
 * `text` (type: `str`, required): The text to extract information from.
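
To make the tip above concrete, here is a minimal sketch of an `output_type` dataclass following those guidelines; the class and field names are invented for illustration:

```python
import dataclasses
import typing

@dataclasses.dataclass
class PaperMetadata:
    """Metadata extracted from the header of an academic paper."""
    title: str
    first_author: str
    # Optional information: annotate it as such so the LLM may leave it out.
    publication_year: typing.Optional[int] = None
```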

examples/code_embedding/README.md (+11, -1)

@@ -1,4 +1,14 @@
-Simple example for cocoindex: build embedding index based on local files.
+# Build embedding index for codebase
+
+![Build embedding index for codebase](https://cocoindex.io/blogs/assets/images/cover-9bf0a7cff69b66a40918ab2fc1cea0c7.png)
+
+In this example, we will build an embedding index for a codebase using CocoIndex. CocoIndex provides built-in support for code base chunking, with native Tree-sitter support. [Tree-sitter](https://en.wikipedia.org/wiki/Tree-sitter_%28parser_generator%29) is a parser generator tool and an incremental parsing library, it is available in Rust 🦀 - [GitHub](https://github.com/tree-sitter/tree-sitter). CocoIndex has built-in Rust integration with Tree-sitter to efficiently parse code and extract syntax trees for various programming languages.
+
+
+Please give [Cocoindex on Github](https://github.com/cocoindex-io/cocoindex) a star to support us if you like our work. Thank you so much with a warm coconut hug 🥥🤗. [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
+
+You can find a detailed blog post with step by step tutorial and explanations [here](https://cocoindex.io/blogs/index-code-base-for-rag).
+
 
 ## Prerequisite
 [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
(2 binary files not shown)

examples/gdrive_text_embedding/main.py (+6, -11)

@@ -3,15 +3,6 @@
 import cocoindex
 import os
 
-def text_to_embedding(text: cocoindex.DataSlice) -> cocoindex.DataSlice:
-    """
-    Embed the text using a SentenceTransformer model.
-    This is a shared logic between indexing and querying, so extract it as a function.
-    """
-    return text.transform(
-        cocoindex.functions.SentenceTransformerEmbed(
-            model="sentence-transformers/all-MiniLM-L6-v2"))
-
 @cocoindex.flow_def(name="GoogleDriveTextEmbedding")
 def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
     """
@@ -33,7 +24,9 @@ def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope:
             language="markdown", chunk_size=2000, chunk_overlap=500)
 
         with doc["chunks"].row() as chunk:
-            chunk["embedding"] = text_to_embedding(chunk["text"])
+            chunk["embedding"] = chunk["text"].transform(
+                cocoindex.functions.SentenceTransformerEmbed(
+                    model="sentence-transformers/all-MiniLM-L6-v2"))
             doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
                                    text=chunk["text"], embedding=chunk["embedding"])
 
@@ -47,7 +40,9 @@ def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope:
     name="SemanticsSearch",
     flow=gdrive_text_embedding_flow,
     target_name="doc_embeddings",
-    query_transform_flow=text_to_embedding,
+    query_transform_flow=lambda text: text.transform(
+        cocoindex.functions.SentenceTransformerEmbed(
+            model="sentence-transformers/all-MiniLM-L6-v2")),
     default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
 
 @cocoindex.main_fn()

python/cocoindex/__init__.py (+1, -1)

@@ -2,7 +2,7 @@
 Cocoindex is a framework for building and running indexing pipelines.
 """
 from . import flow, functions, query, sources, storages, cli
-from .flow import FlowBuilder, DataScope, DataSlice, Flow, flow_def
+from .flow import FlowBuilder, DataScope, DataSlice, Flow, flow_def, EvaluateAndDumpOptions
 from .llm import LlmSpec, LlmApiType
 from .vector import VectorSimilarityMetric
 from .lib import *

python/cocoindex/cli.py (+24)

@@ -1,4 +1,5 @@
 import click
+import datetime
 
 from . import flow, lib
 from .setup import check_setup_status, CheckSetupStatusOptions, apply_setup_changes
@@ -52,6 +53,29 @@ def update(flow_name: str | None):
     stats = _flow_by_name(flow_name).update()
     print(stats)
 
+@cli.command()
+@click.argument("flow_name", type=str, required=False)
+@click.option(
+    "-o", "--output-dir", type=str, required=False,
+    help="The directory to dump the output to.")
+@click.option(
+    "-c", "--use-cache", is_flag=True, show_default=True, default=True,
+    help="Use already-cached intermediate data if available. "
+         "Note that we only reuse existing cached data without updating the cache "
+         "even if it's turned on.")
+def evaluate(flow_name: str | None, output_dir: str | None, use_cache: bool = True):
+    """
+    Evaluate the flow and dump flow outputs to files.
+
+    Instead of updating the index, it dumps what should be indexed to files.
+    Mainly used for evaluation purpose.
+    """
+    fl = _flow_by_name(flow_name)
+    if output_dir is None:
+        output_dir = f"eval_{fl.name}_{datetime.datetime.now().strftime('%y%m%d_%H%M%S')}"
+    options = flow.EvaluateAndDumpOptions(output_dir=output_dir, use_cache=use_cache)
+    fl.evaluate_and_dump(options)
+
 _default_server_settings = lib.ServerSettings.from_env()
 
 @cli.command()

python/cocoindex/flow.py (+29, -9)

@@ -9,6 +9,7 @@
 from typing import Any, Callable, Sequence, TypeVar, get_origin
 from threading import Lock
 from enum import Enum
+from dataclasses import dataclass
 
 from . import _engine
 from . import vector
@@ -61,18 +62,18 @@ def _create_data_slice(
 def _spec_kind(spec: Any) -> str:
     return spec.__class__.__name__
 
-def _spec_value_dump(v: Any) -> Any:
-    """Recursively dump a spec object and its nested attributes to a dictionary."""
+def _dump_engine_object(v: Any) -> Any:
+    """Recursively dump an object for engine. Engine side uses `Pythonzized` to catch."""
     if isinstance(v, type) or get_origin(v) is not None:
         return encode_enriched_type(v)
     elif isinstance(v, Enum):
         return v.value
     elif hasattr(v, '__dict__'):
-        return {k: _spec_value_dump(v) for k, v in v.__dict__.items()}
+        return {k: _dump_engine_object(v) for k, v in v.__dict__.items()}
     elif isinstance(v, (list, tuple)):
-        return [_spec_value_dump(item) for item in v]
+        return [_dump_engine_object(item) for item in v]
     elif isinstance(v, dict):
-        return {k: _spec_value_dump(v) for k, v in v.items()}
+        return {k: _dump_engine_object(v) for k, v in v.items()}
     return v
 
 T = TypeVar('T')
@@ -177,7 +178,7 @@ def transform(self, fn_spec: op.FunctionSpec, *args, **kwargs) -> DataSlice:
             lambda target_scope, name:
             flow_builder_state.engine_flow_builder.transform(
                 _spec_kind(fn_spec),
-                _spec_value_dump(fn_spec),
+                _dump_engine_object(fn_spec),
                 transform_args,
                 target_scope,
                 flow_builder_state.field_name_builder.build_name(
@@ -267,7 +268,7 @@ def export(self, name: str, target_spec: op.StorageSpec, /, *,
             {"field_name": field_name, "metric": metric.value}
             for field_name, metric in vector_index]
         self._flow_builder_state.engine_flow_builder.export(
-            name, _spec_kind(target_spec), _spec_value_dump(target_spec),
+            name, _spec_kind(target_spec), _dump_engine_object(target_spec),
             index_options, self._engine_data_collector)
 
 
@@ -316,13 +317,20 @@ def add_source(self, spec: op.SourceSpec, /, name: str | None = None) -> DataSli
             self._state,
             lambda target_scope, name: self._state.engine_flow_builder.add_source(
                 _spec_kind(spec),
-                _spec_value_dump(spec),
+                _dump_engine_object(spec),
                 target_scope,
                 self._state.field_name_builder.build_name(
                     name, prefix=_to_snake_case(_spec_kind(spec))+'_'),
             ),
             name
         )
+@dataclass
+class EvaluateAndDumpOptions:
+    """
+    Options for evaluating and dumping a flow.
+    """
+    output_dir: str
+    use_cache: bool = True
 
 class Flow:
     """
@@ -348,20 +356,32 @@ def __str__(self):
     def __repr__(self):
         return repr(self._lazy_engine_flow())
 
+    @property
+    def name(self) -> str:
+        """
+        Get the name of the flow.
+        """
+        return self._lazy_engine_flow().name()
+
     def update(self):
         """
         Update the index defined by the flow.
         Once the function returns, the indice is fresh up to the moment when the function is called.
         """
         return self._lazy_engine_flow().update()
 
+    def evaluate_and_dump(self, options: EvaluateAndDumpOptions):
+        """
+        Evaluate the flow and dump flow outputs to files.
+        """
+        return self._lazy_engine_flow().evaluate_and_dump(_dump_engine_object(options))
+
     def internal_flow(self) -> _engine.Flow:
         """
         Get the engine flow.
         """
         return self._lazy_engine_flow()
 
-
 def _create_lazy_flow(name: str | None, fl_def: Callable[[FlowBuilder, DataScope], None]) -> Flow:
     """
     Create a flow without really building it yet.

python/cocoindex/typing.py (+15, -4)

@@ -2,6 +2,7 @@
 import collections
 import dataclasses
 import types
+import inspect
 from typing import Annotated, NamedTuple, Any, TypeVar, TYPE_CHECKING, overload
 
 class Vector(NamedTuple):
@@ -130,15 +131,23 @@ def analyze_type_info(t) -> AnalyzedTypeInfo:
     elif t is float:
         kind = 'Float64'
     else:
-        raise ValueError(f"type unsupported yet: {base_type}")
+        raise ValueError(f"type unsupported yet: {t}")
 
     return AnalyzedTypeInfo(kind=kind, vector_info=vector_info, elem_type=elem_type,
                             dataclass_type=dataclass_type, attrs=attrs, nullable=nullable)
 
 def _encode_fields_schema(dataclass_type: type) -> list[dict[str, Any]]:
-    return [{ 'name': field.name,
-              **encode_enriched_type_info(analyze_type_info(field.type))
-            } for field in dataclasses.fields(dataclass_type)]
+    result = []
+    for field in dataclasses.fields(dataclass_type):
+        try:
+            type_info = encode_enriched_type_info(analyze_type_info(field.type))
+        except ValueError as e:
+            e.add_note(f"Failed to encode annotation for field - "
+                       f"{dataclass_type.__name__}.{field.name}: {field.type}")
+            raise
+        type_info['name'] = field.name
+        result.append(type_info)
+    return result
 
 def _encode_type(type_info: AnalyzedTypeInfo) -> dict[str, Any]:
     encoded_type: dict[str, Any] = { 'kind': type_info.kind }
@@ -147,6 +156,8 @@ def _encode_type(type_info: AnalyzedTypeInfo) -> dict[str, Any]:
         if type_info.dataclass_type is None:
             raise ValueError("Struct type must have a dataclass type")
         encoded_type['fields'] = _encode_fields_schema(type_info.dataclass_type)
+        if doc := inspect.getdoc(type_info.dataclass_type):
+            encoded_type['description'] = doc
 
     elif type_info.kind == 'Vector':
         if type_info.vector_info is None:
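
The new `description` handling above means a dataclass docstring now travels with the encoded struct schema. A small illustration using `inspect.getdoc` directly; the `Invoice` dataclass here is purely hypothetical:

```python
import dataclasses
import inspect

@dataclasses.dataclass
class Invoice:
    """A single invoice extracted from a scanned document."""
    vendor: str
    total_amount: float

# inspect.getdoc returns the cleaned docstring, which _encode_type now
# attaches to the encoded struct as encoded_type['description'].
print(inspect.getdoc(Invoice))  # A single invoice extracted from a scanned document.
```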

src/base/field_attrs.rs (+9, -3)

@@ -2,10 +2,16 @@ use const_format::concatcp;
 
 pub static COCOINDEX_PREFIX: &str = "cocoindex.io/";
 
-/// Expected mime types for bytes and str.
-pub static _MIME_TYPE: &str = concatcp!(COCOINDEX_PREFIX, "mime_type");
+/// Present for bytes and str. It points to fields that represents the original file name for the data.
+/// Type: AnalyzedValueMapping
+pub static CONTENT_FILENAME: &str = concatcp!(COCOINDEX_PREFIX, "content_filename");
 
-/// Base text for chunks.
+/// Present for bytes and str. It points to fields that represents mime types for the data.
+/// Type: AnalyzedValueMapping
+pub static CONTENT_MIME_TYPE: &str = concatcp!(COCOINDEX_PREFIX, "content_mime_type");
+
+/// Present for chunks. It points to fields that the chunks are for.
+/// Type: AnalyzedValueMapping
 pub static CHUNK_BASE_TEXT: &str = concatcp!(COCOINDEX_PREFIX, "chunk_base_text");
 
 /// Base text for an embedding vector.
