Skip to content

Commit 3633eda

Browse files
committed
Add Langchain retriever
0 parents  commit 3633eda

File tree

9 files changed

+1513
-0
lines changed

9 files changed

+1513
-0
lines changed

.gitignore

+175
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
share/python-wheels/
24+
*.egg-info/
25+
.installed.cfg
26+
*.egg
27+
MANIFEST
28+
29+
# PyInstaller
30+
# Usually these files are written by a python script from a template
31+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
32+
*.manifest
33+
*.spec
34+
35+
# Installer logs
36+
pip-log.txt
37+
pip-delete-this-directory.txt
38+
39+
# Unit test / coverage reports
40+
htmlcov/
41+
.tox/
42+
.nox/
43+
.coverage
44+
.coverage.*
45+
.cache
46+
nosetests.xml
47+
coverage.xml
48+
*.cover
49+
*.py,cover
50+
.hypothesis/
51+
.pytest_cache/
52+
cover/
53+
54+
# Translations
55+
*.mo
56+
*.pot
57+
58+
# Django stuff:
59+
*.log
60+
local_settings.py
61+
db.sqlite3
62+
db.sqlite3-journal
63+
64+
# Flask stuff:
65+
instance/
66+
.webassets-cache
67+
68+
# Scrapy stuff:
69+
.scrapy
70+
71+
# Sphinx documentation
72+
docs/_build/
73+
74+
# PyBuilder
75+
.pybuilder/
76+
target/
77+
78+
# Jupyter Notebook
79+
.ipynb_checkpoints
80+
81+
# IPython
82+
profile_default/
83+
ipython_config.py
84+
85+
# pyenv
86+
# For a library or package, you might want to ignore these files since the code is
87+
# intended to run in multiple environments; otherwise, check them in:
88+
# .python-version
89+
90+
# pipenv
91+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
93+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
94+
# install all needed dependencies.
95+
#Pipfile.lock
96+
97+
# UV
98+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99+
# This is especially recommended for binary packages to ensure reproducibility, and is more
100+
# commonly ignored for libraries.
101+
#uv.lock
102+
103+
# poetry
104+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105+
# This is especially recommended for binary packages to ensure reproducibility, and is more
106+
# commonly ignored for libraries.
107+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108+
#poetry.lock
109+
110+
# pdm
111+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112+
#pdm.lock
113+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114+
# in version control.
115+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116+
.pdm.toml
117+
.pdm-python
118+
.pdm-build/
119+
120+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121+
__pypackages__/
122+
123+
# Celery stuff
124+
celerybeat-schedule
125+
celerybeat.pid
126+
127+
# SageMath parsed files
128+
*.sage.py
129+
130+
# Environments
131+
.env
132+
.venv
133+
env/
134+
venv/
135+
ENV/
136+
env.bak/
137+
venv.bak/
138+
139+
# Spyder project settings
140+
.spyderproject
141+
.spyproject
142+
143+
# Rope project settings
144+
.ropeproject
145+
146+
# mkdocs documentation
147+
/site
148+
149+
# mypy
150+
.mypy_cache/
151+
.dmypy.json
152+
dmypy.json
153+
154+
# Pyre type checker
155+
.pyre/
156+
157+
# pytype static type analyzer
158+
.pytype/
159+
160+
# Cython debug symbols
161+
cython_debug/
162+
163+
# PyCharm
164+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166+
# and can be added to the global gitignore or merged into this file. For a more nuclear
167+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
168+
#.idea/
169+
170+
# Ruff stuff:
171+
.ruff_cache/
172+
173+
# PyPI configuration file
174+
.pypirc
175+

langchain/README.md

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# langchain-vectorize
2+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Vectorize integrations with LangChain."""
+145
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
"""Vectorize LangChain retrievers."""
2+
3+
from __future__ import annotations
4+
5+
from typing import TYPE_CHECKING, Any, Optional
6+
7+
import vectorize_client
8+
from langchain_core.documents import Document
9+
from langchain_core.retrievers import BaseRetriever
10+
from typing_extensions import override
11+
from vectorize_client import (
12+
ApiClient,
13+
Configuration,
14+
PipelinesApi,
15+
RetrieveDocumentsRequest,
16+
)
17+
18+
if TYPE_CHECKING:
19+
from langchain_core.callbacks import CallbackManagerForRetrieverRun
20+
from langchain_core.runnables import RunnableConfig
21+
22+
_METADATA_FIELDS = {
23+
"relevancy",
24+
"chunk_id",
25+
"total_chunks",
26+
"origin",
27+
"origin_id",
28+
"similarity",
29+
"source",
30+
"unique_source",
31+
"source_display_name",
32+
"pipeline_id",
33+
"org_id",
34+
}
35+
_NOT_SET = object()
36+
37+
38+
class VectorizeRetriever(BaseRetriever):
    """Vectorize retriever.

    Retrieves documents from a Vectorize pipeline and converts them to
    LangChain ``Document`` objects. The fields below are the defaults used
    for every retrieval; each of them except ``api_token`` can be overridden
    per call through the keyword arguments of :meth:`invoke`.
    """

    api_token: str
    """The Vectorize API token."""
    organization: Optional[str] = None  # noqa: UP007
    """The Vectorize organization ID."""
    pipeline_id: Optional[str] = None  # noqa: UP007
    """The Vectorize pipeline ID."""
    num_results: int = 5
    """The number of documents to return."""
    rerank: bool = False
    """Whether to rerank the results."""
    metadata_filters: list[dict[str, Any]] = []
    """The metadata filters to apply when retrieving the documents."""

    # Pipelines API client; created in ``model_post_init`` from ``api_token``.
    _pipelines: PipelinesApi | None = None

    @override
    def model_post_init(self, /, context: Any) -> None:
        """Create the Vectorize pipelines client from ``api_token``.

        Args:
            context: Unused by this implementation.
        """
        api = ApiClient(Configuration(access_token=self.api_token))
        self._pipelines = PipelinesApi(api)

    @staticmethod
    def _convert_document(document: vectorize_client.models.Document) -> Document:
        """Convert a Vectorize document into a LangChain ``Document``.

        Args:
            document: A document returned by the Vectorize API.

        Returns:
            A ``Document`` with the Vectorize ``id``, the Vectorize ``text`` as
            ``page_content``, and every attribute named in ``_METADATA_FIELDS``
            copied into ``metadata``.
        """
        metadata = {field: getattr(document, field) for field in _METADATA_FIELDS}
        return Document(id=document.id, page_content=document.text, metadata=metadata)

    @override
    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun,
        organization: str | None = None,
        pipeline_id: str | None = None,
        num_results: int | None = None,
        rerank: bool | None = None,
        metadata_filters: list[dict[str, Any]] | None = None,
    ) -> list[Document]:
        """Query the Vectorize pipeline and return the matching documents.

        Args:
            query: The query string.
            run_manager: Callback manager for the run (not used here).
            organization: Organization ID; falls back to ``self.organization``
                when empty or ``None``.
            pipeline_id: Pipeline ID; falls back to ``self.pipeline_id`` when
                empty or ``None``.
            num_results: Number of results; falls back to ``self.num_results``
                when ``None``.
            rerank: Whether to rerank; falls back to ``self.rerank`` when
                ``None``.
            metadata_filters: Metadata filters; falls back to
                ``self.metadata_filters`` when ``None``.

        Returns:
            The retrieved documents converted to LangChain ``Document`` objects.
        """
        # Fall back to the instance defaults only when an override is absent
        # (``None``). Using ``or`` here would discard explicit falsy overrides
        # such as ``rerank=False`` or ``metadata_filters=[]``.
        if num_results is None:
            num_results = self.num_results
        if rerank is None:
            rerank = self.rerank
        if metadata_filters is None:
            metadata_filters = self.metadata_filters

        request = RetrieveDocumentsRequest(
            question=query,
            num_results=num_results,
            rerank=rerank,
            metadata_filters=metadata_filters,
        )
        # An empty string is how ``invoke`` marks these two as "not provided",
        # so a truthiness fallback is the intended behavior for them.
        response = self._pipelines.retrieve_documents(
            organization or self.organization, pipeline_id or self.pipeline_id, request
        )
        return [self._convert_document(doc) for doc in response.documents]

    @override
    def invoke(
        self,
        input: str,
        config: RunnableConfig | None = None,
        *,
        organization: str = "",
        pipeline_id: str = "",
        num_results: int = _NOT_SET,
        rerank: bool = _NOT_SET,
        metadata_filters: list[dict[str, Any]] = _NOT_SET,
    ) -> list[Document]:
        """Invoke the retriever to get relevant documents.

        Main entry point for retriever invocations.

        Args:
            input: The query string.
            config: Configuration for the retriever. Defaults to None.
            organization: The organization to retrieve documents from.
                If set, overrides the organization set at the initialization of the
                retriever.
            pipeline_id: The pipeline ID to retrieve documents from.
                If set, overrides the pipeline ID set at the initialization of the
                retriever.
            num_results: The number of results to retrieve.
                If set, overrides the number of results set at the initialization of
                the retriever.
            rerank: Whether to rerank the retrieved documents.
                If set, overrides the reranking set at the initialization of the
                retriever.
            metadata_filters: The metadata filters to apply when retrieving documents.
                If set, overrides the metadata filters set at the initialization of the
                retriever.

        Returns:
            List of relevant documents.

        Examples:

        .. code-block:: python

            retriever.invoke("query")
        """
        # Forward only the overrides the caller actually supplied, so that the
        # retrieval step can apply the instance defaults for everything else.
        kwargs = {}
        if organization:
            kwargs["organization"] = organization
        if pipeline_id:
            kwargs["pipeline_id"] = pipeline_id
        if num_results is not _NOT_SET:
            kwargs["num_results"] = num_results
        if rerank is not _NOT_SET:
            kwargs["rerank"] = rerank
        if metadata_filters is not _NOT_SET:
            kwargs["metadata_filters"] = metadata_filters

        return super().invoke(input, config, **kwargs)

0 commit comments

Comments
 (0)