From bd8f2f57b623f142d86c3ffa9efaf8e5f57df5aa Mon Sep 17 00:00:00 2001 From: davidhou17 <55004296+davidhou17@users.noreply.github.com> Date: Wed, 28 May 2025 15:56:19 -0500 Subject: [PATCH] create new notebook --- .../langchain-parent-document-retrieval.ipynb | 2 +- .../langchain-self-query-retrieval.ipynb | 407 ++++++++++++++++++ 2 files changed, 408 insertions(+), 1 deletion(-) create mode 100644 ai-integrations/langchain-self-query-retrieval.ipynb diff --git a/ai-integrations/langchain-parent-document-retrieval.ipynb b/ai-integrations/langchain-parent-document-retrieval.ipynb index 0ba6392..dd54ceb 100644 --- a/ai-integrations/langchain-parent-document-retrieval.ipynb +++ b/ai-integrations/langchain-parent-document-retrieval.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook is a companion to the [LangChain Parent Document Retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/ai-integrations/langchain/parent-document-retrieval/) page. Refer to the page for set-up instructions and detailed explanations.\n", + "This notebook is a companion to the [Parent Document Retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/ai-integrations/langchain/parent-document-retrieval/) page. Refer to the page for set-up instructions and detailed explanations.\n", "\n", "\n", " \"Open\n", diff --git a/ai-integrations/langchain-self-query-retrieval.ipynb b/ai-integrations/langchain-self-query-retrieval.ipynb new file mode 100644 index 0000000..4bd0ed2 --- /dev/null +++ b/ai-integrations/langchain-self-query-retrieval.ipynb @@ -0,0 +1,407 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "57956324", + "metadata": {}, + "source": [ + "# LangChain MongoDB Integration - Self-Querying Retrieval\n", + "\n", + "This notebook is a companion to the [Self-Querying Retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/ai-integrations/langchain/parent-document-retrieval/) page. Refer to the page for set-up instructions and detailed explanations.\n", + "\n", + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "a9924d95", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "To complete this tutorial, you must have the following:\n", + "- A MongoDB Atlas cluster\n", + "- A Voyage AI API key\n", + "- An OpenAI API key" + ] + }, + { + "cell_type": "markdown", + "id": "fca4d65e", + "metadata": {}, + "source": [ + "## Set up the environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6283b411", + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "pip install --quiet --upgrade langchain-mongodb langchain-voyageai langchain-openai langchain langchain-community langchain-core lark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c7b5f46", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", + "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n", + "MONGODB_URI = \"\"" + ] + }, + { + "cell_type": "markdown", + "id": "33ce1710", + "metadata": {}, + "source": [ + "## Instantiate the vector store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "155f5870", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_mongodb import MongoDBAtlasVectorSearch\n", + "from langchain_voyageai import VoyageAIEmbeddings\n", + "\n", + "# Use the voyage-3-large embedding model\n", + "embedding_model = VoyageAIEmbeddings(model=\"voyage-3-large\")\n", + "\n", + "# Create the vector store\n", + "vector_store = MongoDBAtlasVectorSearch.from_connection_string(\n", + " connection_string = MONGODB_URI,\n", + " embedding = embedding_model,\n", + " namespace = \"langchain_db.self_query\",\n", + " text_key = \"page_content\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "652be7fb", + "metadata": {}, + "source": [ + "## Add data to the vector store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29191663", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.documents import Document\n", + "\n", + "docs = [\n", + " Document(\n", + " page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n", + " metadata={\"year\": 1993, \"rating\": 7.7, \"genre\": \"action\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\",\n", + " metadata={\"year\": 2010, \"genre\": \"thriller\", \"rating\": 8.2},\n", + " ),\n", + " Document(\n", + " page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n", + " metadata={\"year\": 2019, \"rating\": 8.3, \"genre\": \"drama\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Three men walk into the Zone, three men walk out of the Zone\",\n", + " metadata={\"year\": 1979, \"rating\": 9.9, \"genre\": \"science fiction\"},\n", + " ),\n", + " Document(\n", + " page_content=\"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n", + " metadata={\"year\": 2006, \"genre\": \"thriller\", \"rating\": 9.0},\n", + " ),\n", + " Document(\n", + " page_content=\"Toys come alive and have a blast doing so\",\n", + " metadata={\"year\": 1995, \"genre\": \"animated\", \"rating\": 9.3},\n", + " ),\n", + "]\n", + "\n", + "# Add data to the vector store, which automaticaly embeds the documents\n", + "vector_store.add_documents(docs)" + ] + }, + { + "cell_type": "markdown", + "id": "a5762ee0", + "metadata": {}, + "source": [ + "## Create the Atlas Vector Search index with filters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39a44b99", + "metadata": {}, + "outputs": [], + "source": [ + "# Use LangChain helper method to create the vector search index\n", + "vector_store.create_vector_search_index(\n", + " dimensions = 1024, # The dimensions of the vector embeddings to be indexed\n", + " filters = [ \"genre\", \"rating\", \"year\" ], # The metadata fields to be indexed for filtering\n", + " wait_until_complete = 60 # Number of seconds to wait for the index to build (can take around a minute)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "28f5ae2c", + "metadata": {}, + "source": [ + "## Create the Self-Querying Retriever" + ] + }, + { + "cell_type": "markdown", + "id": "53bfaa67", + "metadata": {}, + "source": [ + "### Define metadata field and document information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e393ed2e", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains.query_constructor.schema import AttributeInfo\n", + "\n", + "# Define the document content description \n", + "document_content_description = \"Brief summary of a movie\"\n", + "\n", + "# Define the metadata fields to filter on\n", + "metadata_field_info = [\n", + " AttributeInfo(\n", + " name=\"genre\",\n", + " description=\"The genre of the movie\",\n", + " type=\"string\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"year\",\n", + " description=\"The year the movie was released\",\n", + " type=\"integer\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"rating\", \n", + " description=\"A 1-10 rating for the movie\", \n", + " type=\"float\"\n", + " ),\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "4bdd30c4", + "metadata": {}, + "source": [ + "### Initialize the self-querying retriever" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "977730d1", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_mongodb.retrievers import MongoDBAtlasSelfQueryRetriever\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "llm = ChatOpenAI(model=\"gpt-4o\")\n", + "retriever = MongoDBAtlasSelfQueryRetriever.from_llm(\n", + " llm=llm,\n", + " vectorstore=vector_store,\n", + " metadata_field_info=metadata_field_info,\n", + " document_contents=document_content_description\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b188c52d", + "metadata": {}, + "source": [ + "## Run Queries with the Self-Querying Retriever" + ] + }, + { + "cell_type": "markdown", + "id": "833d90d9", + "metadata": {}, + "source": [ + "### Queries with filters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9bbc3e6a", + "metadata": {}, + "outputs": [], + "source": [ + "# This example specifies a filter (rating > 9)\n", + "retriever.invoke(\"What are some highly rated movies (above 9)?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a5bd474", + "metadata": {}, + "outputs": [], + "source": [ + "# This example specifies a semantic search and a filter (rating > 9)\n", + "retriever.invoke(\"I want to watch a movie about toys rated higher than 9\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c062276", + "metadata": {}, + "outputs": [], + "source": [ + "# This example specifies a composite filter (rating >= 9 and genre = thriller)\n", + "retriever.invoke(\"What's a highly rated (above or equal 9) thriller film?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb67054b", + "metadata": {}, + "outputs": [], + "source": [ + "# This example specifies a query and composite filter (year > 1990 and year < 2005 and genre = action)\n", + "retriever.invoke(\n", + " \"What's a movie after 1990 but before 2005 that's all about dinosaurs, \" +\n", + " \"and preferably has a lot of action\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "052ee6e4", + "metadata": {}, + "source": [ + "### Query with no filters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3902ca22", + "metadata": {}, + "outputs": [], + "source": [ + "# This example only specifies a semantic search query\n", + "retriever.invoke(\"What are some movies about dinosaurs\")" + ] + }, + { + "cell_type": "markdown", + "id": "937966cf", + "metadata": {}, + "source": [ + "## Use the Retriever in Your RAG Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfa7b140", + "metadata": {}, + "outputs": [], + "source": [ + "import pprint\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.prompts import PromptTemplate\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "llm = ChatOpenAI(model=\"gpt-4o\")\n", + "\n", + "# Configure self-query retriever with a document limit\n", + "retriever = MongoDBAtlasSelfQueryRetriever.from_llm(\n", + " llm=llm,\n", + " vectorstore=vector_store,\n", + " metadata_field_info=metadata_field_info,\n", + " document_contents=document_content_description,\n", + " enable_limit=True\n", + ")\n", + "\n", + "# Define a prompt template\n", + "template = \"\"\"\n", + " Use the following pieces of context to answer the question at the end.\n", + " {context}\n", + " Question: {question}\n", + "\"\"\"\n", + "prompt = PromptTemplate.from_template(template)\n", + "\n", + "# Construct a chain to answer questions on your data\n", + "chain = (\n", + " { \"context\": retriever, \"question\": RunnablePassthrough()}\n", + " | prompt \n", + " | llm\n", + " | StrOutputParser()\n", + ")\n", + "\n", + "# Prompt the chain\n", + "question = \"What are two movies about dinosaurs after 1990?\" # year > 1990 and document limit of 2\n", + "answer = chain.invoke(question)\n", + "\n", + "print(\"Question: \" + question)\n", + "print(\"Answer: \" + answer)\n", + "\n", + "# Return source documents\n", + "documents = retriever.invoke(question)\n", + "print(\"\\nSource documents:\")\n", + "pprint.pprint(documents)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}