pamelafox
diff --git a/‎README.md
+1-1 b/‎README.md
+1-1
diff --git a/‎data/Aphideater_hoverfly.pdf
255 KB b/‎data/Aphideater_hoverfly.pdf
255 KB
diff --git a/‎rag_hybrid.py renamed to ‎rag_documents_hybrid.py b/‎rag_hybrid.py renamed to ‎rag_documents_hybrid.py
diff --git a/‎rag_documents_ingestion.py
+4-3 b/‎rag_documents_ingestion.py
+4-3
@@ -23,7 +23,7 @@ These scripts for RAG:
 * [`rag_queryrewrite.py`](./rag_queryrewrite.py): Adds a query rewriting step to the RAG process, where the user's question is rewritten to improve the retrieval results.
 * [`rag_documents_ingestion.py`](./rag_documents_ingestion.py): Ingests PDFs by converting them to Markdown with pymupdf, splitting the text into chunks with Langchain, embedding the chunks with OpenAI, and finally storing the results in a local JSON file.
 * [`rag_documents_flow.py`](./rag_documents_flow.py): A RAG flow that retrieves matching results from the local JSON file created by `rag_documents_ingestion.py`.
-* [`rag_hybrid.py`](./rag_hybrid.py): A RAG flow that implements a hybrid retrieval with both vector and keyword search, merging with Reciprocal Rank Fusion (RRF), and semantic re-ranking with a cross-encoder model.
+* [`rag_documents_hybrid.py`](./rag_documents_hybrid.py): A RAG flow that implements a hybrid retrieval with both vector and keyword search, merging with Reciprocal Rank Fusion (RRF), and semantic re-ranking with a cross-encoder model.
 
 ## Setting up the environment
 
 
@@ -1,5 +1,6 @@
 import json
 import os
+import pathlib
 
 import azure.identity
 import openai
@@ -34,12 +35,12 @@
     client = openai.OpenAI(api_key=os.environ["OPENAI_KEY"])
     MODEL_NAME = os.environ["OPENAI_MODEL"]
 
-
-filenames = ["data/California_carpenter_bee.pdf", "data/Centris_pallida.pdf", "data/Western_honey_bee.pdf"]
+data_dir = pathlib.Path(os.path.dirname(__file__)) / "data"
+filenames = ["California_carpenter_bee.pdf", "Centris_pallida.pdf", "Western_honey_bee.pdf", "Aphideater_hoverfly.pdf"]
 all_chunks = []
 for filename in filenames:
     # Extract text from the PDF file
-    md_text = pymupdf4llm.to_markdown(filename)
+    md_text = pymupdf4llm.to_markdown(data_dir / filename)
 
     # Split the text into smaller chunks
     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(