Skip to content

Commit bd4714c

Browse files
feat: add filter for query in ts templates (#172)
--------- Co-authored-by: Marcus Schiesser <[email protected]>
1 parent 455ab68 commit bd4714c

File tree

16 files changed

+164
-106
lines changed

16 files changed

+164
-106
lines changed

.changeset/curvy-penguins-work.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"create-llama": patch
3+
---
4+
5+
Filter private documents for TypeScript (using MetadataFilters) and update to LlamaIndexTS 0.5.6

templates/components/engines/typescript/agent/chat.ts

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
1-
import { BaseToolWithCall, OpenAIAgent, QueryEngineTool } from "llamaindex";
1+
import {
2+
BaseToolWithCall,
3+
MetadataFilter,
4+
MetadataFilters,
5+
OpenAIAgent,
6+
QueryEngineTool,
7+
} from "llamaindex";
28
import fs from "node:fs/promises";
39
import path from "node:path";
410
import { getDataSource } from "./index";
@@ -14,7 +20,7 @@ export async function createChatEngine(documentIds?: string[]) {
1420
tools.push(
1521
new QueryEngineTool({
1622
queryEngine: index.asQueryEngine({
17-
preFilters: undefined, // TODO: Add filters once LITS supports it (getQueryFilters)
23+
preFilters: generateFilters(documentIds || []),
1824
}),
1925
metadata: {
2026
name: "data_query_engine",
@@ -41,3 +47,27 @@ export async function createChatEngine(documentIds?: string[]) {
4147
systemPrompt: process.env.SYSTEM_PROMPT,
4248
});
4349
}
50+
51+
function generateFilters(documentIds: string[]): MetadataFilters | undefined {
52+
// public documents don't have the "private" field or it's set to "false"
53+
const publicDocumentsFilter: MetadataFilter = {
54+
key: "private",
55+
value: ["true"],
56+
operator: "nin",
57+
};
58+
59+
// if no documentIds are provided, only retrieve information from public documents
60+
if (!documentIds.length) return { filters: [publicDocumentsFilter] };
61+
62+
const privateDocumentsFilter: MetadataFilter = {
63+
key: "doc_id",
64+
value: documentIds,
65+
operator: "in",
66+
};
67+
68+
// if documentIds are provided, retrieve information from public and private documents
69+
return {
70+
filters: [publicDocumentsFilter, privateDocumentsFilter],
71+
condition: "or",
72+
};
73+
}

templates/components/llamaindex/typescript/streaming/annotations.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@ export function retrieveDocumentIds(annotations?: JSONValue[]): string[] {
3535
) {
3636
const files = data.files as DocumentFile[];
3737
for (const file of files) {
38-
if (Array.isArray(file.content)) {
38+
if (Array.isArray(file.content.value)) {
3939
// it's an array, so it's an array of doc IDs
40-
for (const id of file.content) {
40+
for (const id of file.content.value) {
4141
ids.push(id);
4242
}
4343
}

templates/components/llamaindex/typescript/streaming/events.ts

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,18 @@ import {
88
} from "llamaindex";
99
import { LLamaCloudFileService } from "./service";
1010

11-
export async function appendSourceData(
11+
export function appendSourceData(
1212
data: StreamData,
1313
sourceNodes?: NodeWithScore<Metadata>[],
1414
) {
1515
if (!sourceNodes?.length) return;
1616
try {
17-
const nodes = await Promise.all(
18-
sourceNodes.map(async (node) => ({
19-
...node.node.toMutableJSON(),
20-
id: node.node.id_,
21-
score: node.score ?? null,
22-
url: await getNodeUrl(node.node.metadata),
23-
})),
24-
);
17+
const nodes = sourceNodes.map((node) => ({
18+
...node.node.toMutableJSON(),
19+
id: node.node.id_,
20+
score: node.score ?? null,
21+
url: getNodeUrl(node.node.metadata),
22+
}));
2523
data.appendMessageAnnotation({
2624
type: "sources",
2725
data: {
@@ -76,18 +74,19 @@ export function createStreamTimeout(stream: StreamData) {
7674
export function createCallbackManager(stream: StreamData) {
7775
const callbackManager = new CallbackManager();
7876

79-
callbackManager.on("retrieve-end", async (data) => {
80-
const { nodes, query } = data.detail.payload;
81-
await appendSourceData(stream, nodes);
77+
callbackManager.on("retrieve-end", (data) => {
78+
const { nodes, query } = data.detail;
79+
appendSourceData(stream, nodes);
8280
appendEventData(stream, `Retrieving context for query: '${query}'`);
8381
appendEventData(
8482
stream,
8583
`Retrieved ${nodes.length} sources to use as context for the query`,
8684
);
85+
LLamaCloudFileService.downloadFiles(nodes); // don't await to avoid blocking chat streaming
8786
});
8887

8988
callbackManager.on("llm-tool-call", (event) => {
90-
const { name, input } = event.detail.payload.toolCall;
89+
const { name, input } = event.detail.toolCall;
9190
const inputString = Object.entries(input)
9291
.map(([key, value]) => `${key}: ${value}`)
9392
.join(", ");
@@ -98,14 +97,14 @@ export function createCallbackManager(stream: StreamData) {
9897
});
9998

10099
callbackManager.on("llm-tool-result", (event) => {
101-
const { toolCall, toolResult } = event.detail.payload;
100+
const { toolCall, toolResult } = event.detail;
102101
appendToolData(stream, toolCall, toolResult);
103102
});
104103

105104
return callbackManager;
106105
}
107106

108-
async function getNodeUrl(metadata: Metadata) {
107+
function getNodeUrl(metadata: Metadata) {
109108
if (!process.env.FILESERVER_URL_PREFIX) {
110109
console.warn(
111110
"FILESERVER_URL_PREFIX is not set. File URLs will not be generated.",
@@ -114,13 +113,11 @@ async function getNodeUrl(metadata: Metadata) {
114113
const fileName = metadata["file_name"];
115114
if (fileName && process.env.FILESERVER_URL_PREFIX) {
116115
// file_name exists and file server is configured
117-
const isLocalFile = metadata["is_local_file"] === "true";
118116
const pipelineId = metadata["pipeline_id"];
119-
if (pipelineId && !isLocalFile) {
117+
if (pipelineId && metadata["private"] == null) {
120118
// file is from LlamaCloud and was not ingested locally
121-
// TODO trigger but don't await file download and just use convention to generate the URL (see Python code)
122-
// return `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${pipelineId}\$${fileName}`;
123-
return await LLamaCloudFileService.getFileUrl(fileName, pipelineId);
119+
const name = LLamaCloudFileService.toDownloadedName(pipelineId, fileName);
120+
return `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${name}`;
124121
}
125122
const isPrivate = metadata["private"] === "true";
126123
const folder = isPrivate ? "output/uploaded" : "data";
Lines changed: 85 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,86 +1,76 @@
1+
import { Metadata, NodeWithScore } from "llamaindex";
12
import fs from "node:fs";
23
import https from "node:https";
34
import path from "node:path";
45

56
const LLAMA_CLOUD_OUTPUT_DIR = "output/llamacloud";
67
const LLAMA_CLOUD_BASE_URL = "https://cloud.llamaindex.ai/api/v1";
8+
const FILE_DELIMITER = "$"; // delimiter between pipelineId and filename
79

8-
export interface LlamaCloudFile {
10+
interface LlamaCloudFile {
911
name: string;
1012
file_id: string;
1113
project_id: string;
1214
}
1315

1416
export class LLamaCloudFileService {
15-
static async getFiles(pipelineId: string): Promise<LlamaCloudFile[]> {
16-
const url = `${LLAMA_CLOUD_BASE_URL}/pipelines/${pipelineId}/files`;
17-
const headers = {
18-
Accept: "application/json",
19-
Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
20-
};
21-
const response = await fetch(url, { method: "GET", headers });
22-
const data = await response.json();
23-
return data;
17+
public static async downloadFiles(nodes: NodeWithScore<Metadata>[]) {
18+
const files = this.nodesToDownloadFiles(nodes);
19+
if (!files.length) return;
20+
console.log("Downloading files from LlamaCloud...");
21+
for (const file of files) {
22+
await this.downloadFile(file.pipelineId, file.fileName);
23+
}
2424
}
2525

26-
static async getFileDetail(
27-
projectId: string,
28-
fileId: string,
29-
): Promise<{ url: string }> {
30-
const url = `${LLAMA_CLOUD_BASE_URL}/files/${fileId}/content?project_id=${projectId}`;
31-
const headers = {
32-
Accept: "application/json",
33-
Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
34-
};
35-
const response = await fetch(url, { method: "GET", headers });
36-
const data = (await response.json()) as { url: string };
37-
return data;
26+
public static toDownloadedName(pipelineId: string, fileName: string) {
27+
return `${pipelineId}${FILE_DELIMITER}${fileName}`;
3828
}
3929

40-
static async getFileUrl(
41-
name: string,
42-
pipelineId: string,
43-
): Promise<string | null> {
44-
try {
45-
const files = await this.getFiles(pipelineId);
46-
for (const file of files) {
47-
if (file.name === name) {
48-
const fileId = file.file_id;
49-
const projectId = file.project_id;
50-
const fileDetail = await this.getFileDetail(projectId, fileId);
51-
const localFileUrl = this.downloadFile(fileDetail.url, fileId, name);
52-
return localFileUrl;
53-
}
30+
/**
31+
* This function will return an array of unique files to download from LlamaCloud
32+
* We only download files that are uploaded directly in LlamaCloud datasources (don't have `private` in metadata)
33+
* Files uploaded directly in LlamaCloud datasources don't have `private` in their metadata (public docs)
34+
* Files uploaded locally via the `generate` command will have `private=false` (public docs)
35+
* Files uploaded via the `/chat/upload` endpoint will have `private=true` (private docs)
36+
*
37+
* @param nodes
38+
* @returns list of unique files to download
39+
*/
40+
private static nodesToDownloadFiles(nodes: NodeWithScore<Metadata>[]) {
41+
const downloadFiles: Array<{
42+
pipelineId: string;
43+
fileName: string;
44+
}> = [];
45+
for (const node of nodes) {
46+
const isLocalFile = node.node.metadata["private"] != null;
47+
const pipelineId = node.node.metadata["pipeline_id"];
48+
const fileName = node.node.metadata["file_name"];
49+
if (isLocalFile || !pipelineId || !fileName) continue;
50+
const isDuplicate = downloadFiles.some(
51+
(f) => f.pipelineId === pipelineId && f.fileName === fileName,
52+
);
53+
if (!isDuplicate) {
54+
downloadFiles.push({ pipelineId, fileName });
5455
}
55-
return null;
56-
} catch (error) {
57-
console.error("Error fetching file from LlamaCloud:", error);
58-
return null;
5956
}
57+
return downloadFiles;
6058
}
6159

62-
static downloadFile(url: string, fileId: string, filename: string) {
63-
const FILE_DELIMITER = "$"; // delimiter between fileId and filename
64-
const downloadedFileName = `${fileId}${FILE_DELIMITER}${filename}`;
65-
const downloadedFilePath = path.join(
66-
LLAMA_CLOUD_OUTPUT_DIR,
67-
downloadedFileName,
68-
);
69-
const urlPrefix = `${process.env.FILESERVER_URL_PREFIX}/${LLAMA_CLOUD_OUTPUT_DIR}`;
70-
const fileUrl = `${urlPrefix}/${downloadedFileName}`;
71-
60+
private static async downloadFile(pipelineId: string, fileName: string) {
7261
try {
62+
const downloadedName = this.toDownloadedName(pipelineId, fileName);
63+
const downloadedPath = path.join(LLAMA_CLOUD_OUTPUT_DIR, downloadedName);
64+
7365
// Check if file already exists
74-
if (fs.existsSync(downloadedFilePath)) return fileUrl;
66+
if (fs.existsSync(downloadedPath)) return;
7567

76-
// Create directory if it doesn't exist
77-
if (!fs.existsSync(LLAMA_CLOUD_OUTPUT_DIR)) {
78-
fs.mkdirSync(LLAMA_CLOUD_OUTPUT_DIR, { recursive: true });
79-
}
68+
const urlToDownload = await this.getFileUrlByName(pipelineId, fileName);
69+
if (!urlToDownload) throw new Error("File not found in LlamaCloud");
8070

81-
const file = fs.createWriteStream(downloadedFilePath);
71+
const file = fs.createWriteStream(downloadedPath);
8272
https
83-
.get(url, (response) => {
73+
.get(urlToDownload, (response) => {
8474
response.pipe(file);
8575
file.on("finish", () => {
8676
file.close(() => {
@@ -89,15 +79,50 @@ export class LLamaCloudFileService {
8979
});
9080
})
9181
.on("error", (err) => {
92-
fs.unlink(downloadedFilePath, () => {
82+
fs.unlink(downloadedPath, () => {
9383
console.error("Error downloading file:", err);
9484
throw err;
9585
});
9686
});
97-
98-
return fileUrl;
9987
} catch (error) {
10088
throw new Error(`Error downloading file from LlamaCloud: ${error}`);
10189
}
10290
}
91+
92+
private static async getFileUrlByName(
93+
pipelineId: string,
94+
name: string,
95+
): Promise<string | null> {
96+
const files = await this.getAllFiles(pipelineId);
97+
const file = files.find((file) => file.name === name);
98+
if (!file) return null;
99+
return await this.getFileUrlById(file.project_id, file.file_id);
100+
}
101+
102+
private static async getFileUrlById(
103+
projectId: string,
104+
fileId: string,
105+
): Promise<string> {
106+
const url = `${LLAMA_CLOUD_BASE_URL}/files/${fileId}/content?project_id=${projectId}`;
107+
const headers = {
108+
Accept: "application/json",
109+
Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
110+
};
111+
const response = await fetch(url, { method: "GET", headers });
112+
const data = (await response.json()) as { url: string };
113+
return data.url;
114+
}
115+
116+
private static async getAllFiles(
117+
pipelineId: string,
118+
): Promise<LlamaCloudFile[]> {
119+
const url = `${LLAMA_CLOUD_BASE_URL}/pipelines/${pipelineId}/files`;
120+
const headers = {
121+
Accept: "application/json",
122+
Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
123+
};
124+
const response = await fetch(url, { method: "GET", headers });
125+
const data = await response.json();
126+
return data;
127+
}
103128
}

templates/components/vectordbs/python/llamacloud/generate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ def generate_datasource():
3030

3131
documents = get_documents()
3232

33-
# Set is_local_file=true to distinguish locally ingested files from LlamaCloud files
33+
# Set private=false to mark the document as public (required for filtering)
3434
for doc in documents:
35-
doc.metadata["is_local_file"] = "true"
35+
doc.metadata["private"] = "false"
3636

3737
LlamaCloudIndex.from_documents(
3838
documents=documents,

templates/components/vectordbs/python/none/generate.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ def generate_datasource():
2121
storage_dir = os.environ.get("STORAGE_DIR", "storage")
2222
# load the documents and create the index
2323
documents = get_documents()
24+
# Set private=false to mark the document as public (required for filtering)
25+
for doc in documents:
26+
doc.metadata["private"] = "false"
2427
index = VectorStoreIndex.from_documents(
2528
documents,
2629
)

templates/components/vectordbs/typescript/llamacloud/generate.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ dotenv.config();
99

1010
async function loadAndIndex() {
1111
const documents = await getDocuments();
12-
// Set is_local_file=true to distinguish locally ingested files from LlamaCloud files
12+
// Set private=false to mark the document as public (required for filtering)
1313
for (const document of documents) {
1414
document.metadata = {
1515
...document.metadata,
16-
is_local_file: "true",
16+
private: "false",
1717
};
1818
}
1919
await getDataSource();

templates/components/vectordbs/typescript/none/generate.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@ async function generateDatasource() {
2525
persistDir: STORAGE_CACHE_DIR,
2626
});
2727
const documents = await getDocuments();
28+
// Set private=false to mark the document as public (required for filtering)
29+
documents.forEach((doc) => {
30+
doc.metadata["private"] = "false";
31+
});
32+
2833
await VectorStoreIndex.fromDocuments(documents, {
2934
storageContext,
3035
});

templates/types/streaming/express/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
"dotenv": "^16.3.1",
2121
"duck-duck-scrape": "^2.2.5",
2222
"express": "^4.18.2",
23-
"llamaindex": "0.4.14",
23+
"llamaindex": "0.5.6",
2424
"pdf2json": "3.0.5",
2525
"ajv": "^8.12.0",
2626
"@e2b/code-interpreter": "^0.0.5",

0 commit comments

Comments
 (0)