run-llama · marcusschiesser · Jul 12, 2024 · Jun 24, 2024 · Jun 24, 2024 · Jun 26, 2024
diff --git a/.changeset/late-weeks-sneeze.md b/.changeset/late-weeks-sneeze.md
@@ -0,0 +1,5 @@
+---
+"create-llama": patch
+---
+
+Support uploading document files: pdf, docx, txt
diff --git a/helpers/index.ts b/helpers/index.ts
@@ -8,6 +8,7 @@ import { writeLoadersConfig } from "./datasources";
 import { createBackendEnvFile, createFrontendEnvFile } from "./env-variables";
 import { PackageManager } from "./get-pkg-manager";
 import { installLlamapackProject } from "./llama-pack";
+import { makeDir } from "./make-dir";
 import { isHavingPoetryLockFile, tryPoetryRun } from "./poetry";
 import { installPythonTemplate } from "./python";
 import { downloadAndExtractRepo } from "./repo";
@@ -175,9 +176,10 @@ export const installTemplate = async (
       }
     }
 
-    // Create tool-output directory
+    // Create outputs directory
     if (props.tools && props.tools.length > 0) {
-      await fsExtra.mkdir(path.join(props.root, "tool-output"));
+      await makeDir(path.join(props.root, "output/tools"));
+      await makeDir(path.join(props.root, "output/uploaded"));
     }
   } else {
     // this is a frontend for a full-stack app, create .env file with model information

diff --git a/helpers/python.ts b/helpers/python.ts
@@ -55,11 +55,11 @@ const getAdditionalDependencies = (
     case "milvus": {
       dependencies.push({
         name: "llama-index-vector-stores-milvus",
-        version: "^0.1.6",
+        version: "^0.1.20",
       });
       dependencies.push({
         name: "pymilvus",
-        version: "2.3.7",
+        version: "2.4.4",
       });
       break;
     }

diff --git a/helpers/typescript.ts b/helpers/typescript.ts
@@ -104,6 +104,12 @@ export const installTSTemplate = async ({
       : path.join("src", "controllers");
   const enginePath = path.join(root, relativeEngineDestPath, "engine");
 
+  // copy llamaindex code for TS templates
+  await copy("**", path.join(root, relativeEngineDestPath, "llamaindex"), {
+    parents: true,
+    cwd: path.join(compPath, "llamaindex", "typescript"),
+  });
+
   // copy vector db component
   if (vectorDb === "llamacloud") {
     console.log(

diff --git a/questions.ts b/questions.ts
@@ -141,7 +141,7 @@ export const getDataSourceChoices = (
   if (selectedDataSource === undefined || selectedDataSource.length === 0) {
     if (template !== "multiagent") {
       choices.push({
-        title: "No data, just a simple chat or agent",
+        title: "No datasource",
         value: "none",
       });
     }

diff --git a/templates/components/engines/python/agent/__init__.py b/templates/components/engines/python/agent/__init__.py
@@ -6,15 +6,17 @@
 from app.engine.index import get_index
 
 
-def get_chat_engine():
+def get_chat_engine(filters=None):
     system_prompt = os.getenv("SYSTEM_PROMPT")
     top_k = os.getenv("TOP_K", "3")
     tools = []
 
     # Add query tool if index exists
     index = get_index()
     if index is not None:
-        query_engine = index.as_query_engine(similarity_top_k=int(top_k))
+        query_engine = index.as_query_engine(
+            similarity_top_k=int(top_k), filters=filters
+        )
         query_engine_tool = QueryEngineTool.from_defaults(query_engine=query_engine)
         tools.append(query_engine_tool)
 

diff --git a/templates/components/engines/python/agent/tools/img_gen.py b/templates/components/engines/python/agent/tools/img_gen.py
@@ -26,7 +26,7 @@ class ImageGeneratorToolOutput(BaseModel):
 
 class ImageGeneratorTool:
     _IMG_OUTPUT_FORMAT = "webp"
-    _IMG_OUTPUT_DIR = "tool-output"
+    _IMG_OUTPUT_DIR = "output/tool"
     _IMG_GEN_API = "https://api.stability.ai/v2beta/stable-image/generate/core"
 
     def __init__(self, api_key: str = None):

diff --git a/templates/components/engines/python/agent/tools/interpreter.py b/templates/components/engines/python/agent/tools/interpreter.py
@@ -27,7 +27,7 @@ class E2BToolOutput(BaseModel):
 
 class E2BCodeInterpreter:
 
-    output_dir = "tool-output"
+    output_dir = "output/tool"
 
     def __init__(self, api_key: str = None):
         if api_key is None:

diff --git a/templates/components/engines/python/chat/__init__.py b/templates/components/engines/python/chat/__init__.py
@@ -3,7 +3,7 @@
 from fastapi import HTTPException
 
 
-def get_chat_engine():
+def get_chat_engine(filters=None):
     system_prompt = os.getenv("SYSTEM_PROMPT")
     top_k = os.getenv("TOP_K", 3)
 
@@ -20,4 +20,5 @@ def get_chat_engine():
         similarity_top_k=int(top_k),
         system_prompt=system_prompt,
         chat_mode="condense_plus_context",
+        filters=filters,
     )
diff --git a/templates/components/engines/typescript/agent/chat.ts b/templates/components/engines/typescript/agent/chat.ts
@@ -4,7 +4,7 @@ import path from "node:path";
 import { getDataSource } from "./index";
 import { createTools } from "./tools";
 
-export async function createChatEngine() {
+export async function createChatEngine(documentIds?: string[]) {
   const tools: BaseToolWithCall[] = [];
 
   // Add a query engine tool if we have a data source
@@ -13,7 +13,9 @@ export async function createChatEngine() {
   if (index) {
     tools.push(
       new QueryEngineTool({
-        queryEngine: index.asQueryEngine(),
+        queryEngine: index.asQueryEngine({
+          preFilters: undefined, // TODO: Add filters once LITS supports it (getQueryFilters)
+        }),
         metadata: {
           name: "data_query_engine",
           description: `A query engine for documents from your data source.`,

diff --git a/templates/components/engines/typescript/agent/tools/img-gen.ts b/templates/components/engines/typescript/agent/tools/img-gen.ts
@@ -37,7 +37,7 @@ const DEFAULT_META_DATA: ToolMetadata<JSONSchemaType<ImgGeneratorParameter>> = {
 
 export class ImgGeneratorTool implements BaseTool<ImgGeneratorParameter> {
   readonly IMG_OUTPUT_FORMAT = "webp";
-  readonly IMG_OUTPUT_DIR = "tool-output";
+  readonly IMG_OUTPUT_DIR = "output/tool";
   readonly IMG_GEN_API =
     "https://api.stability.ai/v2beta/stable-image/generate/core";
 

diff --git a/templates/components/engines/typescript/agent/tools/interpreter.ts b/templates/components/engines/typescript/agent/tools/interpreter.ts
@@ -56,7 +56,7 @@ const DEFAULT_META_DATA: ToolMetadata<JSONSchemaType<InterpreterParameter>> = {
 };
 
 export class InterpreterTool implements BaseTool<InterpreterParameter> {
-  private readonly outputDir = "tool-output";
+  private readonly outputDir = "output/tool";
   private apiKey?: string;
   private fileServerURLPrefix?: string;
   metadata: ToolMetadata<JSONSchemaType<InterpreterParameter>>;

diff --git a/templates/components/engines/typescript/chat/chat.ts b/templates/components/engines/typescript/chat/chat.ts
@@ -1,7 +1,7 @@
 import { ContextChatEngine, Settings } from "llamaindex";
 import { getDataSource } from "./index";
 
-export async function createChatEngine() {
+export async function createChatEngine(documentIds?: string[]) {
   const index = await getDataSource();
   if (!index) {
     throw new Error(

diff --git a/templates/components/llamaindex/typescript/documents/documents.ts b/templates/components/llamaindex/typescript/documents/documents.ts
@@ -0,0 +1,115 @@
+import fs from "fs";
+import {
+  BaseNode,
+  Document,
+  IngestionPipeline,
+  Metadata,
+  Settings,
+  SimpleNodeParser,
+  storageContextFromDefaults,
+  VectorStoreIndex,
+} from "llamaindex";
+import { DocxReader } from "llamaindex/readers/DocxReader";
+import { PDFReader } from "llamaindex/readers/PDFReader";
+import { TextFileReader } from "llamaindex/readers/TextFileReader";
+import crypto from "node:crypto";
+import { getDataSource } from "../../engine";
+
+// Lookup from each supported upload MIME type to the file extension used
+// when the uploaded file is written to disk. A MIME type missing from this
+// table is treated as unsupported.
+const MIME_TYPE_TO_EXT: Record<string, string> = {
+  "application/pdf": "pdf",
+  "text/plain": "txt",
+  "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+    "docx",
+};
+
+/**
+ * Ingests a file uploaded as a base64 data URL: parses it into documents,
+ * saves the raw bytes to disk, and runs the ingestion pipeline on the result.
+ *
+ * @param raw - Data URL of the form `data:<mime-type>[;param...];base64,<payload>`.
+ * @returns The ids of the documents created from the uploaded file.
+ * @throws Error if `raw` has no `,` separating header and payload, or if the
+ *   MIME type is not supported.
+ */
+export async function uploadDocument(raw: string): Promise<string[]> {
+  const separatorIndex = raw.indexOf(",");
+  if (separatorIndex === -1) {
+    // Without this guard the payload would be undefined and Buffer.from would
+    // fail with an unrelated-looking TypeError.
+    throw new Error(
+      "Invalid upload: expected a data URL with a base64 payload",
+    );
+  }
+  // The header may carry extra parameters (e.g. `;charset=utf-8`) ahead of
+  // `;base64`; only the leading media type selects the reader.
+  const header = raw.slice(0, separatorIndex).replace(/^data:/, "");
+  const mimeType = (header.split(";")[0] ?? "").trim().toLowerCase();
+  const fileBuffer = Buffer.from(raw.slice(separatorIndex + 1), "base64");
+  // Load first so an unsupported type fails before anything is written to disk.
+  const documents = await loadDocuments(fileBuffer, mimeType);
+  const { filename } = await saveDocument(fileBuffer, mimeType);
+  return await runPipeline(documents, filename);
+}
+
+/**
+ * Tags the given documents as private uploads, splits and embeds them, and
+ * adds the resulting nodes to the vector store.
+ *
+ * @param documents - Documents loaded from the uploaded file.
+ * @param filename - Name under which the uploaded file was saved; recorded in
+ *   each document's metadata as `file_name`.
+ * @returns The ids of the given documents.
+ */
+async function runPipeline(
+  documents: Document[],
+  filename: string,
+): Promise<string[]> {
+  // Flag every uploaded document as private and remember its stored file name.
+  documents.forEach((doc) => {
+    doc.metadata = { ...doc.metadata, file_name: filename, private: true };
+  });
+
+  const nodeParser = new SimpleNodeParser({
+    chunkSize: Settings.chunkSize,
+    chunkOverlap: Settings.chunkOverlap,
+  });
+  const ingestion = new IngestionPipeline({
+    transformations: [nodeParser, Settings.embedModel],
+  });
+
+  const parsedNodes = await ingestion.run({ documents });
+  await addNodesToVectorStore(parsedNodes);
+
+  return documents.map(({ id_ }) => id_);
+}
+
+/**
+ * Parses an uploaded file into documents using the reader that matches its
+ * MIME type (PDF, plain text, or DOCX).
+ *
+ * @param fileBuffer - Raw bytes of the uploaded file.
+ * @param mimeType - MIME type taken from the upload's data URL header.
+ * @returns The documents produced by the matching reader.
+ * @throws Error if no reader handles the given MIME type.
+ */
+async function loadDocuments(fileBuffer: Buffer, mimeType: string) {
+  console.log(`Processing uploaded document of type: ${mimeType}`);
+
+  if (mimeType === "application/pdf") {
+    // The PDF reader is handed a Uint8Array copy rather than the Buffer itself.
+    const pdfBytes = new Uint8Array(fileBuffer);
+    return await new PDFReader().loadDataAsContent(pdfBytes);
+  }
+
+  if (mimeType === "text/plain") {
+    return await new TextFileReader().loadDataAsContent(fileBuffer);
+  }
+
+  if (
+    mimeType ===
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+  ) {
+    return await new DocxReader().loadDataAsContent(fileBuffer);
+  }
+
+  throw new Error(`Unsupported document type: ${mimeType}`);
+}
+
+/**
+ * Writes an uploaded file to `output/uploaded` under a random UUID-based name.
+ *
+ * @param fileBuffer - Raw bytes of the uploaded file.
+ * @param mimeType - MIME type of the upload; selects the file extension.
+ * @returns The generated file name, its path relative to the working
+ *   directory, and the URL built from `FILESERVER_URL_PREFIX`.
+ * @throws Error if the MIME type has no known file extension.
+ */
+async function saveDocument(fileBuffer: Buffer, mimeType: string) {
+  const fileExt = MIME_TYPE_TO_EXT[mimeType];
+  if (!fileExt) throw new Error(`Unsupported document type: ${mimeType}`);
+
+  const folder = "output/uploaded";
+  const filename = `${crypto.randomUUID()}.${fileExt}`;
+  const filepath = `${folder}/${filename}`;
+  // Fall back to a root-relative URL rather than interpolating the string
+  // "undefined" when FILESERVER_URL_PREFIX is not configured.
+  const urlPrefix = process.env.FILESERVER_URL_PREFIX ?? "";
+  const fileurl = `${urlPrefix}/${filepath}`;
+
+  // Recursive mkdir succeeds when the folder already exists, so no separate
+  // existence check is needed and no synchronous fs call blocks the event loop.
+  await fs.promises.mkdir(folder, { recursive: true });
+  await fs.promises.writeFile(filepath, fileBuffer);
+
+  console.log(`Saved document file to ${filepath}.\nURL: ${fileurl}`);
+  return {
+    filename,
+    filepath,
+    fileurl,
+  };
+}
+
+/**
+ * Adds the given nodes to the current index and persists its document store.
+ *
+ * If `getDataSource()` returns an index, the nodes are inserted into it.
+ * Otherwise a new `VectorStoreIndex` is created from the nodes with a storage
+ * context persisted under `./cache`.
+ *
+ * @param nodes - Nodes produced by the ingestion pipeline.
+ */
+async function addNodesToVectorStore(nodes: BaseNode<Metadata>[]) {
+  let currentIndex = await getDataSource(); // never null when a vector DB is used
+  if (currentIndex) {
+    await currentIndex.insertNodes(nodes);
+  } else {
+    // Not using a vector DB and no local index has been generated yet
+    const storageContext = await storageContextFromDefaults({
+      persistDir: "./cache",
+    });
+    currentIndex = await VectorStoreIndex.init({ nodes, storageContext });
+  }
+  // persist() may be asynchronous; await it so the success message is only
+  // logged once the write has finished and any failure reaches the caller
+  // instead of becoming an unhandled rejection.
+  await currentIndex.storageContext.docStore.persist();
+  console.log("Added nodes to the vector store.");
+}