Skip to content

feat: Use LlamaParse to parse the private files #167

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/gold-mugs-perform.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"create-llama": patch
---

Add using LlamaParse for private file uploader
27 changes: 8 additions & 19 deletions templates/components/llamaindex/typescript/documents/documents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,9 @@ import {
storageContextFromDefaults,
VectorStoreIndex,
} from "llamaindex";
import { DocxReader } from "llamaindex/readers/DocxReader";
import { PDFReader } from "llamaindex/readers/PDFReader";
import { TextFileReader } from "llamaindex/readers/TextFileReader";
import crypto from "node:crypto";
import { getDataSource } from "../../engine";
import { getExtractors } from "../../engine/loader";

const MIME_TYPE_TO_EXT: Record<string, string> = {
"application/pdf": "pdf",
Expand Down Expand Up @@ -58,23 +56,14 @@ async function runPipeline(
}

async function loadDocuments(fileBuffer: Buffer, mimeType: string) {
console.log(`Processing uploaded document of type: ${mimeType}`);
switch (mimeType) {
case "application/pdf": {
const pdfReader = new PDFReader();
return await pdfReader.loadDataAsContent(new Uint8Array(fileBuffer));
}
case "text/plain": {
const textReader = new TextFileReader();
return await textReader.loadDataAsContent(fileBuffer);
}
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
const docxReader = new DocxReader();
return await docxReader.loadDataAsContent(fileBuffer);
}
default:
throw new Error(`Unsupported document type: ${mimeType}`);
const extractors = getExtractors();
const reader = extractors[MIME_TYPE_TO_EXT[mimeType]];

if (!reader) {
throw new Error(`Unsupported document type: ${mimeType}`);
}
console.log(`Processing uploaded document of type: ${mimeType}`);
return await reader.loadDataAsContent(fileBuffer);
}

async function saveDocument(fileBuffer: Buffer, mimeType: string) {
Expand Down
8 changes: 3 additions & 5 deletions templates/components/loaders/python/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import os
import yaml
import importlib
import logging
from typing import Dict

import yaml
from app.engine.loaders.db import DBLoaderConfig, get_db_documents
from app.engine.loaders.file import FileLoaderConfig, get_file_documents
from app.engine.loaders.web import WebLoaderConfig, get_web_documents
from app.engine.loaders.db import DBLoaderConfig, get_db_documents

logger = logging.getLogger(__name__)

Expand Down
3 changes: 2 additions & 1 deletion templates/components/loaders/python/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ def get_file_documents(config: FileLoaderConfig):
)
return reader.load_data()
except Exception as e:
import sys, traceback
import sys
import traceback

# Catch the error if the data dir is empty
# and return as empty document list
Expand Down
9 changes: 8 additions & 1 deletion templates/components/loaders/typescript/file/loader.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
import { SimpleDirectoryReader } from "llamaindex";
import {
FILE_EXT_TO_READER,
SimpleDirectoryReader,
} from "llamaindex/readers/SimpleDirectoryReader";

export const DATA_DIR = "./data";

/**
 * Extension → reader map used when ingesting files.
 *
 * Returns a shallow copy of llamaindex's FILE_EXT_TO_READER so callers
 * that customize entries (as the llama_parse template variant does) cannot
 * mutate the shared module-level constant.
 */
export function getExtractors() {
  return { ...FILE_EXT_TO_READER };
}

export async function getDocuments() {
return await new SimpleDirectoryReader().loadData({
directoryPath: DATA_DIR,
Expand Down
25 changes: 18 additions & 7 deletions templates/components/loaders/typescript/llama_parse/loader.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,30 @@
import { LlamaParseReader } from "llamaindex/readers/LlamaParseReader";
import {
FILE_EXT_TO_READER,
LlamaParseReader,
SimpleDirectoryReader,
} from "llamaindex";
} from "llamaindex/readers/SimpleDirectoryReader";

export const DATA_DIR = "./data";

/**
 * Extension → reader map with every supported extension routed through
 * LlamaParse, except .txt which needs no parsing.
 *
 * Bug fix: the original assigned `const extractors = FILE_EXT_TO_READER;`
 * and then mutated it, permanently rewriting llamaindex's shared
 * module-level map (and re-applying on every call). Work on a shallow
 * copy instead so the library constant stays untouched.
 */
export function getExtractors() {
  const llamaParseParser = new LlamaParseReader({ resultType: "markdown" });
  const extractors = { ...FILE_EXT_TO_READER };
  for (const key of Object.keys(extractors)) {
    if (key === "txt") {
      continue;
    }
    extractors[key] = llamaParseParser;
  }
  return extractors;
}

export async function getDocuments() {
const reader = new SimpleDirectoryReader();
// Load PDFs using LlamaParseReader
const extractors = getExtractors();
return await reader.loadData({
directoryPath: DATA_DIR,
fileExtToReader: {
...FILE_EXT_TO_READER,
pdf: new LlamaParseReader({ resultType: "markdown" }),
},
fileExtToReader: extractors,
});
}
2 changes: 1 addition & 1 deletion templates/types/streaming/express/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"dotenv": "^16.3.1",
"duck-duck-scrape": "^2.2.5",
"express": "^4.18.2",
"llamaindex": "0.4.6",
"llamaindex": "0.4.12",
"pdf2json": "3.0.5",
"ajv": "^8.12.0",
"@e2b/code-interpreter": "^0.0.5",
Expand Down
9 changes: 5 additions & 4 deletions templates/types/streaming/fastapi/app/api/routers/upload.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import logging
from typing import List

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from fastapi import HTTPException
from fastapi import APIRouter
from app.api.controllers.file import FileController

from app.api.services.file import PrivateFileService

file_upload_router = r = APIRouter()

Expand All @@ -18,7 +19,7 @@ class FileUploadRequest(BaseModel):
def upload_file(request: FileUploadRequest) -> List[str]:
try:
logger.info("Processing file")
return FileController.process_file(request.base64)
return PrivateFileService.process_file(request.base64)
except Exception as e:
logger.error(f"Error processing file: {e}", exc_info=True)
raise HTTPException(status_code=500, detail="Error processing file")
Original file line number Diff line number Diff line change
@@ -1,73 +1,90 @@
import os
import base64
import mimetypes
from uuid import uuid4
import os
from pathlib import Path
from typing import List, Dict
from typing import Dict, List
from uuid import uuid4

from app.engine.index import get_index
from llama_index.core import VectorStoreIndex
from llama_index.readers.file import FlatReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.readers.file.base import (
_try_loading_included_file_formats as get_file_loaders_map,
)
from llama_index.core.readers.file.base import (
default_file_metadata_func,
)
from llama_index.core.schema import Document
from llama_index.core.ingestion import IngestionPipeline
from app.engine.index import get_index

from llama_index.readers.file import FlatReader


def file_metadata_func(*args, **kwargs) -> Dict:
    """Build file metadata with the default helper, then tag it as private.

    Wraps llama_index's ``default_file_metadata_func`` and adds a
    ``"private": "true"`` marker so retrieval can filter uploaded files.
    """
    metadata = default_file_metadata_func(*args, **kwargs)
    metadata["private"] = "true"
    return metadata

def file_loaders_map():

def get_llamaparse_parser():
    """Return a LlamaParse reader if enabled in the file-loader config, else None.

    Imports are local to avoid a hard dependency on the loader package at
    module import time.
    """
    from app.engine.loaders import load_configs
    from app.engine.loaders.file import FileLoaderConfig, llama_parse_parser

    file_config = FileLoaderConfig(**load_configs()["file"])
    if not file_config.use_llama_parse:
        return None
    return llama_parse_parser()

Comment on lines +27 to +36
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Enhance error handling in get_llamaparse_parser.

The function dynamically decides which parser to use based on configurations. Consider adding error handling for configuration loading and validation to ensure robustness.

+    try:
+        config = load_configs()
+        file_loader_config = FileLoaderConfig(**config["file"])
+    except KeyError as e:
+        raise ConfigurationError(f"Missing configuration key: {e}")
Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def get_llamaparse_parser():
from app.engine.loaders import load_configs
from app.engine.loaders.file import FileLoaderConfig, llama_parse_parser
config = load_configs()
file_loader_config = FileLoaderConfig(**config["file"])
if file_loader_config.use_llama_parse:
return llama_parse_parser()
else:
return None
def get_llamaparse_parser():
from app.engine.loaders import load_configs
from app.engine.loaders.file import FileLoaderConfig, llama_parse_parser
try:
config = load_configs()
file_loader_config = FileLoaderConfig(**config["file"])
except KeyError as e:
raise ConfigurationError(f"Missing configuration key: {e}")
if file_loader_config.use_llama_parse:
return llama_parse_parser()
else:
return None


def default_file_loaders_map():
    """Default extension → reader-class map, extended with a plain-text reader.

    Starts from llama_index's bundled file-format loaders and registers
    FlatReader for ``.txt`` files, which the default map does not cover.
    """
    loaders = get_file_loaders_map()
    loaders[".txt"] = FlatReader
    return loaders



class FileController:

PRIVATE_STORE_PATH="output/uploaded"
class PrivateFileService:
PRIVATE_STORE_PATH = "output/uploaded"

@staticmethod
def preprocess_base64_file(base64_content: str) -> tuple:
    """Split a data-URL payload into raw bytes and a guessed file extension.

    Expects ``base64_content`` shaped like ``data:<mime>;base64,<payload>``.
    Returns ``(decoded_bytes, extension)``.
    """
    header, payload = base64_content.split(",", 1)
    # Header looks like "data:application/pdf;base64" — pull out the MIME type.
    mime_type = header.split(";")[0].split(":", 1)[1]
    # NOTE(review): guess_extension may return None for unknown MIME types —
    # confirm downstream callers handle that before building a file name.
    extension = mimetypes.guess_extension(mime_type)
    return base64.b64decode(payload), extension

@staticmethod
def store_and_parse_file(file_data, extension) -> List[Document]:
    """Persist uploaded bytes to the private store and parse them into Documents.

    Writes ``file_data`` under PRIVATE_STORE_PATH with a random name, then
    parses it with LlamaParse when enabled, falling back to the default
    per-extension reader classes otherwise.

    Raises:
        ValueError: if LlamaParse is disabled and no default reader exists
            for ``extension``.
    """
    # Ensure the private upload directory exists.
    os.makedirs(PrivateFileService.PRIVATE_STORE_PATH, exist_ok=True)

    # Random file name so concurrent uploads never collide.
    file_name = f"{uuid4().hex}{extension}"
    file_path = Path(os.path.join(PrivateFileService.PRIVATE_STORE_PATH, file_name))

    with open(file_path, "wb") as f:
        f.write(file_data)

    # Prefer LlamaParse when enabled; otherwise pick the default reader
    # class registered for this extension.
    reader = get_llamaparse_parser()
    if reader is None:
        reader_cls = default_file_loaders_map().get(extension)
        if reader_cls is None:
            raise ValueError(f"File extension {extension} is not supported")
        reader = reader_cls()

    documents = reader.load_data(file_path)
    # Tag every parsed document as private so retrieval can filter on it.
    for doc in documents:
        doc.metadata["private"] = "true"
    return documents

@staticmethod
def process_file(base64_content: str) -> List[str]:
file_data, extension = FileController.preprocess_base64_file(base64_content)
documents = FileController.store_and_parse_file(file_data, extension)
file_data, extension = PrivateFileService.preprocess_base64_file(base64_content)
documents = PrivateFileService.store_and_parse_file(file_data, extension)

# Only process the nodes; do not store the index
pipeline = IngestionPipeline()
Expand Down
2 changes: 1 addition & 1 deletion templates/types/streaming/nextjs/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"duck-duck-scrape": "^2.2.5",
"formdata-node": "^6.0.3",
"got": "^14.4.1",
"llamaindex": "0.4.6",
"llamaindex": "0.4.12",
"lucide-react": "^0.294.0",
"next": "^14.2.4",
"react": "^18.2.0",
Expand Down
Loading