Skip to content

Commit 58e6c15

Browse files
authored
feat: Use LlamaParse to parse the private files (#167)
1 parent e57e981 commit 58e6c15

File tree

10 files changed

+90
-61
lines changed

10 files changed

+90
-61
lines changed

.changeset/gold-mugs-perform.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"create-llama": patch
3+
---
4+
5+
Use LlamaParse for the private file uploader

templates/components/llamaindex/typescript/documents/documents.ts

Lines changed: 8 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,9 @@ import {
99
storageContextFromDefaults,
1010
VectorStoreIndex,
1111
} from "llamaindex";
12-
import { DocxReader } from "llamaindex/readers/DocxReader";
13-
import { PDFReader } from "llamaindex/readers/PDFReader";
14-
import { TextFileReader } from "llamaindex/readers/TextFileReader";
1512
import crypto from "node:crypto";
1613
import { getDataSource } from "../../engine";
14+
import { getExtractors } from "../../engine/loader";
1715

1816
const MIME_TYPE_TO_EXT: Record<string, string> = {
1917
"application/pdf": "pdf",
@@ -58,23 +56,14 @@ async function runPipeline(
5856
}
5957

6058
async function loadDocuments(fileBuffer: Buffer, mimeType: string) {
61-
console.log(`Processing uploaded document of type: ${mimeType}`);
62-
switch (mimeType) {
63-
case "application/pdf": {
64-
const pdfReader = new PDFReader();
65-
return await pdfReader.loadDataAsContent(new Uint8Array(fileBuffer));
66-
}
67-
case "text/plain": {
68-
const textReader = new TextFileReader();
69-
return await textReader.loadDataAsContent(fileBuffer);
70-
}
71-
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
72-
const docxReader = new DocxReader();
73-
return await docxReader.loadDataAsContent(fileBuffer);
74-
}
75-
default:
76-
throw new Error(`Unsupported document type: ${mimeType}`);
59+
const extractors = getExtractors();
60+
const reader = extractors[MIME_TYPE_TO_EXT[mimeType]];
61+
62+
if (!reader) {
63+
throw new Error(`Unsupported document type: ${mimeType}`);
7764
}
65+
console.log(`Processing uploaded document of type: ${mimeType}`);
66+
return await reader.loadDataAsContent(fileBuffer);
7867
}
7968

8069
async function saveDocument(fileBuffer: Buffer, mimeType: string) {

templates/components/loaders/python/__init__.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
1-
import os
2-
import yaml
3-
import importlib
41
import logging
5-
from typing import Dict
2+
3+
import yaml
4+
from app.engine.loaders.db import DBLoaderConfig, get_db_documents
65
from app.engine.loaders.file import FileLoaderConfig, get_file_documents
76
from app.engine.loaders.web import WebLoaderConfig, get_web_documents
8-
from app.engine.loaders.db import DBLoaderConfig, get_db_documents
97

108
logger = logging.getLogger(__name__)
119

templates/components/loaders/python/file.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ def get_file_documents(config: FileLoaderConfig):
6262
)
6363
return reader.load_data()
6464
except Exception as e:
65-
import sys, traceback
65+
import sys
66+
import traceback
6667

6768
# Catch the error if the data dir is empty
6869
# and return as empty document list

templates/components/loaders/typescript/file/loader.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
1-
import { SimpleDirectoryReader } from "llamaindex";
1+
import {
2+
FILE_EXT_TO_READER,
3+
SimpleDirectoryReader,
4+
} from "llamaindex/readers/SimpleDirectoryReader";
25

36
export const DATA_DIR = "./data";
47

8+
export function getExtractors() {
9+
return FILE_EXT_TO_READER;
10+
}
11+
512
export async function getDocuments() {
613
return await new SimpleDirectoryReader().loadData({
714
directoryPath: DATA_DIR,
Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,30 @@
1+
import { LlamaParseReader } from "llamaindex/readers/LlamaParseReader";
12
import {
23
FILE_EXT_TO_READER,
3-
LlamaParseReader,
44
SimpleDirectoryReader,
5-
} from "llamaindex";
5+
} from "llamaindex/readers/SimpleDirectoryReader";
66

77
export const DATA_DIR = "./data";
88

9+
export function getExtractors() {
10+
const llamaParseParser = new LlamaParseReader({ resultType: "markdown" });
11+
const extractors = FILE_EXT_TO_READER;
12+
// Change all the supported extractors to LlamaParse
13+
// except for .txt, it doesn't need to be parsed
14+
for (const key in extractors) {
15+
if (key === "txt") {
16+
continue;
17+
}
18+
extractors[key] = llamaParseParser;
19+
}
20+
return extractors;
21+
}
22+
923
export async function getDocuments() {
1024
const reader = new SimpleDirectoryReader();
11-
// Load PDFs using LlamaParseReader
25+
const extractors = getExtractors();
1226
return await reader.loadData({
1327
directoryPath: DATA_DIR,
14-
fileExtToReader: {
15-
...FILE_EXT_TO_READER,
16-
pdf: new LlamaParseReader({ resultType: "markdown" }),
17-
},
28+
fileExtToReader: extractors,
1829
});
1930
}

templates/types/streaming/express/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
"dotenv": "^16.3.1",
2121
"duck-duck-scrape": "^2.2.5",
2222
"express": "^4.18.2",
23-
"llamaindex": "0.4.6",
23+
"llamaindex": "0.4.12",
2424
"pdf2json": "3.0.5",
2525
"ajv": "^8.12.0",
2626
"@e2b/code-interpreter": "^0.0.5",

templates/types/streaming/fastapi/app/api/routers/upload.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import logging
22
from typing import List
3+
4+
from fastapi import APIRouter, HTTPException
35
from pydantic import BaseModel
4-
from fastapi import HTTPException
5-
from fastapi import APIRouter
6-
from app.api.controllers.file import FileController
6+
7+
from app.api.services.file import PrivateFileService
78

89
file_upload_router = r = APIRouter()
910

@@ -18,7 +19,7 @@ class FileUploadRequest(BaseModel):
1819
def upload_file(request: FileUploadRequest) -> List[str]:
1920
try:
2021
logger.info("Processing file")
21-
return FileController.process_file(request.base64)
22+
return PrivateFileService.process_file(request.base64)
2223
except Exception as e:
2324
logger.error(f"Error processing file: {e}", exc_info=True)
2425
raise HTTPException(status_code=500, detail="Error processing file")

templates/types/streaming/fastapi/app/api/controllers/file.py renamed to templates/types/streaming/fastapi/app/api/services/file.py

Lines changed: 39 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,73 +1,90 @@
1-
import os
21
import base64
32
import mimetypes
4-
from uuid import uuid4
3+
import os
54
from pathlib import Path
6-
from typing import List, Dict
5+
from typing import Dict, List
6+
from uuid import uuid4
7+
8+
from app.engine.index import get_index
79
from llama_index.core import VectorStoreIndex
8-
from llama_index.readers.file import FlatReader
10+
from llama_index.core.ingestion import IngestionPipeline
911
from llama_index.core.readers.file.base import (
1012
_try_loading_included_file_formats as get_file_loaders_map,
13+
)
14+
from llama_index.core.readers.file.base import (
1115
default_file_metadata_func,
1216
)
1317
from llama_index.core.schema import Document
14-
from llama_index.core.ingestion import IngestionPipeline
15-
from app.engine.index import get_index
16-
18+
from llama_index.readers.file import FlatReader
1719

1820

1921
def file_metadata_func(*args, **kwargs) -> Dict:
2022
default_meta = default_file_metadata_func(*args, **kwargs)
2123
default_meta["private"] = "true"
2224
return default_meta
2325

24-
def file_loaders_map():
26+
27+
def get_llamaparse_parser():
28+
from app.engine.loaders import load_configs
29+
from app.engine.loaders.file import FileLoaderConfig, llama_parse_parser
30+
31+
config = load_configs()
32+
file_loader_config = FileLoaderConfig(**config["file"])
33+
if file_loader_config.use_llama_parse:
34+
return llama_parse_parser()
35+
else:
36+
return None
37+
38+
39+
def default_file_loaders_map():
2540
default_loaders = get_file_loaders_map()
2641
default_loaders[".txt"] = FlatReader
2742
return default_loaders
2843

2944

30-
31-
class FileController:
32-
33-
PRIVATE_STORE_PATH="output/uploaded"
45+
class PrivateFileService:
46+
PRIVATE_STORE_PATH = "output/uploaded"
3447

3548
@staticmethod
3649
def preprocess_base64_file(base64_content: str) -> tuple:
3750
header, data = base64_content.split(",", 1)
3851
mime_type = header.split(";")[0].split(":", 1)[1]
3952
extension = mimetypes.guess_extension(mime_type)
4053
# File data as bytes
41-
data = base64.b64decode(data)
42-
return data, extension
54+
return base64.b64decode(data), extension
4355

4456
@staticmethod
4557
def store_and_parse_file(file_data, extension) -> List[Document]:
4658
# Store file to the private directory
47-
os.makedirs(FileController.PRIVATE_STORE_PATH, exist_ok=True)
59+
os.makedirs(PrivateFileService.PRIVATE_STORE_PATH, exist_ok=True)
4860

4961
# random file name
5062
file_name = f"{uuid4().hex}{extension}"
51-
file_path = Path(os.path.join(FileController.PRIVATE_STORE_PATH, file_name))
63+
file_path = Path(os.path.join(PrivateFileService.PRIVATE_STORE_PATH, file_name))
5264

5365
# write file
5466
with open(file_path, "wb") as f:
5567
f.write(file_data)
5668

5769
# Load file to documents
58-
reader_cls = file_loaders_map().get(extension)
59-
if reader_cls is None:
60-
raise ValueError(f"File extension {extension} is not supported")
61-
documents = reader_cls().load_data(file_path)
70+
# If LlamaParse is enabled, use it to parse the file
71+
# Otherwise, use the default file loaders
72+
reader = get_llamaparse_parser()
73+
if reader is None:
74+
reader_cls = default_file_loaders_map().get(extension)
75+
if reader_cls is None:
76+
raise ValueError(f"File extension {extension} is not supported")
77+
reader = reader_cls()
78+
documents = reader.load_data(file_path)
6279
# Add custom metadata
6380
for doc in documents:
6481
doc.metadata["private"] = "true"
6582
return documents
6683

6784
@staticmethod
6885
def process_file(base64_content: str) -> List[str]:
69-
file_data, extension = FileController.preprocess_base64_file(base64_content)
70-
documents = FileController.store_and_parse_file(file_data, extension)
86+
file_data, extension = PrivateFileService.preprocess_base64_file(base64_content)
87+
documents = PrivateFileService.store_and_parse_file(file_data, extension)
7188

7289
# Only process nodes; do not store the index
7390
pipeline = IngestionPipeline()

templates/types/streaming/nextjs/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
"duck-duck-scrape": "^2.2.5",
2525
"formdata-node": "^6.0.3",
2626
"got": "^14.4.1",
27-
"llamaindex": "0.4.6",
27+
"llamaindex": "0.4.12",
2828
"lucide-react": "^0.294.0",
2929
"next": "^14.2.4",
3030
"react": "^18.2.0",

0 commit comments

Comments
 (0)