
feat: Use LlamaParse to parse the private files #167


Merged · 6 commits · Jul 17, 2024

Changes from 2 commits
5 changes: 5 additions & 0 deletions .changeset/gold-mugs-perform.md
@@ -0,0 +1,5 @@
---
"create-llama": patch
---

Use LlamaParse for the private file uploader
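
Whether the uploader uses LlamaParse is driven by the same loader configuration that load_configs() reads; below is a minimal sketch of the relevant toggle, assuming a YAML document with a file section (the exact config file name and location are not shown in this diff).

# Minimal sketch, not part of this PR: the loader config is assumed to be a
# YAML document with a "file" section, as FileLoaderConfig(**config["file"])
# in the service below implies.
import yaml

config = yaml.safe_load("file:\n  use_llama_parse: true\n")
print(config["file"]["use_llama_parse"])  # -> True, so LlamaParse is used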
8 changes: 3 additions & 5 deletions templates/components/loaders/python/__init__.py
@@ -1,11 +1,9 @@
import os
import yaml
import importlib
import logging
from typing import Dict

import yaml
from app.engine.loaders.db import DBLoaderConfig, get_db_documents
from app.engine.loaders.file import FileLoaderConfig, get_file_documents
from app.engine.loaders.web import WebLoaderConfig, get_web_documents
from app.engine.loaders.db import DBLoaderConfig, get_db_documents

logger = logging.getLogger(__name__)

3 changes: 2 additions & 1 deletion templates/components/loaders/python/file.py
@@ -62,7 +62,8 @@ def get_file_documents(config: FileLoaderConfig):
        )
        return reader.load_data()
    except Exception as e:
        import sys, traceback
        import sys
        import traceback

        # Catch the error if the data dir is empty
        # and return an empty document list
9 changes: 5 additions & 4 deletions templates/types/streaming/fastapi/app/api/routers/upload.py
@@ -1,9 +1,10 @@
import logging
from typing import List

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from fastapi import HTTPException
from fastapi import APIRouter
from app.api.controllers.file import FileController

from app.api.services.file import PrivateFileService

file_upload_router = r = APIRouter()

@@ -18,7 +19,7 @@ class FileUploadRequest(BaseModel):
def upload_file(request: FileUploadRequest) -> List[str]:
    try:
        logger.info("Processing file")
        return FileController.process_file(request.base64)
        return PrivateFileService.process_file(request.base64)
    except Exception as e:
        logger.error(f"Error processing file: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Error processing file")
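
For reference, FileUploadRequest.base64 is expected to carry a data URL (the service later splits it on "," and parses the MIME type from the header), so a client call might look like the sketch below; the route path and host are assumptions, not shown in this hunk.

# Minimal client sketch; the route path ("/api/chat/upload") and host are
# assumptions, since the router prefix is not shown in this diff.
import base64

import requests

with open("report.pdf", "rb") as f:
    encoded = base64.b64encode(f.read()).decode()

# The service splits on "," and reads the MIME type from the data-URL header.
payload = {"base64": f"data:application/pdf;base64,{encoded}"}
response = requests.post("http://localhost:8000/api/chat/upload", json=payload)
response.raise_for_status()
print(response.json())  # the List[str] returned by PrivateFileService.process_file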
@@ -1,73 +1,90 @@
import os
import base64
import mimetypes
from uuid import uuid4
import os
from pathlib import Path
from typing import List, Dict
from typing import Dict, List
from uuid import uuid4

from app.engine.index import get_index
from llama_index.core import VectorStoreIndex
from llama_index.readers.file import FlatReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.readers.file.base import (
    _try_loading_included_file_formats as get_file_loaders_map,
)
from llama_index.core.readers.file.base import (
    default_file_metadata_func,
)
from llama_index.core.schema import Document
from llama_index.core.ingestion import IngestionPipeline
from app.engine.index import get_index

from llama_index.readers.file import FlatReader


def file_metadata_func(*args, **kwargs) -> Dict:
    default_meta = default_file_metadata_func(*args, **kwargs)
    default_meta["private"] = "true"
    return default_meta

def file_loaders_map():

def get_llamaparse_parser():
    from app.engine.loaders import load_configs
    from app.engine.loaders.file import FileLoaderConfig, llama_parse_parser

    config = load_configs()
    file_loader_config = FileLoaderConfig(**config["file"])
    if file_loader_config.use_llama_parse:
        return llama_parse_parser()
    else:
        return None

Comment on lines +27 to +36

Enhance error handling in get_llamaparse_parser.

The function dynamically decides which parser to use based on configurations. Consider adding error handling for configuration loading and validation to ensure robustness.

+    try:
+        config = load_configs()
+        file_loader_config = FileLoaderConfig(**config["file"])
+    except KeyError as e:
+        raise ConfigurationError(f"Missing configuration key: {e}")
Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def get_llamaparse_parser():
    from app.engine.loaders import load_configs
    from app.engine.loaders.file import FileLoaderConfig, llama_parse_parser
    config = load_configs()
    file_loader_config = FileLoaderConfig(**config["file"])
    if file_loader_config.use_llama_parse:
        return llama_parse_parser()
    else:
        return None

def get_llamaparse_parser():
    from app.engine.loaders import load_configs
    from app.engine.loaders.file import FileLoaderConfig, llama_parse_parser
    try:
        config = load_configs()
        file_loader_config = FileLoaderConfig(**config["file"])
    except KeyError as e:
        raise ConfigurationError(f"Missing configuration key: {e}")
    if file_loader_config.use_llama_parse:
        return llama_parse_parser()
    else:
        return None
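
Note that ConfigurationError is not defined anywhere in this diff, so the suggestion as written would raise a NameError; a minimal sketch of one way to provide it (the name and placement are assumptions, not part of the PR):

# Hypothetical exception to back the suggested change above; not part of this PR.
class ConfigurationError(Exception):
    """Raised when the loader configuration is missing or invalid."""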


def default_file_loaders_map():
    default_loaders = get_file_loaders_map()
    default_loaders[".txt"] = FlatReader
    return default_loaders



class FileController:

    PRIVATE_STORE_PATH="output/uploaded"
class PrivateFileService:
    PRIVATE_STORE_PATH = "output/uploaded"

    @staticmethod
    def preprocess_base64_file(base64_content: str) -> tuple:
        header, data = base64_content.split(",", 1)
        mime_type = header.split(";")[0].split(":", 1)[1]
        extension = mimetypes.guess_extension(mime_type)
        # File data as bytes
        data = base64.b64decode(data)
        return data, extension
        return base64.b64decode(data), extension

    @staticmethod
    def store_and_parse_file(file_data, extension) -> List[Document]:
        # Store file to the private directory
        os.makedirs(FileController.PRIVATE_STORE_PATH, exist_ok=True)
        os.makedirs(PrivateFileService.PRIVATE_STORE_PATH, exist_ok=True)

        # random file name
        file_name = f"{uuid4().hex}{extension}"
        file_path = Path(os.path.join(FileController.PRIVATE_STORE_PATH, file_name))
        file_path = Path(os.path.join(PrivateFileService.PRIVATE_STORE_PATH, file_name))

        # write file
        with open(file_path, "wb") as f:
            f.write(file_data)

        # Load file to documents
        reader_cls = file_loaders_map().get(extension)
        if reader_cls is None:
            raise ValueError(f"File extension {extension} is not supported")
        documents = reader_cls().load_data(file_path)
        # If LlamaParse is enabled, use it to parse the file
        # Otherwise, use the default file loaders
        reader = get_llamaparse_parser()
        if reader is None:
            reader_cls = default_file_loaders_map().get(extension)
            if reader_cls is None:
                raise ValueError(f"File extension {extension} is not supported")
            reader = reader_cls()
        documents = reader.load_data(file_path)
        # Add custom metadata
        for doc in documents:
            doc.metadata["private"] = "true"
        return documents

    @staticmethod
    def process_file(base64_content: str) -> List[str]:
        file_data, extension = FileController.preprocess_base64_file(base64_content)
        documents = FileController.store_and_parse_file(file_data, extension)
        file_data, extension = PrivateFileService.preprocess_base64_file(base64_content)
        documents = PrivateFileService.store_and_parse_file(file_data, extension)

        # Only process the nodes, do not store the index
        pipeline = IngestionPipeline()
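
The hunk is truncated at this point, so the remainder of process_file is not visible. Purely as a hedged sketch of what running the documents through a pipeline without storing the index could look like, given the imports at the top of this file (IngestionPipeline, get_index, VectorStoreIndex), and not necessarily what this PR actually does:

# Rough sketch only; the actual continuation of process_file is not shown here.
from typing import List

from app.engine.index import get_index
from llama_index.core import VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.schema import Document


def index_private_documents(documents: List[Document]) -> List[str]:
    # Turn the documents into nodes; the pipeline itself persists nothing.
    pipeline = IngestionPipeline()
    nodes = pipeline.run(documents=documents)

    # Insert the nodes into the application's existing index so the privately
    # uploaded content becomes queryable alongside the rest of the data.
    index = get_index()
    if index is None:  # assumption: there may be no index before first ingestion
        index = VectorStoreIndex(nodes=nodes)
    else:
        index.insert_nodes(nodes)

    return [doc.doc_id for doc in documents]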