Skip to content

upload file to sandbox #355

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 34 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
7dcbf2f
tmp
leehuwuj Oct 8, 2024
1e2502a
update private file handler
leehuwuj Oct 11, 2024
e12bd29
enhance code
leehuwuj Oct 11, 2024
1cef23c
reduce complexity
leehuwuj Oct 11, 2024
5bd3591
fix mypy
leehuwuj Oct 11, 2024
c8a9472
fix mypy
leehuwuj Oct 11, 2024
5fd25f6
remove comment
leehuwuj Oct 11, 2024
a4d3d36
support upload file and enhance interpreter tool
leehuwuj Oct 14, 2024
6efadd4
fix blocking stream event
leehuwuj Oct 14, 2024
3e82be7
fix mypy
leehuwuj Oct 14, 2024
393a926
Merge remote-tracking branch 'origin/main' into feat/upload-file-sandbox
leehuwuj Oct 14, 2024
9602c6c
add changeset and fix mypy after merge
leehuwuj Oct 14, 2024
985cb26
fix mypy
leehuwuj Oct 14, 2024
9a4c0a3
enhance code
leehuwuj Oct 14, 2024
2efc727
typing
leehuwuj Oct 14, 2024
249edf5
wording
leehuwuj Oct 15, 2024
22cd958
exclude indexing private csv file if code executor tool is enabled
leehuwuj Oct 15, 2024
30e408b
remove file content and duplicated file id
leehuwuj Oct 15, 2024
94b338a
simpler file upload
leehuwuj Oct 15, 2024
6bb7a30
support for TS
leehuwuj Oct 15, 2024
bbf321f
support file upload for artifact in TS
leehuwuj Oct 15, 2024
852e6ec
enhance file path
leehuwuj Oct 15, 2024
5ae6b57
enhance code
leehuwuj Oct 15, 2024
c64e2ba
revise vercel streaming
leehuwuj Oct 15, 2024
36cdb1e
remove redundant id
leehuwuj Oct 15, 2024
e0921fe
add show file widget to the
leehuwuj Oct 15, 2024
a3c1c55
allow upload file with empty index store
leehuwuj Oct 15, 2024
bae12e6
Merge branch 'main' into feat/upload-file-sandbox
marcusschiesser Oct 15, 2024
7d9dee2
add data scientist use case
marcusschiesser Oct 15, 2024
3b91e7b
use GPT4o model for data scientist and code artifact
marcusschiesser Oct 15, 2024
954113e
update comments
leehuwuj Oct 15, 2024
624aea7
use previewcard to render documents
marcusschiesser Oct 15, 2024
788fab0
fix: UI overlap, key warning, wrong filename and url in markdown
thucpn Oct 16, 2024
0f56092
use div as tag wrapper for message
thucpn Oct 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions templates/components/engines/python/agent/tools/artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,21 +66,29 @@ class CodeGeneratorTool:
def __init__(self):
pass

def artifact(self, query: str, old_code: Optional[str] = None) -> Dict:
"""Generate a code artifact based on the input.
def artifact(
self,
query: str,
sandbox_files: Optional[List[str]] = None,
old_code: Optional[str] = None,
) -> Dict:
"""Generate a code artifact based on the provided input.

Args:
query (str): The description of the application you want to build.
query (str): A description of the application you want to build.
sandbox_files (Optional[List[str]], optional): A list of sandbox file paths. Defaults to None. Include these files if the code requires them.
old_code (Optional[str], optional): The existing code to be modified. Defaults to None.

Returns:
Dict: A dictionary containing the generated artifact information.
Dict: A dictionary containing information about the generated artifact.
"""

if old_code:
user_message = f"{query}\n\nThe existing code is: \n```\n{old_code}\n```"
else:
user_message = query
if sandbox_files:
user_message += f"\n\nThe provided files are: \n{str(sandbox_files)}"

messages: List[ChatMessage] = [
ChatMessage(role="system", content=CODE_GENERATION_PROMPT),
Expand All @@ -90,7 +98,10 @@ def artifact(self, query: str, old_code: Optional[str] = None) -> Dict:
sllm = Settings.llm.as_structured_llm(output_cls=CodeArtifact) # type: ignore
response = sllm.chat(messages)
data: CodeArtifact = response.raw
return data.model_dump()
data_dict = data.model_dump()
if sandbox_files:
data_dict["files"] = sandbox_files
return data_dict
except Exception as e:
logger.error(f"Failed to generate artifact: {str(e)}")
raise e
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(self, api_key: str = None):
self.interpreter = CodeInterpreter(api_key=api_key)

def __del__(self):
self.interpreter.close()
self.interpreter.kill()

def get_output_path(self, filename: str) -> str:
# if output directory doesn't exist, create it
Expand Down
36 changes: 31 additions & 5 deletions templates/components/routers/python/sandbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
import logging
import os
import uuid
from typing import Dict, List, Optional, Union
from dataclasses import asdict
from typing import Any, Dict, List, Optional, Union

from app.engine.tools.artifact import CodeArtifact
from app.engine.utils.file_helper import save_file
Expand All @@ -36,7 +37,7 @@ class ExecutionResult(BaseModel):
template: str
stdout: List[str]
stderr: List[str]
runtime_error: Optional[Dict[str, Union[str, List[str]]]] = None
runtime_error: Optional[Dict[str, Any]] = None
output_urls: List[Dict[str, str]]
url: Optional[str]

Expand All @@ -54,12 +55,19 @@ def to_response(self):
}


class FileUpload(BaseModel):
    """Request schema describing a file previously uploaded by the user."""

    # Unique identifier assigned to the uploaded file.
    id: str
    # Display name of the file (as provided by the uploader).
    name: str


@sandbox_router.post("")
async def create_sandbox(request: Request):
request_data = await request.json()
artifact_data = request_data["artifact"]
sandbox_files = artifact_data.get("files", [])

try:
artifact = CodeArtifact(**request_data["artifact"])
artifact = CodeArtifact(**artifact_data)
except Exception:
logger.error(f"Could not create artifact from request data: {request_data}")
return HTTPException(
Expand Down Expand Up @@ -94,6 +102,10 @@ async def create_sandbox(request: Request):
f"Installed dependencies: {', '.join(artifact.additional_dependencies)} in sandbox {sbx}"
)

# Copy files
if len(sandbox_files) > 0:
_upload_files(sbx, sandbox_files)

# Copy code to disk
if isinstance(artifact.code, list):
for file in artifact.code:
Expand All @@ -107,11 +119,12 @@ async def create_sandbox(request: Request):
if artifact.template == "code-interpreter-multilang":
result = sbx.notebook.exec_cell(artifact.code or "")
output_urls = _download_cell_results(result.results)
runtime_error = asdict(result.error) if result.error else None
return ExecutionResult(
template=artifact.template,
stdout=result.logs.stdout,
stderr=result.logs.stderr,
runtime_error=result.error,
runtime_error=runtime_error,
output_urls=output_urls,
url=None,
).to_response()
Expand All @@ -126,6 +139,19 @@ async def create_sandbox(request: Request):
).to_response()


def _upload_files(
sandbox: Union[CodeInterpreter, Sandbox],
sandbox_files: List[str] = [],
) -> None:
for file_path in sandbox_files:
file_name = os.path.basename(file_path)
local_file_path = f"output/uploaded/{file_name}"
with open(local_file_path, "rb") as f:
content = f.read()
sandbox.files.write(file_path, content)
return None


def _download_cell_results(cell_results: Optional[List]) -> List[Dict[str, str]]:
"""
To pull results from code interpreter cell and save them to disk for serving
Expand All @@ -148,7 +174,7 @@ def _download_cell_results(cell_results: Optional[List]) -> List[Dict[str, str]]
output.append(
{
"type": ext,
"filename": file_meta.filename,
"filename": file_meta.name,
"url": file_meta.url,
}
)
Expand Down
146 changes: 98 additions & 48 deletions templates/components/services/python/file.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import base64
import mimetypes
import os
import re
import uuid
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Tuple

from app.engine.index import IndexConfig, get_index
from app.engine.utils.file_helper import FileMetadata, save_file
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how about moving save_file to file.py (this service)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also think the file helper and file service are a bit duplicated, but the idea is to separate them out to be reused in both the engine code and API code.

from llama_index.core import VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.readers.file.base import (
Expand All @@ -31,94 +34,141 @@ def get_llamaparse_parser():
def default_file_loaders_map():
default_loaders = get_file_loaders_map()
default_loaders[".txt"] = FlatReader
default_loaders[".csv"] = FlatReader
return default_loaders


class PrivateFileService:
"""
To store the files uploaded by the user and add them to the index.
"""

PRIVATE_STORE_PATH = "output/uploaded"

@staticmethod
def preprocess_base64_file(base64_content: str) -> Tuple[bytes, str | None]:
def _preprocess_base64_file(base64_content: str) -> Tuple[bytes, str | None]:
header, data = base64_content.split(",", 1)
mime_type = header.split(";")[0].split(":", 1)[1]
extension = mimetypes.guess_extension(mime_type)
# File data as bytes
return base64.b64decode(data), extension

@staticmethod
def store_and_parse_file(file_name, file_data, extension) -> List[Document]:
def _store_file(file_name, file_data) -> FileMetadata:
"""
Store the file to the private directory and return the file metadata
"""
# Store file to the private directory
os.makedirs(PrivateFileService.PRIVATE_STORE_PATH, exist_ok=True)
file_path = Path(os.path.join(PrivateFileService.PRIVATE_STORE_PATH, file_name))

# write file
with open(file_path, "wb") as f:
f.write(file_data)
return save_file(file_data, file_path=str(file_path))

@staticmethod
def _load_file_to_documents(file_metadata: FileMetadata) -> List[Document]:
"""
Load the file from the private directory and return the documents
"""
extension = file_metadata.name.split(".")[-1]
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Improve file extension extraction using os.path.splitext.

The current method of extracting the file extension may not handle filenames without extensions correctly. Using os.path.splitext provides a more reliable way to extract the file extension.

Apply this diff to fix the issue:

-extension = file_metadata.name.split(".")[-1]
+_, extension = os.path.splitext(file_metadata.name)
+extension = extension.lstrip(".")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
extension = file_metadata.name.split(".")[-1]
_, extension = os.path.splitext(file_metadata.name)
extension = extension.lstrip(".")


# Load file to documents
# If LlamaParse is enabled, use it to parse the file
# Otherwise, use the default file loaders
reader = get_llamaparse_parser()
if reader is None:
reader_cls = default_file_loaders_map().get(extension)
reader_cls = default_file_loaders_map().get(f".{extension}")
if reader_cls is None:
raise ValueError(f"File extension {extension} is not supported")
reader = reader_cls()
documents = reader.load_data(file_path)
documents = reader.load_data(Path(file_metadata.path))
# Add custom metadata
for doc in documents:
doc.metadata["file_name"] = file_name
doc.metadata["file_name"] = file_metadata.name
doc.metadata["private"] = "true"
return documents

@staticmethod
def _add_documents_to_vector_store_index(
documents: List[Document], index: VectorStoreIndex
) -> None:
"""
Add the documents to the vector store index
"""
pipeline = IngestionPipeline()
nodes = pipeline.run(documents=documents)

# Add the nodes to the index and persist it
if index is None:
index = VectorStoreIndex(nodes=nodes)
else:
index.insert_nodes(nodes=nodes)
index.storage_context.persist(
persist_dir=os.environ.get("STORAGE_DIR", "storage")
)

@staticmethod
def _add_file_to_llama_cloud_index(
index: LlamaCloudIndex,
file_name: str,
file_data: bytes,
) -> None:
"""
Add the file to the LlamaCloud index.
LlamaCloudIndex is a managed index so we can directly use the files.
"""
try:
from app.engine.service import LLamaCloudFileService
except ImportError:
raise ValueError("LlamaCloudFileService is not found")

project_id = index._get_project_id()
pipeline_id = index._get_pipeline_id()
# LlamaCloudIndex is a managed index so we can directly use the files
upload_file = (file_name, BytesIO(file_data))
return [
LLamaCloudFileService.add_file_to_pipeline(
project_id,
pipeline_id,
upload_file,
custom_metadata={},
)
]

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Inconsistent return type in _add_file_to_llama_cloud_index method.

The method _add_file_to_llama_cloud_index is annotated to return None, but it returns a list containing the result of LLamaCloudFileService.add_file_to_pipeline. Ensure the return type matches the annotation or update the annotation if a return value is intended.

Consider updating the method to return None if the return value is not needed:

-return [
-    LLamaCloudFileService.add_file_to_pipeline(
-        project_id,
-        pipeline_id,
-        upload_file,
-        custom_metadata={},
-    )
-]
+LLamaCloudFileService.add_file_to_pipeline(
+    project_id,
+    pipeline_id,
+    upload_file,
+    custom_metadata={},
+)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
LLamaCloudFileService.add_file_to_pipeline(
project_id,
pipeline_id,
upload_file,
custom_metadata={},
)
]
LLamaCloudFileService.add_file_to_pipeline(
project_id,
pipeline_id,
upload_file,
custom_metadata={},
)

@staticmethod
def _sanitize_file_name(file_name: str) -> str:
file_name, extension = os.path.splitext(file_name)
return re.sub(r"[^a-zA-Z0-9]", "_", file_name) + extension

@classmethod
def process_file(
file_name: str, base64_content: str, params: Optional[dict] = None
) -> List[str]:
cls,
file_name: str,
base64_content: str,
params: Optional[dict] = None,
) -> FileMetadata:
if params is None:
params = {}

file_data, extension = PrivateFileService.preprocess_base64_file(base64_content)

# Add the nodes to the index and persist it
index_config = IndexConfig(**params)
current_index = get_index(index_config)
index = get_index(index_config)

# Insert the documents into the index
if isinstance(current_index, LlamaCloudIndex):
from app.engine.service import LLamaCloudFileService
# Generate a new file name if the same file is uploaded multiple times
file_id = str(uuid.uuid4())
new_file_name = f"{file_id}_{cls._sanitize_file_name(file_name)}"

# Preprocess and store the file
file_data, extension = cls._preprocess_base64_file(base64_content)
file_metadata = cls._store_file(new_file_name, file_data)

project_id = current_index._get_project_id()
pipeline_id = current_index._get_pipeline_id()
# LlamaCloudIndex is a managed index so we can directly use the files
upload_file = (file_name, BytesIO(file_data))
return [
LLamaCloudFileService.add_file_to_pipeline(
project_id,
pipeline_id,
upload_file,
custom_metadata={
# Set private=true to mark the document as private user docs (required for filtering)
"private": "true",
},
)
]
# Insert the file into the index
if isinstance(index, LlamaCloudIndex):
_ = cls._add_file_to_llama_cloud_index(index, new_file_name, file_data)
else:
# First process documents into nodes
documents = PrivateFileService.store_and_parse_file(
file_name, file_data, extension
)
pipeline = IngestionPipeline()
nodes = pipeline.run(documents=documents)

# Add the nodes to the index and persist it
if current_index is None:
current_index = VectorStoreIndex(nodes=nodes)
else:
current_index.insert_nodes(nodes=nodes)
current_index.storage_context.persist(
persist_dir=os.environ.get("STORAGE_DIR", "storage")
)
documents = cls._load_file_to_documents(file_metadata)
cls._add_documents_to_vector_store_index(documents, index)
# Add document ids to the file metadata
file_metadata.document_ids = [doc.doc_id for doc in documents]

# Return the document ids
return [doc.doc_id for doc in documents]
# Return the file metadata
return file_metadata
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Ensure file_metadata.document_ids is consistently populated.

In the process_file method, when using LlamaCloudIndex, file_metadata.document_ids is not populated. If document_ids are needed later, consider updating the method to handle this case.

If document_ids are not applicable for LlamaCloudIndex, ensure that any downstream code handles file_metadata.document_ids being None or empty.

Loading
Loading