run-llama · marcusschiesser · Oct 16, 2024 · Oct 8, 2024 · Oct 11, 2024 · Oct 11, 2024
diff --git a/templates/components/engines/python/agent/tools/artifact.py b/templates/components/engines/python/agent/tools/artifact.py
@@ -66,21 +66,29 @@ class CodeGeneratorTool:
     def __init__(self):
         pass
 
-    def artifact(self, query: str, old_code: Optional[str] = None) -> Dict:
-        """Generate a code artifact based on the input.
+    def artifact(
+        self,
+        query: str,
+        sandbox_files: Optional[List[str]] = None,
+        old_code: Optional[str] = None,
+    ) -> Dict:
+        """Generate a code artifact based on the provided input.
 
         Args:
-            query (str): The description of the application you want to build.
+            query (str): A description of the application you want to build.
+            sandbox_files (Optional[List[str]], optional): A list of sandbox file paths. Defaults to None. Include these files if the code requires them.
             old_code (Optional[str], optional): The existing code to be modified. Defaults to None.
 
         Returns:
-            Dict: A dictionary containing the generated artifact information.
+            Dict: A dictionary containing information about the generated artifact.
         """
 
         if old_code:
             user_message = f"{query}\n\nThe existing code is: \n```\n{old_code}\n```"
         else:
             user_message = query
+        if sandbox_files:
+            user_message += f"\n\nThe provided files are: \n{str(sandbox_files)}"
 
         messages: List[ChatMessage] = [
             ChatMessage(role="system", content=CODE_GENERATION_PROMPT),
@@ -90,7 +98,10 @@ def artifact(self, query: str, old_code: Optional[str] = None) -> Dict:
             sllm = Settings.llm.as_structured_llm(output_cls=CodeArtifact)  # type: ignore
             response = sllm.chat(messages)
             data: CodeArtifact = response.raw
-            return data.model_dump()
+            data_dict = data.model_dump()
+            if sandbox_files:
+                data_dict["files"] = sandbox_files
+            return data_dict
         except Exception as e:
             logger.error(f"Failed to generate artifact: {str(e)}")
             raise e

diff --git a/templates/components/engines/python/agent/tools/interpreter.py b/templates/components/engines/python/agent/tools/interpreter.py
@@ -45,7 +45,7 @@ def __init__(self, api_key: str = None):
         self.interpreter = CodeInterpreter(api_key=api_key)
 
     def __del__(self):
-        self.interpreter.close()
+        self.interpreter.kill()
 
     def get_output_path(self, filename: str) -> str:
         # if output directory doesn't exist, create it

diff --git a/templates/components/routers/python/sandbox.py b/templates/components/routers/python/sandbox.py
@@ -16,7 +16,8 @@
 import logging
 import os
 import uuid
-from typing import Dict, List, Optional, Union
+from dataclasses import asdict
+from typing import Any, Dict, List, Optional, Union
 
 from app.engine.tools.artifact import CodeArtifact
 from app.engine.utils.file_helper import save_file
@@ -36,7 +37,7 @@ class ExecutionResult(BaseModel):
     template: str
     stdout: List[str]
     stderr: List[str]
-    runtime_error: Optional[Dict[str, Union[str, List[str]]]] = None
+    runtime_error: Optional[Dict[str, Any]] = None
     output_urls: List[Dict[str, str]]
     url: Optional[str]
 
@@ -54,12 +55,19 @@ def to_response(self):
         }
 
 
+class FileUpload(BaseModel):
+    id: str
+    name: str
+
+
 @sandbox_router.post("")
 async def create_sandbox(request: Request):
     request_data = await request.json()
+    artifact_data = request_data["artifact"]
+    sandbox_files = artifact_data.get("files", [])
 
     try:
-        artifact = CodeArtifact(**request_data["artifact"])
+        artifact = CodeArtifact(**artifact_data)
     except Exception:
         logger.error(f"Could not create artifact from request data: {request_data}")
         return HTTPException(
@@ -94,6 +102,10 @@ async def create_sandbox(request: Request):
                 f"Installed dependencies: {', '.join(artifact.additional_dependencies)} in sandbox {sbx}"
             )
 
+    # Copy files
+    if len(sandbox_files) > 0:
+        _upload_files(sbx, sandbox_files)
+
     # Copy code to disk
     if isinstance(artifact.code, list):
         for file in artifact.code:
@@ -107,11 +119,12 @@ async def create_sandbox(request: Request):
     if artifact.template == "code-interpreter-multilang":
         result = sbx.notebook.exec_cell(artifact.code or "")
         output_urls = _download_cell_results(result.results)
+        runtime_error = asdict(result.error) if result.error else None
         return ExecutionResult(
             template=artifact.template,
             stdout=result.logs.stdout,
             stderr=result.logs.stderr,
-            runtime_error=result.error,
+            runtime_error=runtime_error,
             output_urls=output_urls,
             url=None,
         ).to_response()
@@ -126,6 +139,19 @@ async def create_sandbox(request: Request):
         ).to_response()
 
 
+def _upload_files(
+    sandbox: Union[CodeInterpreter, Sandbox],
+    sandbox_files: List[str] = [],
+) -> None:
+    for file_path in sandbox_files:
+        file_name = os.path.basename(file_path)
+        local_file_path = f"output/uploaded/{file_name}"
+        with open(local_file_path, "rb") as f:
+            content = f.read()
+            sandbox.files.write(file_path, content)
+    return None
+
+
 def _download_cell_results(cell_results: Optional[List]) -> List[Dict[str, str]]:
     """
     To pull results from code interpreter cell and save them to disk for serving
@@ -148,7 +174,7 @@ def _download_cell_results(cell_results: Optional[List]) -> List[Dict[str, str]]
                     output.append(
                         {
                             "type": ext,
-                            "filename": file_meta.filename,
+                            "filename": file_meta.name,
                             "url": file_meta.url,
                         }
                     )

diff --git a/templates/components/services/python/file.py b/templates/components/services/python/file.py
@@ -1,11 +1,14 @@
 import base64
 import mimetypes
 import os
+import re
+import uuid
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Tuple
 
 from app.engine.index import IndexConfig, get_index
+from app.engine.utils.file_helper import FileMetadata, save_file
 from llama_index.core import VectorStoreIndex
 from llama_index.core.ingestion import IngestionPipeline
 from llama_index.core.readers.file.base import (
@@ -31,94 +34,141 @@ def get_llamaparse_parser():
 def default_file_loaders_map():
     default_loaders = get_file_loaders_map()
     default_loaders[".txt"] = FlatReader
+    default_loaders[".csv"] = FlatReader
     return default_loaders
 
 
 class PrivateFileService:
+    """
+    To store the files uploaded by the user and add them to the index.
+    """
+
     PRIVATE_STORE_PATH = "output/uploaded"
 
     @staticmethod
-    def preprocess_base64_file(base64_content: str) -> Tuple[bytes, str | None]:
+    def _preprocess_base64_file(base64_content: str) -> Tuple[bytes, str | None]:
         header, data = base64_content.split(",", 1)
         mime_type = header.split(";")[0].split(":", 1)[1]
         extension = mimetypes.guess_extension(mime_type)
         # File data as bytes
         return base64.b64decode(data), extension
 
     @staticmethod
-    def store_and_parse_file(file_name, file_data, extension) -> List[Document]:
+    def _store_file(file_name, file_data) -> FileMetadata:
+        """
+        Store the file to the private directory and return the file metadata
+        """
         # Store file to the private directory
         os.makedirs(PrivateFileService.PRIVATE_STORE_PATH, exist_ok=True)
         file_path = Path(os.path.join(PrivateFileService.PRIVATE_STORE_PATH, file_name))
 
-        # write file
-        with open(file_path, "wb") as f:
-            f.write(file_data)
+        return save_file(file_data, file_path=str(file_path))
+
+    @staticmethod
+    def _load_file_to_documents(file_metadata: FileMetadata) -> List[Document]:
+        """
+        Load the file from the private directory and return the documents
+        """
+        extension = file_metadata.name.split(".")[-1]
 
         # Load file to documents
         # If LlamaParse is enabled, use it to parse the file
         # Otherwise, use the default file loaders
         reader = get_llamaparse_parser()
         if reader is None:
-            reader_cls = default_file_loaders_map().get(extension)
+            reader_cls = default_file_loaders_map().get(f".{extension}")
             if reader_cls is None:
                 raise ValueError(f"File extension {extension} is not supported")
             reader = reader_cls()
-        documents = reader.load_data(file_path)
+        documents = reader.load_data(Path(file_metadata.path))
         # Add custom metadata
         for doc in documents:
-            doc.metadata["file_name"] = file_name
+            doc.metadata["file_name"] = file_metadata.name
             doc.metadata["private"] = "true"
         return documents
 
     @staticmethod
+    def _add_documents_to_vector_store_index(
+        documents: List[Document], index: VectorStoreIndex
+    ) -> None:
+        """
+        Add the documents to the vector store index
+        """
+        pipeline = IngestionPipeline()
+        nodes = pipeline.run(documents=documents)
+
+        # Add the nodes to the index and persist it
+        if index is None:
+            index = VectorStoreIndex(nodes=nodes)
+        else:
+            index.insert_nodes(nodes=nodes)
+        index.storage_context.persist(
+            persist_dir=os.environ.get("STORAGE_DIR", "storage")
+        )
+
+    @staticmethod
+    def _add_file_to_llama_cloud_index(
+        index: LlamaCloudIndex,
+        file_name: str,
+        file_data: bytes,
+    ) -> None:
+        """
+        Add the file to the LlamaCloud index.
+        LlamaCloudIndex is a managed index so we can directly use the files.
+        """
+        try:
+            from app.engine.service import LLamaCloudFileService
+        except ImportError:
+            raise ValueError("LlamaCloudFileService is not found")
+
+        project_id = index._get_project_id()
+        pipeline_id = index._get_pipeline_id()
+        # LlamaCloudIndex is a managed index so we can directly use the files
+        upload_file = (file_name, BytesIO(file_data))
+        return [
+            LLamaCloudFileService.add_file_to_pipeline(
+                project_id,
+                pipeline_id,
+                upload_file,
+                custom_metadata={},
+            )
+        ]
+
+    @staticmethod
+    def _sanitize_file_name(file_name: str) -> str:
+        file_name, extension = os.path.splitext(file_name)
+        return re.sub(r"[^a-zA-Z0-9]", "_", file_name) + extension
+
+    @classmethod
     def process_file(
-        file_name: str, base64_content: str, params: Optional[dict] = None
-    ) -> List[str]:
+        cls,
+        file_name: str,
+        base64_content: str,
+        params: Optional[dict] = None,
+    ) -> FileMetadata:
         if params is None:
             params = {}
 
-        file_data, extension = PrivateFileService.preprocess_base64_file(base64_content)
-
         # Add the nodes to the index and persist it
         index_config = IndexConfig(**params)
-        current_index = get_index(index_config)
+        index = get_index(index_config)
 
-        # Insert the documents into the index
-        if isinstance(current_index, LlamaCloudIndex):
-            from app.engine.service import LLamaCloudFileService
+        # Generate a new file name if the same file is uploaded multiple times
+        file_id = str(uuid.uuid4())
+        new_file_name = f"{file_id}_{cls._sanitize_file_name(file_name)}"
+
+        # Preprocess and store the file
+        file_data, extension = cls._preprocess_base64_file(base64_content)
+        file_metadata = cls._store_file(new_file_name, file_data)
 
-            project_id = current_index._get_project_id()
-            pipeline_id = current_index._get_pipeline_id()
-            # LlamaCloudIndex is a managed index so we can directly use the files
-            upload_file = (file_name, BytesIO(file_data))
-            return [
-                LLamaCloudFileService.add_file_to_pipeline(
-                    project_id,
-                    pipeline_id,
-                    upload_file,
-                    custom_metadata={
-                        # Set private=true to mark the document as private user docs (required for filtering)
-                        "private": "true",
-                    },
-                )
-            ]
+        # Insert the file into the index
+        if isinstance(index, LlamaCloudIndex):
+            _ = cls._add_file_to_llama_cloud_index(index, new_file_name, file_data)
         else:
-            # First process documents into nodes
-            documents = PrivateFileService.store_and_parse_file(
-                file_name, file_data, extension
-            )
-            pipeline = IngestionPipeline()
-            nodes = pipeline.run(documents=documents)
-
-            # Add the nodes to the index and persist it
-            if current_index is None:
-                current_index = VectorStoreIndex(nodes=nodes)
-            else:
-                current_index.insert_nodes(nodes=nodes)
-            current_index.storage_context.persist(
-                persist_dir=os.environ.get("STORAGE_DIR", "storage")
-            )
+            documents = cls._load_file_to_documents(file_metadata)
+            cls._add_documents_to_vector_store_index(documents, index)
+            # Add document ids to the file metadata
+            file_metadata.document_ids = [doc.doc_id for doc in documents]
 
-            # Return the document ids
-            return [doc.doc_id for doc in documents]
+        # Return the file metadata
+        return file_metadata