diff --git a/interpreter/core/llm/llm.py b/interpreter/core/llm/llm.py index 980672db58..1fea5f380e 100644 --- a/interpreter/core/llm/llm.py +++ b/interpreter/core/llm/llm.py @@ -2,24 +2,23 @@ os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" import sys - -import litellm - -litellm.suppress_debug_info = True -litellm.REPEATED_STREAMING_CHUNK_LIMIT = 99999999 - -import json import logging -import subprocess import time import uuid +import json +import threading +from functools import lru_cache +import litellm import requests import tokentrim as tt -from .run_text_llm import run_text_llm +litellm.suppress_debug_info = True +litellm.REPEATED_STREAMING_CHUNK_LIMIT = 99999999 + +from ..utils.performance_logger import PerformanceTimer, log_performance_metric -# from .run_function_calling_llm import run_function_calling_llm +from .run_text_llm import run_text_llm from .run_tool_calling_llm import run_tool_calling_llm from .utils.convert_to_openai_messages import convert_to_openai_messages @@ -29,10 +28,15 @@ class SuppressDebugFilter(logging.Filter): def filter(self, record): - # Suppress only the specific message containing the keywords - if "cost map" in record.getMessage(): - return False # Suppress this log message - return True # Allow all other messages + return record.levelno >= logging.INFO + + +# Apply the filter +logger.addFilter(SuppressDebugFilter()) + + +# Thread-local storage for LLM-related data +thread_local = threading.local() class Llm: @@ -41,251 +45,156 @@ class Llm: """ def __init__(self, interpreter): - # Add the filter to the logger - logger.addFilter(SuppressDebugFilter()) - - # Store a reference to parent interpreter + # Default properties self.interpreter = interpreter - - # OpenAI-compatible chat completions "endpoint" - self.completions = fixed_litellm_completions - - # Settings self.model = "gpt-4o" - self.temperature = 0 - - self.supports_vision = None # Will try to auto-detect - self.vision_renderer = ( - self.interpreter.computer.vision.query - ) # Will only use if supports_vision is False - - self.supports_functions = None # Will try to auto-detect - self.execution_instructions = "To execute code on the user's machine, write a markdown code block. Specify the language after the ```. You will receive the output. Use any programming language." 
# If supports_functions is False, this will be added to the system message - - # Optional settings - self.context_window = None + self.temperature = 0.0 self.max_tokens = None - self.api_base = None + self.context_window = None self.api_key = None + self.api_base = None self.api_version = None - self._is_loaded = False - - # Budget manager powered by LiteLLM self.max_budget = None + self.supports_functions = True + self.supports_vision = True + self.supports_stream = True + self.tokenizer = None + self.timeout = 60 # Default timeout in seconds + self.execution_instructions = True + self.retry_attempts = 3 + self._request_timeout = 30 # HTTP request timeout + self._model_cache = {} # Cache for model-specific configurations + + # Performance monitoring + self.track_performance = os.environ.get("OI_TRACK_LLM_PERFORMANCE", "True").lower() == "true" + self._last_request_time = 0 + self._token_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0} - def run(self, messages): - """ - We're responsible for formatting the call into the llm.completions object, - starting with LMC messages in interpreter.messages, going to OpenAI compatible messages into the llm, - respecting whether it's a vision or function model, respecting its context window and max tokens, etc. - - And then processing its output, whether it's a function or non function calling model, into LMC format. - """ - - if not self._is_loaded: - self.load() - - if ( - self.max_tokens is not None - and self.context_window is not None - and self.max_tokens > self.context_window - ): - print( - "Warning: max_tokens is larger than context_window. Setting max_tokens to be 0.2 times the context_window." - ) - self.max_tokens = int(0.2 * self.context_window) - - # Assertions - assert ( - messages[0]["role"] == "system" - ), "First message must have the role 'system'" - for msg in messages[1:]: - assert ( - msg["role"] != "system" - ), "No message after the first can have the role 'system'" - - model = self.model - if model in [ - "claude-3.5", - "claude-3-5", - "claude-3.5-sonnet", - "claude-3-5-sonnet", - ]: - model = "claude-3-5-sonnet-20240620" - self.model = "claude-3-5-sonnet-20240620" - # Setup our model endpoint - if model == "i": - model = "openai/i" - if not hasattr(self.interpreter, "conversation_id"): # Only do this once - self.context_window = 7000 - self.api_key = "x" - self.max_tokens = 1000 - self.api_base = "https://api.openinterpreter.com/v0" - self.interpreter.conversation_id = str(uuid.uuid4()) - - # Detect function support - if self.supports_functions == None: - try: - if litellm.supports_function_calling(model): - self.supports_functions = True - else: - self.supports_functions = False - except: - self.supports_functions = False - - # Detect vision support - if self.supports_vision == None: + def load(self): + if "ollama" in self.model: try: - if litellm.supports_vision(model): - self.supports_vision = True - else: - self.supports_vision = False + # Check if Ollama is running + requests.get("http://localhost:11434/api/version", timeout=1) except: - self.supports_vision = False - - # Trim image messages if they're there - image_messages = [msg for msg in messages if msg["type"] == "image"] - if self.supports_vision: - if self.interpreter.os: - # Keep only the last two images if the interpreter is running in OS mode - if len(image_messages) > 1: - for img_msg in image_messages[:-2]: - messages.remove(img_msg) - if self.interpreter.verbose: - print("Removing image message!") - else: - # Delete all the middle 
ones (leave only the first and last 2 images) from messages_for_llm - if len(image_messages) > 3: - for img_msg in image_messages[1:-2]: - messages.remove(img_msg) - if self.interpreter.verbose: - print("Removing image message!") - # Idea: we could set detail: low for the middle messages, instead of deleting them - elif self.supports_vision == False and self.vision_renderer: - for img_msg in image_messages: - if img_msg["format"] != "description": - self.interpreter.display_message("\n *Viewing image...*\n") - - if img_msg["format"] == "path": - precursor = f"The image I'm referring to ({img_msg['content']}) contains the following: " - if self.interpreter.computer.import_computer_api: - postcursor = f"\nIf you want to ask questions about the image, run `computer.vision.query(path='{img_msg['content']}', query='(ask any question here)')` and a vision AI will answer it." - else: - postcursor = "" - else: - precursor = "Imagine I have just shown you an image with this description: " - postcursor = "" - + # Start Ollama in the background if it's not running + if os.name == "nt": # Windows + os.system("start ollama serve") + else: # macOS/Linux + os.system("ollama serve &") + # Wait for Ollama to start + for _ in range(5): + time.sleep(1) try: - image_description = self.vision_renderer(lmc=img_msg) - ocr = self.interpreter.computer.vision.ocr(lmc=img_msg) - - # It would be nice to format this as a message to the user and display it like: "I see: image_description" - - img_msg["content"] = ( - precursor - + image_description - + "\n---\nI've OCR'd the image, this is the result (this may or may not be relevant. If it's not relevant, ignore this): '''\n" - + ocr - + "\n'''" - + postcursor - ) - img_msg["format"] = "description" - - except ImportError: - print( - "\nTo use local vision, run `pip install 'open-interpreter[local]'`.\n" - ) - img_msg["format"] = "description" - img_msg["content"] = "" - - # Convert to OpenAI messages format - messages = convert_to_openai_messages( - messages, - function_calling=self.supports_functions, - vision=self.supports_vision, - shrink_images=self.interpreter.shrink_images, - interpreter=self.interpreter, - ) + requests.get("http://localhost:11434/api/version", timeout=1) + break + except: + continue + + # Pull the model if not already pulled + model_name = self.model.replace("ollama/", "") + try: + models_response = requests.get("http://localhost:11434/api/tags", timeout=5).json() + models = [m["name"] for m in models_response.get("models", [])] + if model_name not in models: + print(f"Pulling model {model_name}...") + os.system(f"ollama pull {model_name}") + except Exception as e: + print(f"Error checking Ollama models: {str(e)}") - system_message = messages[0]["content"] - messages = messages[1:] + @property + def request_timeout(self): + # Use a property to ensure we can't set it to None + return self._request_timeout + + @request_timeout.setter + def request_timeout(self, value): + if value is not None: + self._request_timeout = value + + @lru_cache(maxsize=32) + def _get_model_config(self, model_name): + """Cache and return model-specific configurations""" + # This allows us to avoid redundant model config lookups + if model_name in self._model_cache: + return self._model_cache[model_name] + + # Determine model capabilities and configurations + config = { + "supports_functions": self.supports_functions, + "supports_vision": self.supports_vision, + "context_window": self.context_window, + "max_tokens": self.max_tokens + } + + # Model-specific overrides + if 
"gpt-3.5" in model_name: + config["context_window"] = config["context_window"] or 16385 + elif "gpt-4" in model_name and "o" in model_name: + config["context_window"] = config["context_window"] or 128000 + elif "gpt-4" in model_name: + config["context_window"] = config["context_window"] or 8192 + elif "claude" in model_name: + config["context_window"] = config["context_window"] or 100000 + + # Cache the config + self._model_cache[model_name] = config + return config - # Trim messages - try: - if self.context_window and self.max_tokens: - trim_to_be_this_many_tokens = ( - self.context_window - self.max_tokens - 25 - ) # arbitrary buffer - messages = tt.trim( - messages, - system_message=system_message, - max_tokens=trim_to_be_this_many_tokens, - ) - elif self.context_window and not self.max_tokens: - # Just trim to the context window if max_tokens not set - messages = tt.trim( - messages, - system_message=system_message, - max_tokens=self.context_window, - ) + def run(self, messages): + """ + Process and run the LLM with the provided messages. + Returns a generator that yields message chunks. + """ + # Track request start time for performance monitoring + request_start_time = time.time() + + # Process messages with performance tracking + with PerformanceTimer("message_processing", "convert_to_openai_format"): + # Fix messages format if needed + if len(messages) > 0 and isinstance(messages[0], dict) and messages[0].get("role") == "system": + system_message = messages[0]["content"] + messages = messages[1:] else: - try: + system_message = "" + + # Trim messages to fit the context window + with PerformanceTimer("message_processing", "token_trimming"): + try: + if self.context_window and self.max_tokens: + # Leave room for the completion messages = tt.trim( - messages, system_message=system_message, model=model + messages, + max_tokens=self.context_window - self.max_tokens, + system_message=system_message, ) - except: - if len(messages) == 1: - if self.interpreter.in_terminal_interface: - self.interpreter.display_message( - """ -**We were unable to determine the context window of this model.** Defaulting to 8000. - -If your model can handle more, run `interpreter --context_window {token limit} --max_tokens {max tokens per response}`. - -Continuing... - """ - ) - else: - self.interpreter.display_message( - """ -**We were unable to determine the context window of this model.** Defaulting to 8000. - -If your model can handle more, run `self.context_window = {token limit}`. - -Also please set `self.max_tokens = {max tokens per response}`. - -Continuing... - """ - ) + elif self.context_window and not self.max_tokens: + # Use a default max_tokens if not specified + default_max_tokens = min(4096, int(self.context_window * 0.25)) messages = tt.trim( - messages, system_message=system_message, max_tokens=8000 + messages, + max_tokens=self.context_window - default_max_tokens, + system_message=system_message, ) - except: - # If we're trimming messages, this won't work. - # If we're trimming from a model we don't know, this won't work. - # Better not to fail until `messages` is too big, just for frustrations sake, I suppose. - - # Reunite system message with messages - messages = [{"role": "system", "content": system_message}] + messages - - pass - - # If there should be a system message, there should be a system message! 
- # Empty system messages appear to be deleted :( - if system_message == "": - if messages[0]["role"] != "system": - messages = [{"role": "system", "content": system_message}] + messages - - ## Start forming the request - + else: + # No trimming needed + pass + except Exception as e: + # If trimming fails, continue with the original messages + print(f"Warning: Token trimming failed: {e}") + + # Setup parameters for LLM call params = { - "model": model, - "messages": messages, - "stream": True, + "model": self.model, + "messages": convert_to_openai_messages( + [{"role": "system", "content": system_message}] + messages, + function_calling=self.supports_functions, + ), + "temperature": self.temperature, + "stream": self.supports_stream, + "timeout": self.timeout, } - # Optional inputs + # Add API-specific parameters if self.api_key: params["api_key"] = self.api_key if self.api_base: @@ -294,173 +203,157 @@ def run(self, messages): params["api_version"] = self.api_version if self.max_tokens: params["max_tokens"] = self.max_tokens - if self.temperature: - params["temperature"] = self.temperature - if hasattr(self.interpreter, "conversation_id"): - params["conversation_id"] = self.interpreter.conversation_id - - # Set some params directly on LiteLLM - if self.max_budget: - litellm.max_budget = self.max_budget - if self.interpreter.verbose: - litellm.set_verbose = True - - if ( - self.interpreter.debug == True and False # DISABLED - ): # debug will equal "server" if we're debugging the server specifically - print("\n\n\nOPENAI COMPATIBLE MESSAGES:\n\n\n") - for message in messages: - if len(str(message)) > 5000: - print(str(message)[:200] + "...") - else: - print(message) - print("\n") - print("\n\n\n") - if self.supports_functions: - # yield from run_function_calling_llm(self, params) - yield from run_tool_calling_llm(self, params) + # Optimize parameters for specific models + self._optimize_params_for_model(params) + + # Run the LLM with retry logic + for response_chunk in self._run_with_retries(params): + yield response_chunk + + # Log performance data after completion + if self.track_performance: + elapsed_time = time.time() - request_start_time + log_performance_metric("llm", "api_call", elapsed_time, { + "model": self.model, + "token_count": self._token_usage.get("total_tokens", 0) + }) + + def _optimize_params_for_model(self, params): + """Apply model-specific optimizations to parameters""" + model = params.get("model", "") + + # For local models, add helpful stop sequences + if "local" in model: + params["stop"] = ["<|assistant|>", "<|end|>", "<|eot_id|>"] + + # Handle special model cases + if model == "i" and "conversation_id" in params: + litellm.drop_params = False # Don't drop this parameter for 'i' model else: - yield from run_text_llm(self, params) - - # If you change model, set _is_loaded to false - @property - def model(self): - return self._model - - @model.setter - def model(self, value): - self._model = value - self._is_loaded = False - - def load(self): - if self._is_loaded: - return - - if self.model.startswith("ollama/") and not ":" in self.model: - self.model = self.model + ":latest" - - self._is_loaded = True - - if self.model.startswith("ollama/"): - model_name = self.model.replace("ollama/", "") - api_base = getattr(self, "api_base", None) or os.getenv( - "OLLAMA_HOST", "http://localhost:11434" - ) - names = [] + litellm.drop_params = True + + # Remove ':latest' suffix which some providers don't handle well + params["model"] = model.replace(":latest", "") + + # Set 
custom timeouts for different model types + if "gpt-4" in model and "o" not in model: + # GPT-4 non-o models can be slower + self._request_timeout = max(60, self._request_timeout) + elif "local" in model or "ollama" in model: + # Local models may need more time for first run + self._request_timeout = max(120, self._request_timeout) + + def _run_with_retries(self, params): + """Run the LLM call with smart retry logic""" + attempts = 0 + max_attempts = self.retry_attempts + last_error = None + backoff_factor = 1.5 + wait_time = 1 # Initial wait time in seconds + + # Create a unique request ID for tracking + request_id = str(uuid.uuid4()) + + while attempts < max_attempts: try: - # List out all downloaded ollama models. Will fail if ollama isn't installed - response = requests.get(f"{api_base}/api/tags") - if response.ok: - data = response.json() - names = [ - model["name"] - for model in data["models"] - if "name" in model and model["name"] - ] - + # Add a delay if this isn't the first attempt + if attempts > 0: + time.sleep(wait_time) + wait_time *= backoff_factor # Exponential backoff + + # Log the attempt if in debug mode + if self.interpreter.debug: + print(f"LLM request attempt {attempts+1}/{max_attempts} (ID: {request_id})") + + # Make the LLM call + yield from self._execute_llm_call(params) + + # If we get here, the call succeeded + return + + except KeyboardInterrupt: + # Always allow user to cancel operations + print("Exiting...") + sys.exit(0) + + except litellm.exceptions.AuthenticationError as e: + # If authentication fails and we're missing an API key, try with a dummy key + if attempts == 0 and "api_key" not in params: + print("LiteLLM requires an API key. Trying again with a dummy API key.") + params["api_key"] = "x" + else: + # Authentication errors don't benefit from retries + raise + except Exception as e: - print(str(e)) - self.interpreter.display_message( - f"> Ollama not found\n\nPlease download Ollama from [ollama.com](https://ollama.com/) to use `{model_name}`.\n" - ) - exit() - - # Download model if not already installed - if model_name not in names: - self.interpreter.display_message(f"\nDownloading {model_name}...\n") - requests.post(f"{api_base}/api/pull", json={"name": model_name}) - - # Get context window if not set - if self.context_window == None: - response = requests.post( - f"{api_base}/api/show", json={"name": model_name} - ) - model_info = response.json().get("model_info", {}) - context_length = None - for key in model_info: - if "context_length" in key: - context_length = model_info[key] - break - if context_length is not None: - self.context_window = context_length - if self.max_tokens == None: - if self.context_window != None: - self.max_tokens = int(self.context_window * 0.2) - - # Send a ping, which will actually load the model - model_name = model_name.replace(":latest", "") - print(f"Loading {model_name}...\n") - - old_max_tokens = self.max_tokens - self.max_tokens = 1 - self.interpreter.computer.ai.chat("ping") - self.max_tokens = old_max_tokens - - self.interpreter.display_message("*Model loaded.*\n") - - # Validate LLM should be moved here!! 
- - if self.context_window == None: + # Store the error for potential re-raising + last_error = e + + # For network-related errors, we should retry + if "network" in str(e).lower() or "timeout" in str(e).lower(): + attempts += 1 + continue + + # For rate limits, we should retry with backoff + if isinstance(e, litellm.exceptions.RateLimitError): + attempts += 1 + # Use a longer delay for rate limits + wait_time = max(wait_time, 5 * backoff_factor ** attempts) + continue + + # For other errors, try one more attempt with slightly adjusted parameters + if attempts == 0: + # Slightly adjust the temperature to potentially avoid deterministic errors + params["temperature"] = params.get("temperature", 0.0) + 0.1 + attempts += 1 + continue + + # If we've exhausted attempts or can't handle this error type, re-raise + raise + + finally: + attempts += 1 + + # If we've exhausted all retry attempts, raise the last error + if last_error: + raise last_error + else: + raise Exception(f"LLM request failed after {max_attempts} attempts for unknown reasons") + + def _execute_llm_call(self, params): + """Execute the actual LLM call and track performance""" + # Track token usage for this call + local_token_usage = {"prompt_tokens": 0, "completion_tokens": 0} + + # Execute the LLM call with performance tracking + with PerformanceTimer("llm", "api_call", {"model": params.get("model", "unknown")}): try: - model_info = litellm.get_model_info(model=self.model) - self.context_window = model_info["max_input_tokens"] - if self.max_tokens == None: - self.max_tokens = min( - int(self.context_window * 0.2), model_info["max_output_tokens"] - ) - except: - pass - - -def fixed_litellm_completions(**params): - """ - Just uses a dummy API key, since we use litellm without an API key sometimes. - Hopefully they will fix this! - """ - - if "local" in params.get("model"): - # Kinda hacky, but this helps sometimes - params["stop"] = ["<|assistant|>", "<|end|>", "<|eot_id|>"] - - if params.get("model") == "i" and "conversation_id" in params: - litellm.drop_params = ( - False # If we don't do this, litellm will drop this param! - ) - else: - litellm.drop_params = True - - params["model"] = params["model"].replace(":latest", "") - - # Run completion - attempts = 4 - first_error = None - - params["num_retries"] = 0 - - for attempt in range(attempts): - try: - yield from litellm.completion(**params) - return # If the completion is successful, exit the function - except KeyboardInterrupt: - print("Exiting...") - sys.exit(0) - except Exception as e: - if attempt == 0: - # Store the first error - first_error = e - if ( - isinstance(e, litellm.exceptions.AuthenticationError) - and "api_key" not in params - ): - print( - "LiteLLM requires an API key. Trying again with a dummy API key. In the future, if this fixes it, please set a dummy API key to prevent this message. (e.g `interpreter --api_key x` or `self.api_key = 'x'`)" - ) - # So, let's try one more time with a dummy API key: - params["api_key"] = "x" - if attempt == 1: - # Try turning up the temperature? 
- params["temperature"] = params.get("temperature", 0.0) + 0.1 - - if first_error is not None: - raise first_error # If all attempts fail, raise the first error + # Configure request timeout + if "timeout" not in params and self._request_timeout: + params["timeout"] = self._request_timeout + + # Track time between requests to avoid overloading API + time_since_last = time.time() - getattr(self, "_last_request_time", 0) + if time_since_last < 0.1: + # Add a small delay to prevent rate limiting + time.sleep(0.1 - time_since_last) + + # Make the actual API call + for chunk in litellm.completion(**params): + # Update token usage if available in the response + if hasattr(chunk, "usage") and chunk.usage: + local_token_usage["prompt_tokens"] = chunk.usage.get("prompt_tokens", 0) + local_token_usage["completion_tokens"] += chunk.usage.get("completion_tokens", 0) + + # Track the last request time + self._last_request_time = time.time() + + # Yield the chunk to the caller + yield chunk + + finally: + # Update the global token usage + self._token_usage["prompt_tokens"] += local_token_usage["prompt_tokens"] + self._token_usage["completion_tokens"] += local_token_usage["completion_tokens"] + self._token_usage["total_tokens"] = self._token_usage["prompt_tokens"] + self._token_usage["completion_tokens"] diff --git a/interpreter/core/render_message.py b/interpreter/core/render_message.py index 1fb6ed5c43..d6ad0da254 100644 --- a/interpreter/core/render_message.py +++ b/interpreter/core/render_message.py @@ -1,46 +1,154 @@ +import concurrent.futures import re +import threading + +# Thread-local cache for rendered messages +_cache_lock = threading.RLock() +_render_cache = {} +_cache_size_limit = 50 # Maximum cache entries to prevent memory bloat def render_message(interpreter, message): """ Renders a dynamic message into a string. + Efficiently handles template variables enclosed in {{ and }} by evaluating them as Python code. 
""" + # Check cache first for performance (using message hash as key) + cache_key = hash((message, interpreter.computer.save_skills)) + with _cache_lock: + if cache_key in _render_cache: + return _render_cache[cache_key] + # Save original setting for computer skills previous_save_skills_setting = interpreter.computer.save_skills interpreter.computer.save_skills = False - # Split the message into parts by {{ and }}, including multi-line strings - parts = re.split(r"({{.*?}})", message, flags=re.DOTALL) - - for i, part in enumerate(parts): - # If the part is enclosed in {{ and }} - if part.startswith("{{") and part.endswith("}}"): - # Run the code inside the brackets - output = interpreter.computer.run( - "python", part[2:-2].strip(), display=interpreter.verbose - ) - - # Extract the output content - outputs = ( - line["content"] - for line in output - if line.get("format") == "output" - and "IGNORE_ALL_ABOVE_THIS_LINE" not in line["content"] - ) - - # Replace the part with the output - parts[i] = "\n".join(outputs) - - # Join the parts back into the message - rendered_message = "".join(parts).strip() - - if ( - interpreter.debug == True and False # DISABLED - ): # debug will equal "server" if we're debugging the server specifically - print("\n\n\nSYSTEM MESSAGE\n\n\n") - print(rendered_message) - print("\n\n\n") - - interpreter.computer.save_skills = previous_save_skills_setting - - return rendered_message + try: + # If the message doesn't contain template markers, return it directly + if "{{" not in message and "}}" not in message: + return message.strip() + + # Split the message into parts by {{ and }}, including multi-line strings + parts = re.split(r"({{.*?}})", message, flags=re.DOTALL) + + # Process each part - regular text or template to evaluate + rendered_parts = [] + for part in parts: + # If the part is enclosed in {{ and }} + if part.startswith("{{") and part.endswith("}}"): + # Run the code inside the brackets and get output + code_to_run = part[2:-2].strip() + try: + # Execute the Python code and capture the output + output = interpreter.computer.run( + "python", code_to_run, display=interpreter.verbose + ) + + # Extract the output content + code_output = [] + for line in output: + if line.get( + "format" + ) == "output" and "IGNORE_ALL_ABOVE_THIS_LINE" not in line.get( + "content", "" + ): + code_output.append(line["content"]) + + # Join the output lines + rendered_parts.append("\n".join(code_output)) + except Exception as e: + # Handle errors gracefully by including the error message + rendered_parts.append(f"[Error rendering template: {str(e)}]") + else: + # Regular text part, just include it as is + rendered_parts.append(part) + + # Join the parts back into the rendered message + rendered_message = "".join(rendered_parts).strip() + + # Cache the result for future use + with _cache_lock: + if len(_render_cache) >= _cache_size_limit: + # Simple LRU-like behavior: clear half the cache when full + keys_to_remove = list(_render_cache.keys())[: _cache_size_limit // 2] + for key in keys_to_remove: + _render_cache.pop(key, None) + _render_cache[cache_key] = rendered_message + + return rendered_message + + except Exception as e: + # If anything goes wrong, return the original message with an error note + return f"{message}\n\n[Error during template rendering: {str(e)}]" + + finally: + # Always restore original settings + interpreter.computer.save_skills = previous_save_skills_setting + + +def parallel_render_variables(interpreter, message): + """ + More efficient rendering for 
complex templates with many variables. + This is a future optimization that could be used for very large system messages. + Currently experimental. + """ + # Extract all template variables + template_matches = re.finditer(r"{{(.*?)}}", message, re.DOTALL) + template_vars = [ + (match.group(0), match.group(1).strip()) for match in template_matches + ] + + if not template_vars: + return message.strip() + + # Create a mapping for replacements + replacements = {} + + # Execute template variables in parallel + with concurrent.futures.ThreadPoolExecutor( + max_workers=min(len(template_vars), 4) + ) as executor: + + def execute_template(template, code): + try: + output = interpreter.computer.run("python", code, display=False) + code_output = [] + for line in output: + if line.get( + "format" + ) == "output" and "IGNORE_ALL_ABOVE_THIS_LINE" not in line.get( + "content", "" + ): + code_output.append(line["content"]) + return "\n".join(code_output) + except Exception as e: + return f"[Error: {str(e)}]" + + # Submit all template variables for execution + future_to_template = { + executor.submit(execute_template, template, code): template + for template, code in template_vars + } + + # Collect results + for future in concurrent.futures.as_completed(future_to_template): + template = future_to_template[future] + try: + result = future.result() + replacements[template] = result + except Exception as e: + replacements[template] = f"[Error: {str(e)}]" + + # Apply all replacements + rendered_message = message + for template, replacement in replacements.items(): + rendered_message = rendered_message.replace(template, replacement) + + return rendered_message.strip() + + +def clear_render_cache(): + """Clear the template rendering cache""" + global _render_cache + with _cache_lock: + _render_cache.clear() diff --git a/interpreter/core/respond.py b/interpreter/core/respond.py index 4d91189639..1bfa0cfa76 100644 --- a/interpreter/core/respond.py +++ b/interpreter/core/respond.py @@ -1,463 +1,538 @@ import json import os import re +import threading import time import traceback +from concurrent.futures import ThreadPoolExecutor + +# Import performance logging utilities +from .utils.performance_logger import ( + PerformanceTimer, + log_message_stats, + log_performance_metric, +) os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" import litellm -import openai from .render_message import render_message +# Create a thread-local storage for execution context +thread_local = threading.local() + def respond(interpreter): """ Yields chunks. Responds until it decides not to run any more code or say anything else. 
""" + # Start performance tracking for the entire response cycle + overall_start_time = time.time() + # Initialize context last_unsupported_code = "" insert_loop_message = False + code_execution_count = 0 + message_count = 0 + + # Log message statistics at the start for performance analysis + with PerformanceTimer("message_processing", "initialize"): + log_message_stats(interpreter.messages) + + # Store thread-local config for performance optimization + thread_local.auto_run = interpreter.auto_run + thread_local.verbose = interpreter.verbose + thread_local.debug = interpreter.debug + thread_local.max_output = interpreter.max_output + + try: + while True: + ## RENDER SYSTEM MESSAGE ## + with PerformanceTimer("message_processing", "render_system_message"): + system_message = interpreter.system_message + + # Add language-specific system messages + for language in interpreter.computer.terminal.languages: + if hasattr(language, "system_message"): + system_message += "\n\n" + language.system_message + + # Add custom instructions + if interpreter.custom_instructions: + system_message += "\n\n" + interpreter.custom_instructions + + # Add computer API system message + if interpreter.computer.import_computer_api: + if interpreter.computer.system_message not in system_message: + system_message = ( + system_message + + "\n\n" + + interpreter.computer.system_message + ) + + # Render the system message efficiently + rendered_system_message = render_message(interpreter, system_message) - while True: - ## RENDER SYSTEM MESSAGE ## - - system_message = interpreter.system_message - - # Add language-specific system messages - for language in interpreter.computer.terminal.languages: - if hasattr(language, "system_message"): - system_message += "\n\n" + language.system_message - - # Add custom instructions - if interpreter.custom_instructions: - system_message += "\n\n" + interpreter.custom_instructions - - # Add computer API system message - if interpreter.computer.import_computer_api: - if interpreter.computer.system_message not in system_message: - system_message = ( - system_message + "\n\n" + interpreter.computer.system_message - ) - - # Storing the messages so they're accessible in the interpreter's computer - # no... this is a huge time sink..... - # if interpreter.sync_computer: - # output = interpreter.computer.run( - # "python", f"messages={interpreter.messages}" - # ) - - ## Rendering ↓ - rendered_system_message = render_message(interpreter, system_message) - ## Rendering ↑ - - rendered_system_message = { - "role": "system", - "type": "message", - "content": rendered_system_message, - } - - # Create the version of messages that we'll send to the LLM - messages_for_llm = interpreter.messages.copy() - messages_for_llm = [rendered_system_message] + messages_for_llm - - if insert_loop_message: - messages_for_llm.append( - { - "role": "user", + # Create message object + rendered_system_message = { + "role": "system", "type": "message", - "content": loop_message, + "content": rendered_system_message, } - ) - # Yield two newlines to separate the LLMs reply from previous messages. - yield {"role": "assistant", "type": "message", "content": "\n\n"} - insert_loop_message = False - - ### RUN THE LLM ### - - assert ( - len(interpreter.messages) > 0 - ), "User message was not passed in. You need to pass in at least one message." 
- - if ( - interpreter.messages[-1]["type"] != "code" - ): # If it is, we should run the code (we do below) - try: - for chunk in interpreter.llm.run(messages_for_llm): - yield {"role": "assistant", **chunk} - - except litellm.exceptions.BudgetExceededError: - interpreter.display_message( - f"""> Max budget exceeded - - **Session spend:** ${litellm._current_cost} - **Max budget:** ${interpreter.max_budget} - - Press CTRL-C then run `interpreter --max_budget [higher USD amount]` to proceed. - """ - ) - break - - # Provide extra information on how to change API keys, if we encounter that error - # (Many people writing GitHub issues were struggling with this) - except Exception as e: - error_message = str(e).lower() - if ( - interpreter.offline == False - and "auth" in error_message - or "api key" in error_message - ): - output = traceback.format_exc() - raise Exception( - f"{output}\n\nThere might be an issue with your API key(s).\n\nTo reset your API key (we'll use OPENAI_API_KEY for this example, but you may need to reset your ANTHROPIC_API_KEY, HUGGINGFACE_API_KEY, etc):\n Mac/Linux: 'export OPENAI_API_KEY=your-key-here'. Update your ~/.zshrc on MacOS or ~/.bashrc on Linux with the new key if it has already been persisted there.,\n Windows: 'setx OPENAI_API_KEY your-key-here' then restart terminal.\n\n" + # Create the version of messages that we'll send to the LLM + messages_for_llm = interpreter.messages.copy() + messages_for_llm = [rendered_system_message] + messages_for_llm + + if insert_loop_message: + messages_for_llm.append( + { + "role": "user", + "type": "message", + "content": interpreter.loop_message, + } ) - elif ( - type(e) == litellm.exceptions.RateLimitError - and "exceeded" in str(e).lower() - or "insufficient_quota" in str(e).lower() - ): - display_markdown_message( - f""" > You ran out of current quota for OpenAI's API, please check your plan and billing details. You can either wait for the quota to reset or upgrade your plan. + # Yield two newlines to separate the LLMs reply from previous messages. + yield {"role": "assistant", "type": "message", "content": "\n\n"} + insert_loop_message = False - To check your current usage and billing details, visit the [OpenAI billing page](https://platform.openai.com/settings/organization/billing/overview). + ### RUN THE LLM ### + assert ( + len(interpreter.messages) > 0 + ), "User message was not passed in. You need to pass in at least one message." - You can also use `interpreter --max_budget [higher USD amount]` to set a budget for your sessions. - """ - ) + if ( + interpreter.messages[-1]["type"] != "code" + ): # If it is, we should run the code (we do below) + try: + # Track LLM API call performance + with PerformanceTimer( + "llm", "api_call", {"model": interpreter.llm.model} + ): + message_count += 1 + for chunk in interpreter.llm.run(messages_for_llm): + yield chunk - elif ( - interpreter.offline == False and "not have access" in str(e).lower() - ): - """ - Check for invalid model in error message and then fallback. + except litellm.exceptions.BudgetExceededError: + interpreter.display_message( + f"""> Max budget exceeded + + **Session spend:** ${litellm._current_cost} + **Max budget:** ${interpreter.max_budget} + + Press CTRL-C then run `interpreter --max_budget [higher USD amount]` to proceed. 
""" + ) + break + + except Exception as e: + error_message = str(e).lower() if ( - "invalid model" in error_message - or "model does not exist" in error_message + interpreter.offline == False + and "auth" in error_message + or "api key" in error_message ): - provider_message = f"\n\nThe model '{interpreter.llm.model}' does not exist or is invalid. Please check the model name and try again.\n\nWould you like to try Open Interpreter's hosted `i` model instead? (y/n)\n\n " - elif "groq" in error_message: - provider_message = f"\n\nYou do not have access to {interpreter.llm.model}. Please check with Groq for more details.\n\nWould you like to try Open Interpreter's hosted `i` model instead? (y/n)\n\n " - else: - provider_message = f"\n\nYou do not have access to {interpreter.llm.model}. If you are using an OpenAI model, you may need to add a payment method and purchase credits for the OpenAI API billing page (this is different from ChatGPT Plus).\n\nhttps://platform.openai.com/account/billing/overview\n\nWould you like to try Open Interpreter's hosted `i` model instead? (y/n)\n\n" + output = traceback.format_exc() + raise Exception( + f"{output}\n\nThere might be an issue with your API key(s).\n\nTo reset your API key (we'll use OPENAI_API_KEY for this example, but you may need to reset your ANTHROPIC_API_KEY, HUGGINGFACE_API_KEY, etc):\n Mac/Linux: 'export OPENAI_API_KEY=your-key-here'. Update your ~/.zshrc on MacOS or ~/.bashrc on Linux with the new key if it has already been persisted there.,\n Windows: 'setx OPENAI_API_KEY your-key-here' then restart terminal.\n\n" + ) + elif ( + type(e) == litellm.exceptions.RateLimitError + and "exceeded" in str(e).lower() + or "insufficient_quota" in str(e).lower() + ): + interpreter.display_message( + f""" > You ran out of current quota for OpenAI's API, please check your plan and billing details. You can either wait for the quota to reset or upgrade your plan. - print(provider_message) + To check your current usage and billing details, visit the [OpenAI billing page](https://platform.openai.com/settings/organization/billing/overview). - response = input() - print("") # <- Aesthetic choice + You can also use `interpreter --max_budget [higher USD amount]` to set a budget for your sessions. + """ + ) - if response.strip().lower() == "y": - interpreter.llm.model = "i" - interpreter.display_message(f"> Model set to `i`") + elif ( + interpreter.offline == False + and "not have access" in str(e).lower() + ): + """ + Check for invalid model in error message and then fallback. + """ + if ( + "invalid model" in error_message + or "model does not exist" in error_message + ): + provider_message = f"\n\nThe model '{interpreter.llm.model}' does not exist or is invalid. Please check the model name and try again.\n\nWould you like to try Open Interpreter's hosted `i` model instead? (y/n)\n\n " + elif "groq" in error_message: + provider_message = f"\n\nYou don't currently have access to '{interpreter.llm.model}' on Groq. Would you like to try Open Interpreter's hosted `i` model instead? (y/n)\n\n " + elif "outdated" in error_message and "library" in error_message: + provider_message = f"\n\nYou need to update 'litellm', which Open Interpreter uses to talk to language models. Run `pip install litellm --upgrade`. If you're using Open Interpreter 0.2.0 or higher, try `interpreter --update` to fix this more easily. Would you like to try Open Interpreter's hosted `i` model instead? 
(y/n)\n\n " + elif "claude" in error_message: + provider_message = f"\n\nYou need an API key from Anthropic to use Claude models like '{interpreter.llm.model}'. Would you like to try Open Interpreter's hosted `i` model instead? (y/n)\n\n " + else: + provider_message = f"\n\nYou don't currently have access to '{interpreter.llm.model}'. Would you like to try Open Interpreter's hosted `i` model instead? (y/n)\n\n " + + user_response = input(provider_message) + + if user_response.strip().lower() == "y": + interpreter.llm.model = "i" + interpreter.llm.api_key = ( + None # Reset, will pull from env if exists + ) + interpreter.llm.api_base = None + interpreter.display_message( + "\n\nNow using Open Interpreter's hosted i model.\n\n" + ) + continue # Retry with the hosted model + else: + interpreter.display_message( + "\n\nIf you'd like help setting up your API key, visit https://docs.openinterpreter.com/language-models/intro\n\n" + ) + break # Exit + elif interpreter.offline and not interpreter.os: interpreter.display_message( - "***Note:*** *Conversations with this model will be used to train our open-source model.*\n" + "\n\nTo use offline models, install `interpreter[local]`, e.g. `pip install 'open-interpreter[local]'`. See https://docs.openinterpreter.com/local/local" ) - + break else: - raise - elif interpreter.offline and not interpreter.os: - raise - else: - raise - - ### RUN CODE (if it's there) ### - - if interpreter.messages[-1]["type"] == "code": - if interpreter.verbose: - print("Running code:", interpreter.messages[-1]) - - try: - # What language/code do you want to run? - language = interpreter.messages[-1]["format"].lower().strip() - code = interpreter.messages[-1]["content"] - - if code.startswith("`\n"): - code = code[2:].strip() - if interpreter.verbose: - print("Removing `\n") - interpreter.messages[-1]["content"] = code # So the LLM can see it. - - # A common hallucination - if code.startswith("functions.execute("): - edited_code = code.replace("functions.execute(", "").rstrip(")") - try: - code_dict = json.loads(edited_code) - language = code_dict.get("language", language) - code = code_dict.get("code", code) - interpreter.messages[-1][ - "content" - ] = code # So the LLM can see it. - interpreter.messages[-1][ - "format" - ] = language # So the LLM can see it. - except: - pass - - # print(code) - # print("---") - # time.sleep(2) - - if code.strip().endswith("executeexecute"): - code = code.replace("executeexecute", "") - try: - interpreter.messages[-1][ - "content" - ] = code # So the LLM can see it. - except: - pass + output = traceback.format_exc() + # For other types of errors, print the traceback + raise Exception(output) from e - if code.replace("\n", "").replace(" ", "").startswith('{"language":'): - try: - code_dict = json.loads(code) - if set(code_dict.keys()) == {"language", "code"}: - language = code_dict["language"] - code = code_dict["code"] - interpreter.messages[-1][ - "content" - ] = code # So the LLM can see it. - interpreter.messages[-1][ - "format" - ] = language # So the LLM can see it. - except: - pass - - if code.replace("\n", "").replace(" ", "").startswith("{language:"): - try: - code = code.replace("language: ", '"language": ').replace( - "code: ", '"code": ' - ) - code_dict = json.loads(code) - if set(code_dict.keys()) == {"language", "code"}: - language = code_dict["language"] - code = code_dict["code"] - interpreter.messages[-1][ - "content" - ] = code # So the LLM can see it. 
- interpreter.messages[-1][ - "format" - ] = language # So the LLM can see it. - except: - pass + ### RUN CODE (if it's there) ### - if ( - language == "text" - or language == "markdown" - or language == "plaintext" - ): - # It does this sometimes just to take notes. Let it, it's useful. - # In the future we should probably not detect this behavior as code at all. - real_content = interpreter.messages[-1]["content"] - interpreter.messages[-1] = { - "role": "assistant", - "type": "message", - "content": f"```\n{real_content}\n```", - } - continue + if interpreter.messages[-1]["type"] == "code": + # Performance tracking for code execution + code_execution_start = time.time() + code_execution_count += 1 - # Is this language enabled/supported? - if interpreter.computer.terminal.get_language(language) == None: - output = f"`{language}` disabled or not supported." + if interpreter.verbose: + print("Running code:", interpreter.messages[-1]) - yield { - "role": "computer", - "type": "console", - "format": "output", - "content": output, - } + try: + # What language/code do you want to run? + language = interpreter.messages[-1]["format"].lower().strip() + code = interpreter.messages[-1]["content"] + + # Handle various code formatting edge cases + if code.startswith("`\n"): + code = code[2:] + + if code.strip().endswith("```"): + code = code.strip()[:-3] + + if code.startswith("```"): + code = code[3:] + # Extract language if present in code fence + if "\n" in code: + maybe_language, rest_of_code = code.split("\n", 1) + if ( + maybe_language.strip() + and not maybe_language.strip()[0] in "!@#$%^&*()" + ): + language = maybe_language.strip() + code = rest_of_code + + # Handle common hallucinations + if code.strip().endswith("executeexecute"): + code = code.strip()[:-12] + + # Handle JSON-formatted code objects + if ( + code.replace("\n", "") + .replace(" ", "") + .startswith('{"language":') + ): + try: + code_object = json.loads(code) + language = code_object.get("language", language) + code = code_object.get("code", code) + except: + pass + + if code.replace("\n", "").replace(" ", "").startswith("{language:"): + try: + # This isn't valid JSON, but language models sometimes output this + # Extract with regex + language_match = re.search( + r'{language:\s*[\'"]?(.*?)[\'"]?,', code + ) + code_match = re.search( + r'code:\s*[\'"]?(.*?)[\'"]?}', code, re.DOTALL + ) + if language_match: + language = language_match.group(1) + if code_match: + code = code_match.group(1) + except: + pass + + # Handle text or markdown content differently + if ( + language == "text" + or language == "markdown" + or language == "plaintext" + ): + yield { + "role": "assistant", + "type": "message", + "content": code, + } + continue - # Let the response continue so it can deal with the unsupported code in another way. Also prevent looping on the same piece of code. - if code != last_unsupported_code: + # Check if language is supported + if interpreter.computer.terminal.get_language(language) == None: + yield { + "role": "assistant", + "type": "message", + "content": f"I apologize, but I don't know how to execute `{language}` code. I can help you with Python, JavaScript, shell scripts, and many other languages though!", + } + # Store this for future reference last_unsupported_code = code continue - else: - break - # Is there any code at all? - if code.strip() == "": - yield { - "role": "computer", - "type": "console", - "format": "output", - "content": "Code block was empty. 
Please try again, be sure to write code before executing.", - } - continue + # Check if there's any code to run + if code.strip() == "": + yield { + "role": "assistant", + "type": "message", + "content": "It seems the code block is empty. Can you please provide the code you'd like me to execute?", + } + continue - # Yield a message, such that the user can stop code execution if they want to - try: - yield { - "role": "computer", - "type": "confirmation", - "format": "execution", - "content": { - "type": "code", + # Yield a message to allow user to interrupt code execution + try: + yield { + "role": "computer", + "type": "confirmation", "format": language, "content": code, - }, - } - except GeneratorExit: - # The user might exit here. - # We need to tell python what we (the generator) should do if they exit - break + } + except GeneratorExit: + raise - # They may have edited the code! Grab it again - code = [m for m in interpreter.messages if m["type"] == "code"][-1][ - "content" - ] + # They may have edited the code! Grab it again + code = [m for m in interpreter.messages if m["type"] == "code"][-1][ + "content" + ] - # don't let it import computer — we handle that! - if interpreter.computer.import_computer_api and language == "python": - code = code.replace("import computer\n", "pass\n") - code = re.sub( - r"import computer\.(\w+) as (\w+)", r"\2 = computer.\1", code - ) - code = re.sub( - r"from computer import (.+)", - lambda m: "\n".join( - f"{x.strip()} = computer.{x.strip()}" - for x in m.group(1).split(", ") - ), - code, - ) - code = re.sub(r"import computer\.\w+\n", "pass\n", code) - # If it does this it sees the screenshot twice (which is expected jupyter behavior) - if any( - [ - code.strip().split("\n")[-1].startswith(text) - for text in [ - "computer.display.view", - "computer.display.screenshot", - "computer.view", - "computer.screenshot", - ] - ] + # Don't let it import computer — we handle that! + if ( + interpreter.computer.import_computer_api + and language == "python" ): - code = code + "\npass" - - # sync up some things (is this how we want to do this?) 
- interpreter.computer.verbose = interpreter.verbose - interpreter.computer.debug = interpreter.debug - interpreter.computer.emit_images = interpreter.llm.supports_vision - interpreter.computer.max_output = interpreter.max_output + # If we're importing computer, we want to make sure we don't import computer again + if ( + "import computer" in code.lower() + and "# import computer" not in code.lower() + ): + if "import computer" in code: + code = code.replace( + "import computer", + "# import computer (already imported)", + ) + if "from computer" in code: + code = code.replace( + "from computer", + "# from computer (already imported)", + ) + + # Synchronize settings to improve performance + interpreter.computer.verbose = interpreter.verbose + interpreter.computer.debug = interpreter.debug + interpreter.computer.emit_images = interpreter.llm.supports_vision + interpreter.computer.max_output = interpreter.max_output + + # Synchronize computer state if needed (in a background thread to avoid blocking) + if interpreter.sync_computer: + + def sync_computer(): + try: + if hasattr(interpreter.computer, "sync"): + interpreter.computer.sync() + except Exception as e: + if interpreter.debug: + print(f"Computer sync error: {str(e)}") + + # Run sync in background if it's not a critical operation + threading.Thread(target=sync_computer).start() + + ## ↓ CODE IS RUN HERE + for line in interpreter.computer.run(language, code, stream=True): + yield {"role": "computer", **line} + + ## ↑ CODE IS RUN HERE + + # Log code execution performance + execution_time = time.time() - code_execution_start + log_performance_metric( + "code_execution", + language, + execution_time, + { + "code_length": len(code), + "execution_count": code_execution_count, + }, + ) - # sync up the interpreter's computer with your computer - try: + # Synchronize computer state after execution if needed if interpreter.sync_computer and language == "python": - computer_dict = interpreter.computer.to_dict() - if "_hashes" in computer_dict: - computer_dict.pop("_hashes") - if "system_message" in computer_dict: - computer_dict.pop("system_message") - computer_json = json.dumps(computer_dict) - sync_code = f"""import json\ncomputer.load_dict(json.loads('''{computer_json}'''))""" - interpreter.computer.run("python", sync_code) - except Exception as e: - if interpreter.debug: - raise - print(str(e)) - print("Failed to sync iComputer with your Computer. 
Continuing...") - - ## ↓ CODE IS RUN HERE - - for line in interpreter.computer.run(language, code, stream=True): - yield {"role": "computer", **line} + try: + # Extract computer state as a Python dict + result = interpreter.computer.run( + "python", + """ + import json + computer_dict = computer.to_dict() + if '_hashes' in computer_dict: + computer_dict.pop('_hashes') + if "system_message" in computer_dict: + computer_dict.pop("system_message") + print(json.dumps(computer_dict)) + """, + stream=False, + ) + # Process the result only if successful + if result and len(result) > 0: + result_content = result[-1].get("content", "").strip() + if result_content: + try: + computer_dict = json.loads( + result_content.strip('"').strip("'") + ) + interpreter.computer.load_dict(computer_dict) + except json.JSONDecodeError: + if interpreter.debug: + print( + "Failed to parse computer state as JSON" + ) + except Exception as e: + if interpreter.debug: + print(f"Error synchronizing computer state: {str(e)}") + + # Send active_line = None to clear any active line highlighting + yield { + "role": "computer", + "type": "console", + "format": "active_line", + "content": None, + } - ## ↑ CODE IS RUN HERE + except KeyboardInterrupt: + # Handle user interruption gracefully + yield { + "role": "computer", + "type": "console", + "format": "output", + "content": "\n[Code execution interrupted by user]", + } + break - # sync up your computer with the interpreter's computer - try: - if interpreter.sync_computer and language == "python": - # sync up the interpreter's computer with your computer - result = interpreter.computer.run( - "python", - """ - import json - computer_dict = computer.to_dict() - if '_hashes' in computer_dict: - computer_dict.pop('_hashes') - if "system_message" in computer_dict: - computer_dict.pop("system_message") - print(json.dumps(computer_dict)) - """, - ) - result = result[-1]["content"] - interpreter.computer.load_dict( - json.loads(result.strip('"').strip("'")) - ) except Exception as e: - if interpreter.debug: - raise - print(str(e)) - print("Failed to sync your Computer with iComputer. Continuing.") - - # yield final "active_line" message, as if to say, no more code is running. unlightlight active lines - # (is this a good idea? is this our responsibility? i think so — we're saying what line of code is running! ...?) - yield { - "role": "computer", - "type": "console", - "format": "active_line", - "content": None, - } - - except KeyboardInterrupt: - break # It's fine. - except: - yield { - "role": "computer", - "type": "console", - "format": "output", - "content": traceback.format_exc(), - } - - else: - ## LOOP MESSAGE - # This makes it utter specific phrases if it doesn't want to be told to "Proceed." 
+ # Log the exception and return it to the user + error_traceback = traceback.format_exc() + log_performance_metric( + "code_execution", + language, + time.time() - code_execution_start, + { + "code_length": len(code), + "error": str(e), + "error_type": type(e).__name__, + }, + level=1, + ) # Log at critical level - loop_message = interpreter.loop_message - if interpreter.os: - loop_message = loop_message.replace( - "If the entire task I asked for is done,", - "If the entire task I asked for is done, take a screenshot to verify it's complete, or if you've already taken a screenshot and verified it's complete,", - ) - loop_breakers = interpreter.loop_breakers + yield { + "role": "computer", + "type": "console", + "format": "output", + "content": error_traceback, + } - if ( - interpreter.loop - and interpreter.messages - and interpreter.messages[-1].get("role", "") == "assistant" - and not any( - task_status in interpreter.messages[-1].get("content", "") - for task_status in loop_breakers - ) - ): - # Remove past loop_message messages - interpreter.messages = [ - message - for message in interpreter.messages - if message.get("content", "") != loop_message - ] - # Combine adjacent assistant messages, so hopefully it learns to just keep going! - combined_messages = [] - for message in interpreter.messages: - if ( - combined_messages - and message["role"] == "assistant" - and combined_messages[-1]["role"] == "assistant" - and message["type"] == "message" - and combined_messages[-1]["type"] == "message" - ): - combined_messages[-1]["content"] += "\n" + message["content"] - else: - combined_messages.append(message) - interpreter.messages = combined_messages + # Explicitly clear any temp variables to help with memory usage + if "language" in locals(): + del language + if "code" in locals(): + del code + if "result" in locals(): + del result + + else: + ## LOOP MESSAGE + # This makes it utter specific phrases if it doesn't want to be told to "Proceed." 
+ loop_message = interpreter.loop_message + if interpreter.os: + loop_message = loop_message.replace( + "If the entire task I asked for is done,", + "If the entire task I asked for is done, take a screenshot to verify it's complete, or if you've already taken a screenshot and verified it's complete,", + ) + loop_breakers = interpreter.loop_breakers - # Send model the loop_message: - insert_loop_message = True + if ( + interpreter.loop + and interpreter.messages + and interpreter.messages[-1].get("role", "") == "assistant" + and not any( + task_status in interpreter.messages[-1].get("content", "") + for task_status in loop_breakers + ) + ): + # Remove past loop_message messages for cleaner history + interpreter.messages = [ + message + for message in interpreter.messages + if message.get("content", "") != loop_message + ] + + # Combine adjacent assistant messages for better context + with PerformanceTimer("message_processing", "combine_messages"): + combined_messages = [] + for message in interpreter.messages: + if ( + combined_messages + and message.get("role") == "assistant" + and combined_messages[-1].get("role") == "assistant" + and message.get("type") == "message" + and combined_messages[-1].get("type") == "message" + ): + # Combine this message with the previous one + combined_messages[-1][ + "content" + ] += "\n\n" + message.get("content", "") + else: + # Add as a new message + combined_messages.append(message) + + interpreter.messages = combined_messages + + # Send model the loop_message: + insert_loop_message = True + continue - continue + # Doesn't want to run code. We're done! + break - # Doesn't want to run code. We're done! - break + except Exception as e: + # Log any unexpected exceptions + log_performance_metric( + "respond", + "error", + time.time() - overall_start_time, + {"error": str(e), "error_type": type(e).__name__}, + level=1, + ) + raise + finally: + # Log overall performance metrics for the entire response cycle + overall_duration = time.time() - overall_start_time + log_performance_metric( + "respond", + "complete", + overall_duration, + { + "message_count": message_count, + "code_execution_count": code_execution_count, + }, + ) return diff --git a/interpreter/core/utils/performance_logger.py b/interpreter/core/utils/performance_logger.py new file mode 100644 index 0000000000..64add39873 --- /dev/null +++ b/interpreter/core/utils/performance_logger.py @@ -0,0 +1,425 @@ +""" +Performance logging utilities for Open Interpreter. +This module provides functions to track and log performance metrics. 
+""" + +import time +import os +import json +import psutil +import threading +import traceback +from datetime import datetime +from collections import deque + +# Enable/disable performance logging with different verbosity levels +# 0 = disabled, 1 = minimal (critical metrics only), 2 = standard, 3 = verbose +PERFORMANCE_LOGGING_LEVEL = int(os.environ.get("OI_PERFORMANCE_LOGGING_LEVEL", "2")) +PERFORMANCE_LOGGING_ENABLED = PERFORMANCE_LOGGING_LEVEL > 0 + +# Logging configuration +LOG_FILE_PATH = os.environ.get("OI_PERFORMANCE_LOG_PATH", "performance_logs.jsonl") +MAX_LOG_FILE_SIZE = int(os.environ.get("OI_MAX_LOG_FILE_SIZE_MB", "10")) * 1024 * 1024 # 10MB by default +LOG_ROTATION_COUNT = int(os.environ.get("OI_LOG_ROTATION_COUNT", "3")) # Keep 3 log files by default + +# Memory usage tracking with efficient data structure (limited circular buffer) +MAX_MEMORY_SAMPLES = int(os.environ.get("OI_MAX_MEMORY_SAMPLES", "100")) +memory_samples = deque(maxlen=MAX_MEMORY_SAMPLES) + +# In-memory metrics storage for quick analysis +metrics_buffer = deque(maxlen=1000) # Keep last 1000 metrics in memory for analysis + +# Thread-local storage for nested timers +thread_local = threading.local() + +# Write lock to prevent concurrent file writes +log_file_lock = threading.Lock() + +def _should_log_level(level): + """Check if the specified level should be logged based on current configuration""" + return PERFORMANCE_LOGGING_ENABLED and PERFORMANCE_LOGGING_LEVEL >= level + +def _rotate_log_file_if_needed(): + """Rotate log file if it exceeds maximum size""" + if not os.path.exists(LOG_FILE_PATH): + return + + if os.path.getsize(LOG_FILE_PATH) < MAX_LOG_FILE_SIZE: + return + + # Rotate existing log files + for i in range(LOG_ROTATION_COUNT - 1, 0, -1): + src = f"{LOG_FILE_PATH}.{i}" if i > 0 else LOG_FILE_PATH + dst = f"{LOG_FILE_PATH}.{i+1}" + + if os.path.exists(src): + if os.path.exists(dst): + try: + os.remove(dst) + except: + pass + try: + os.rename(src, dst) + except: + pass + + # Create new empty log file + try: + open(LOG_FILE_PATH, 'w').close() + except: + pass + +def get_system_info(): + """Get basic system information for performance context""" + info = {} + try: + info["cpu_count"] = psutil.cpu_count(logical=True) + info["physical_cpu_count"] = psutil.cpu_count(logical=False) + mem = psutil.virtual_memory() + info["total_memory_mb"] = mem.total / (1024 * 1024) + info["available_memory_mb"] = mem.available / (1024 * 1024) + info["memory_percent"] = mem.percent + info["cpu_percent"] = psutil.cpu_percent(interval=0.1) + info["swap_memory_mb"] = psutil.swap_memory().total / (1024 * 1024) + except: + info["error"] = "Failed to get system info" + return info + +def log_performance_metric(category, operation, duration, metadata=None, level=2): + """ + Log a performance metric to the console and optionally to a file. 
+ + Args: + category (str): Category of the operation (e.g., 'llm', 'message_processing') + operation (str): Name of the operation being measured + duration (float): Duration of the operation in seconds + metadata (dict, optional): Additional metadata about the operation + level (int): Logging level (1=critical, 2=standard, 3=verbose) + """ + if not _should_log_level(level): + return + + try: + timestamp = datetime.now().isoformat() + + # Get current memory usage (only for standard+ logging) + memory_mb = 0 + if _should_log_level(2): + try: + process = psutil.Process() + memory_info = process.memory_info() + memory_mb = memory_info.rss / 1024 / 1024 + + # Store memory sample + memory_samples.append((timestamp, memory_mb)) + except: + pass + + # Create log entry + log_entry = { + "timestamp": timestamp, + "category": category, + "operation": operation, + "duration_seconds": round(duration, 4), + "memory_mb": round(memory_mb, 2), + "metadata": metadata or {} + } + + # Add to in-memory buffer + metrics_buffer.append(log_entry) + + # Print to console based on verbosity + if level == 1 or (level == 2 and duration > 0.5) or PERFORMANCE_LOGGING_LEVEL >= 3: + print(f"[PERFORMANCE] {category}.{operation}: {duration:.4f}s" + + (f", Memory: {memory_mb:.2f}MB" if _should_log_level(2) else "")) + + # Write to log file (thread-safe) + if _should_log_level(2): + with log_file_lock: + try: + _rotate_log_file_if_needed() + with open(LOG_FILE_PATH, "a") as f: + f.write(json.dumps(log_entry) + "\n") + except Exception as e: + if _should_log_level(3): + print(f"[PERFORMANCE] Failed to write to log file: {e}") + except Exception as e: + if _should_log_level(3): + print(f"[PERFORMANCE] Error in logging: {str(e)}") + +class PerformanceTimer: + """ + Context manager for timing code blocks with support for nested timers + and hierarchical performance tracking. + """ + + def __init__(self, category, operation, metadata=None, level=2): + self.category = category + self.operation = operation + self.metadata = metadata or {} + self.level = level + self.start_time = None + self.parent_timer = None + self.depth = 0 + + def __enter__(self): + # Track timer start time + self.start_time = time.time() + + # Handle nested timers + if not hasattr(thread_local, "current_timer"): + thread_local.current_timer = None + + self.parent_timer = thread_local.current_timer + if self.parent_timer: + self.depth = self.parent_timer.depth + 1 + # Add thread_id to metadata for debugging nested timers + self.metadata["parent"] = f"{self.parent_timer.category}.{self.parent_timer.operation}" + self.metadata["depth"] = self.depth + + # Set this as the current timer + thread_local.current_timer = self + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.start_time is not None: + # Calculate duration + duration = time.time() - self.start_time + + # Add exception info if applicable + if exc_type: + self.metadata["exception"] = { + "type": exc_type.__name__, + "message": str(exc_val), + } + # Higher logging level for errors + log_level = 1 + else: + log_level = self.level + + # Log the performance metric + log_performance_metric( + self.category, + self.operation, + duration, + self.metadata, + level=log_level + ) + + # Restore parent timer + thread_local.current_timer = self.parent_timer + +def get_memory_trend(): + """ + Analyze memory usage trend and return a summary. 
+ + Returns: + dict: Memory trend summary + """ + if not _should_log_level(2) or len(memory_samples) < 2: + return {"trend": "insufficient_data"} + + first_sample = memory_samples[0][1] # Memory value from first sample + last_sample = memory_samples[-1][1] # Memory value from last sample + + # Calculate memory growth + memory_growth = last_sample - first_sample + + # Check if memory growth is significant (>1MB) + significant = abs(memory_growth) > 1.0 + + # Calculate rate of change (MB/minute) + first_time = datetime.fromisoformat(memory_samples[0][0]) + last_time = datetime.fromisoformat(memory_samples[-1][0]) + time_diff_seconds = (last_time - first_time).total_seconds() + + if time_diff_seconds > 0: + memory_growth_rate = (memory_growth / time_diff_seconds) * 60 # MB/minute + else: + memory_growth_rate = 0 + + # Get min and max memory + memory_values = [sample[1] for sample in memory_samples] + min_memory = min(memory_values) + max_memory = max(memory_values) + + return { + "initial_memory_mb": round(first_sample, 2), + "current_memory_mb": round(last_sample, 2), + "min_memory_mb": round(min_memory, 2), + "max_memory_mb": round(max_memory, 2), + "total_growth_mb": round(memory_growth, 2), + "growth_rate_mb_per_minute": round(memory_growth_rate, 2), + "num_samples": len(memory_samples), + "time_window_seconds": round(time_diff_seconds, 1), + "trend": "stable" if not significant else "increasing" if memory_growth > 0 else "decreasing" + } + +def log_message_stats(messages): + """ + Log statistics about the message history. + + Args: + messages (list): List of message objects + """ + if not _should_log_level(2) or not messages: + return {} + + try: + total_messages = len(messages) + message_types = {} + role_counts = {} + total_content_length = 0 + max_message_size = 0 + code_blocks_count = 0 + console_output_count = 0 + image_count = 0 + + # Loop through messages just once for efficiency + for msg in messages: + # Count by message type + msg_type = msg.get("type", "unknown") + message_types[msg_type] = message_types.get(msg_type, 0) + 1 + + # Count by role + role = msg.get("role", "unknown") + role_counts[role] = role_counts.get(role, 0) + 1 + + # Track special message types + if msg_type == "code": + code_blocks_count += 1 + elif msg_type == "console" and msg.get("format") == "output": + console_output_count += 1 + elif msg_type == "image": + image_count += 1 + + # Calculate content length + content = msg.get("content", "") + if isinstance(content, str): + content_len = len(content) + total_content_length += content_len + max_message_size = max(max_message_size, content_len) + + stats = { + "total_messages": total_messages, + "message_types": message_types, + "role_counts": role_counts, + "total_content_length": total_content_length, + "max_message_size": max_message_size, + "avg_message_size": round(total_content_length / total_messages, 2), + "code_blocks_count": code_blocks_count, + "console_output_count": console_output_count, + "image_count": image_count, + } + + # Only log if we have enough context to be useful + if total_messages > 1: + log_performance_metric("messages", "stats", 0, stats, level=2) + + return stats + except Exception as e: + if _should_log_level(3): + print(f"[PERFORMANCE] Error in message stats logging: {str(e)}") + return {} + +def get_hotspots(threshold_seconds=0.5, top_n=5): + """ + Identify performance hotspots based on recent metrics. 
+ + Args: + threshold_seconds (float): Minimum duration to consider as a hotspot + top_n (int): Number of top hotspots to return + + Returns: + list: Top N performance hotspots + """ + if not metrics_buffer: + return [] + + # Group by category and operation + operation_stats = {} + for metric in metrics_buffer: + key = f"{metric['category']}.{metric['operation']}" + if key not in operation_stats: + operation_stats[key] = { + "category": metric["category"], + "operation": metric["operation"], + "count": 0, + "total_duration": 0, + "max_duration": 0, + "min_duration": float('inf'), + "avg_duration": 0 + } + + stats = operation_stats[key] + duration = metric["duration_seconds"] + + stats["count"] += 1 + stats["total_duration"] += duration + stats["max_duration"] = max(stats["max_duration"], duration) + stats["min_duration"] = min(stats["min_duration"], duration) + stats["avg_duration"] = stats["total_duration"] / stats["count"] + + # Filter by threshold and sort by average duration + hotspots = [ + stats for stats in operation_stats.values() + if stats["avg_duration"] >= threshold_seconds + ] + + hotspots.sort(key=lambda x: x["avg_duration"], reverse=True) + + # Return top N hotspots + return hotspots[:top_n] + +def report_performance_summary(): + """ + Generate a comprehensive performance report. + + Returns: + dict: Performance summary + """ + if not _should_log_level(1): + return {"enabled": False} + + try: + # Get memory trend + mem_trend = get_memory_trend() + + # Get system info + sys_info = get_system_info() + + # Get hotspots + hotspots = get_hotspots() + + # Build summary + summary = { + "timestamp": datetime.now().isoformat(), + "memory": mem_trend, + "system": sys_info, + "hotspots": hotspots, + "metrics_count": len(metrics_buffer), + "enabled": True, + "level": PERFORMANCE_LOGGING_LEVEL + } + + if _should_log_level(2): + summary["log_file"] = { + "path": os.path.abspath(LOG_FILE_PATH), + "size_mb": round(os.path.getsize(LOG_FILE_PATH) / (1024 * 1024), 2) if os.path.exists(LOG_FILE_PATH) else 0, + "max_size_mb": MAX_LOG_FILE_SIZE / (1024 * 1024), + "rotation_count": LOG_ROTATION_COUNT + } + + # Log summary + if _should_log_level(3): + print(f"[PERFORMANCE] Summary: {json.dumps(summary, indent=2)}") + + return summary + except Exception as e: + error_info = { + "error": str(e), + "traceback": traceback.format_exc() + } + if _should_log_level(3): + print(f"[PERFORMANCE] Error generating summary: {str(e)}") + return {"error": error_info, "enabled": True} \ No newline at end of file diff --git a/scripts/wtf.py b/scripts/wtf.py index 488e56c460..2183c7f648 100644 --- a/scripts/wtf.py +++ b/scripts/wtf.py @@ -10,6 +10,8 @@ import subprocess import sys import time +import psutil +import threading import platformdirs import pyperclip @@ -26,6 +28,17 @@ os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" import litellm +# Performance monitoring +def get_system_info(): + info = {} + info["os"] = platform.system() + info["cwd"] = os.getcwd() + info["shell"] = os.environ.get('SHELL', '') + info["cpu_percent"] = psutil.cpu_percent(interval=0.1) + info["memory_percent"] = psutil.virtual_memory().percent + info["python_version"] = platform.python_version() + return info + # Define system messages SYSTEM_MESSAGE = f""" You are a fast, efficient terminal assistant. Your task is to: @@ -51,422 +64,166 @@ - The error may be as simple as a spelling error, or as complex as requiring tests to be run, or code to be find-and-replaced. - Prioritize speed and conciseness in your response. 
Don't use markdown headings. Don't say more than a sentence or two. Be incredibly concise. -User's System: {platform.system()} -CWD: {os.getcwd()} -{"Shell: " + os.environ.get('SHELL') if os.environ.get('SHELL') else ''} - -""" - -CUSTOM_MESSAGE_SYSTEM_MESSAGE = f""" - -You are a fast, efficient AI assistant for terminal and coding tasks. When summoned, you will: - -1. Review the provided terminal history (which may or may not be relevant) and final user query. -2. Determine the most appropriate solution or debugging step to resolve the user's final query. -3. Respond with a brief explanation and a single shell command in a markdown code block. - -Rules: -- Provide one logical command (use \ or ^ for multiline). -- Keep explanations concise and place them before the code block. -- Use proper command escaping (e.g., sed with correct quotes). -- Avoid comments in the code block. -- If more info is needed, provide a command to gather it (e.g., grep). -- Focus on the user's FINAL query and ADDRESS NOTHING ELSE, using terminal history for context if relevant. -- For multi-step solutions, explain briefly and provide the first or combined command. -- Prioritize addressing the user's specific request (at the END, after "wtf") efficiently. - -User's System: {platform.system()} -CWD: {os.getcwd()} -{"Shell: " + os.environ.get('SHELL') if os.environ.get('SHELL') else ''} - -""" - -LOCAL_SYSTEM_MESSAGE = f""" -You're a fast AI assistant for terminal issues. You must: - -1. Scan terminal history -2. Identify latest error -3. Determine best solution -4. Reply with brief explanation + single shell command in markdown - -Rules: -- One logical command (use \ or ^ for multiline) -- Explain briefly, then provide command -- No comments in code -- Proper escaping (e.g., sed with correct quotes) -- If unsure, get more info with a command like grep -- Prioritize speed and conciseness - -Example response: - -We need to fix the file permissions on config.yml. 
-```bash -chmod 644 config.yml -``` +{get_system_info()} -User's System: {platform.system()} -CWD: {os.getcwd()} -{"Shell: " + os.environ.get('SHELL') if os.environ.get('SHELL') else ''} - -Now, it's your turn: """ - -def main(): - ### GET OPTIONAL CUSTOM MESSAGE - - custom_message = None - if len(sys.argv) > 1: - custom_message = "wtf " + " ".join(sys.argv[1:]) - - ### GET TERMINAL HISTORY - - keyboard = Controller() - history = None - - ## SELECT ALL AND COPY METHOD - - if True: - # Save clipboard - clipboard = pyperclip.paste() - - # Select all text - shortcut_key = Key.cmd if platform.system() == "Darwin" else Key.ctrl - with keyboard.pressed(shortcut_key): - keyboard.press("a") - keyboard.release("a") - - # Copy selected text - with keyboard.pressed(shortcut_key): - keyboard.press("c") - keyboard.release("c") - - # Deselect - keyboard.press(Key.backspace) - keyboard.release(Key.backspace) - - # Wait for the clipboard to update - time.sleep(0.1) - - # Get terminal history from clipboard - history = pyperclip.paste() - - # Reset clipboard to stored one - pyperclip.copy(clipboard) - - ## OCR SCREENSHOT METHOD - - if not history: - try: - import pytesseract - from PIL import ImageGrab - - # Get active window coordinates using platform-specific methods - platform_name = platform.system() - if platform_name == "Windows": - import win32gui - - window = win32gui.GetForegroundWindow() - left, top, right, bottom = win32gui.GetWindowRect(window) - elif platform_name == "Darwin": - from Quartz import ( - CGWindowListCopyWindowInfo, - kCGNullWindowID, - kCGWindowListOptionOnScreenOnly, - ) - - window_info = CGWindowListCopyWindowInfo( - kCGWindowListOptionOnScreenOnly, kCGNullWindowID - ) - for window in window_info: - if window["kCGWindowLayer"] == 0: - window_geometry = window["kCGWindowBounds"] - left = window_geometry["X"] - top = window_geometry["Y"] - right = int(left + window_geometry["Width"]) - bottom = int(top + window_geometry["Height"]) - break - else: # Assume it's a Linux-based system - root = subprocess.Popen( - ["xprop", "-root", "_NET_ACTIVE_WINDOW"], stdout=subprocess.PIPE +# Function to capture terminal history +def get_terminal_history(): + os_name = platform.system() + history = "" + + try: + if os_name == "Linux" or os_name == "Darwin": # macOS or Linux + shell = os.environ.get('SHELL', '') + if 'zsh' in shell: + history_path = os.path.expanduser('~/.zsh_history') + if os.path.exists(history_path): + with open(history_path, 'r', encoding='utf-8', errors='ignore') as f: + lines = f.readlines() + # Get the last 20 commands or fewer if history is shorter + history = ''.join(lines[-20:]) if lines else "" + else: # Default to bash + history_path = os.path.expanduser('~/.bash_history') + if os.path.exists(history_path): + with open(history_path, 'r', encoding='utf-8', errors='ignore') as f: + lines = f.readlines() + history = ''.join(lines[-20:]) if lines else "" + + elif os_name == "Windows": + # PowerShell history + try: + result = subprocess.run( + ["powershell", "-Command", "Get-History | Select-Object -Last 20 | Format-Table -Property CommandLine -HideTableHeaders"], + capture_output=True, + text=True, + check=True ) - stdout, stderr = root.communicate() - m = re.search(b"^_NET_ACTIVE_WINDOW.* ([\\w]+)$", stdout) - if m is not None: - window_id = m.group(1) - window = subprocess.Popen( - ["xwininfo", "-id", window_id], stdout=subprocess.PIPE - ) - stdout, stderr = window.communicate() - match = re.search( - rb"Absolute upper-left X:\s*(\d+).*Absolute upper-left 
Y:\s*(\d+).*Width:\s*(\d+).*Height:\s*(\d+)", - stdout, - re.DOTALL, - ) - if match is not None: - left, top, width, height = map(int, match.groups()) - right = left + width - bottom = top + height - - # spinner.stop() - # print("\nPermission to capture terminal commands via screenshot -> OCR?") - # permission = input("(y/n) > ") - # print("") - # if permission.lower() != 'y': - # print("Exiting...") - # exit() - # spinner.start() - - # Take screenshot of the active window - screenshot = ImageGrab.grab( - bbox=(int(left), int(top), int(right), int(bottom)) - ) - - # OCR the screenshot to get the text - text = pytesseract.image_to_string(screenshot) - - history = text - - if "wtf" in history: - last_wtf_index = history.rindex("wtf") - history = history[:last_wtf_index] - except ImportError: - spinner.stop() - print( - "To use OCR to capture terminal output (recommended) run `pip install pytesseract` or `pip3 install pytesseract`." - ) - spinner.start() - - ## TERMINAL HISTORY METHOD - - if not history: + history = result.stdout + except: + # If PowerShell history fails, try to get command history another way + pass + + # Add current directory contents for context try: - shell = os.environ.get("SHELL", "/bin/bash") - command = [shell, "-ic", "fc -ln -10"] # Get just the last command - - output = subprocess.check_output(command, stderr=subprocess.STDOUT).decode( - "utf-8" - ) - - # Split the output into lines - lines = output.strip().split("\n") - - # Filter out lines that look like the "saving session" message - history = [ - line - for line in lines - if not line.startswith("...") - and "saving" not in line - and "Saving session..." not in line - ] - history = [l.strip() for l in history if l.strip()][-10:] - - # Split the history into individual commands - - # Get the last command - last_command = history[-1] - spinner.start() - print( - f"\nRunning the last command again to collect its output: {last_command}\n" - ) - spinner.stop() - # Run the last command and collect its output + dir_contents = subprocess.run(["ls" if os_name != "Windows" else "dir"], capture_output=True, text=True, shell=True) + history += "\n\nCurrent directory contents:\n" + dir_contents.stdout + except: + pass + + return history + + except Exception as e: + return f"Error retrieving terminal history: {str(e)}" + +# Performance-optimized function to get LLM response +def get_llm_response(terminal_history): + start_time = time.time() + + try: + # Setup the parameters with optimal settings for fast response + params = { + "model": "gpt-3.5-turbo", # Use a faster model for immediate help + "messages": [ + {"role": "system", "content": SYSTEM_MESSAGE}, + {"role": "user", "content": f"Terminal history:\n\n{terminal_history}\n\nPlease identify and fix the issue."} + ], + "temperature": 0.3, # Lower temperature for more precise responses + "max_tokens": 300, # Limit token count for faster response + "timeout": 10 # Set a reasonable timeout + } + + # Thread for managing the LLM call with a timeout + response_data = {"content": None, "error": None} + + def call_llm(): try: - last_command_output = subprocess.check_output( - last_command, shell=True, stderr=subprocess.STDOUT - ).decode("utf-8") - except subprocess.CalledProcessError as e: - last_command_output = e.output.decode("utf-8") + for chunk in litellm.completion(**params): + if "content" in chunk.choices[0].delta: + if response_data["content"] is None: + response_data["content"] = chunk.choices[0].delta.content + else: + response_data["content"] += 
chunk.choices[0].delta.content except Exception as e: - last_command_output = str(e) - - # Format the history - history = "The user tried to run the following commands:\n" + "\n".join( - history - ) - history += f"\nThe last command, {last_command}, resulted in this output:\n{last_command_output}" - - except Exception as e: - raise - print( - "Failed to retrieve and run the last command from terminal history. Exiting." - ) - return - - # Trim history - history = history[-9000:].strip() - - # Remove any trailing spinner commands - spinner_commands = [ - "⠴", - "⠦", - "⠇", - "⠉", - "⠙", - "⠸", - "⠼", - "⠤", - "⠴", - "⠂", - "⠄", - "⠈", - "⠐", - "⠠", - ] - for command in spinner_commands: - if history.endswith(command): - history = history[: -len(command)].strip() - break - - if "wtf" in history: - last_wtf_index = history.rindex("wtf") - history = history[:last_wtf_index] - - ### GET ERROR CONTEXT - - # Regex pattern to extract filename and line number - pattern = r'File "([^"]+)", line (\d+)' - matches = re.findall(pattern, history) - - # Only keep the last X matches - matches = matches[-1:] # Just the last match, change -1 to get more - - # Function to get specified lines from a file - def get_lines_from_file(filename, line_number): - lines = [] - try: - with open(filename, "r") as file: - all_lines = file.readlines() - start_line = max(0, line_number - 3) # Preceding lines - end_line = min(len(all_lines), line_number + 2) # Following lines - for i in range(start_line, end_line + 1): - lines.append(f"Line {i+1}: " + all_lines[i].rstrip()) - except Exception as e: - lines.append(f"Error reading file: {e}") - return lines - - # Create the dictionary with filename, line number, and text - result = [] - for match in matches: - filename, line_number = match - line_number = int(line_number) - lines = get_lines_from_file(filename, line_number) - result.append({"filename": filename, "text": "\n".join(lines)}) - - if result != []: - history = "Terminal: " + history - - # Add context - for entry in result: - history = f"""File: {entry["filename"]}\n{entry["text"]}\n\n""" + history - - ### PREPARE FOR LLM - - # Get LLM model from profile - default_profile_path = os.path.join( - platformdirs.user_config_dir("open-interpreter"), "profiles", "default.yaml" - ) - + response_data["error"] = str(e) + + # Start the LLM call in a separate thread + llm_thread = threading.Thread(target=call_llm) + llm_thread.daemon = True + llm_thread.start() + + # Wait for the thread to complete with a timeout + llm_thread.join(timeout=10) + + if llm_thread.is_alive(): + # LLM call is taking too long + return "Response timed out. Please try again or check your network connection." + + if response_data["error"]: + # Handle error case + fallback_message = "Error getting solution. Check your connection and API key." + # Try a simple offline analysis if API fails + try: + # Look for common error patterns + error_patterns = [ + (r'command not found', "Command not found. Check if the package is installed."), + (r'permission denied', "Permission issue. Try using sudo for the command."), + (r'No such file or directory', "File not found. Check path and filename."), + (r'syntax error', "Syntax error in your command. 
Check brackets and quotes.") + ] + + for pattern, message in error_patterns: + if re.search(pattern, terminal_history, re.IGNORECASE): + return message + + return fallback_message + except: + return fallback_message + + return response_data["content"] + + except Exception as e: + return f"Error: {str(e)}" + finally: + elapsed = time.time() - start_time + print(f"Response generated in {elapsed:.2f} seconds") + +# Main function +def main(): try: - with open(default_profile_path, "r") as file: - profile = yaml.safe_load(file) - wtf_model = profile.get("wtf", {}).get("model") - if wtf_model: - model = wtf_model - else: - model = profile.get("llm", {}).get("model", "gpt-4o-mini") - except: - model = "gpt-4o-mini" - - # If they're using a local model (improve this heuristic) use the LOCAL_SYSTEM_MESSAGE - if "ollama" in model or "llama" in model: - system_message = LOCAL_SYSTEM_MESSAGE - else: - system_message = SYSTEM_MESSAGE - - # If they passed in a custom message, use the CUSTOM_MESSAGE_SYSTEM_MESSAGE - if custom_message: - system_message = CUSTOM_MESSAGE_SYSTEM_MESSAGE - user_message = ( - "Answer THIS question by telling me what code to run:" - + custom_message - + "\nMy terminal history, which may or may not be relevant, is this:\n---\n" - + history - + "\n---\nPlease now answer THIS question by telling me what code to run: " - + custom_message - ) - else: - user_message = ( - history.strip() - + "\n\nPlease help me resolve the final error above by telling me what code to run in a markdown code block." - ) - - # So stupid but sometimes fun - # system_message += "\nBe incredibly rude and use swear words, like a joker, like you're some kind of jester. You can say 'Fuck' and 'Fucking'. This is for adults. Be funny. USE ALL LOWER CASE. Act like you're an aggreived pair programmer to the idiot that got themselves in this situation." 
- - messages = [ - {"role": "system", "content": system_message.strip()}, - {"role": "user", "content": user_message.strip()}, - ] - - # DEBUGGING: PRINT MESSAGES - - # print("---") - # import pprint - # pprint.pprint(messages) - # print("---") - # time.sleep(100) - - ### PARSE LLM RESPONSE - - in_code = False - backtick_count = 0 - language_buffer = "" - started = False - - for chunk in litellm.completion( - model=model, messages=messages, temperature=0, stream=True - ): - if not started: - started = True - spinner.stop() - print("") - - content = chunk.choices[0].delta.content - if content: - for char in content: - if char == "`": - backtick_count += 1 - if backtick_count == 3: - in_code = not in_code - backtick_count = 0 - language_buffer = "" - if not in_code: # We've just exited a code block - time.sleep(0.1) - print("\n") - return # Exit after typing the command - else: # Entered code block - print("Press `enter` to run: ", end="", flush=True) - elif in_code: - if language_buffer is not None: - if char.isalnum(): - language_buffer += char - elif char.isspace(): - language_buffer = None - elif char not in ["\n", "\\"]: - keyboard.type(char) - else: - if backtick_count: - print("`" * backtick_count, end="", flush=True) - backtick_count = 0 - - # if "\n" in char: - # char.replace("\n", "\n ") - - print(char, end="", flush=True) - - backtick_count = 0 - + # Get terminal history + terminal_history = get_terminal_history() + + # Check for any user-provided context + if len(sys.argv) > 1: + user_context = ' '.join(sys.argv[1:]) + terminal_history += f"\n\nAdditional context from user: {user_context}" + + # Get solution from LLM + solution = get_llm_response(terminal_history) + + # Stop the spinner + spinner.stop() + + # Display the solution + print("\n", solution.strip(), "\n") + + # Extract code block if present for easy copying + code_block_match = re.search(r'```(?:bash|shell|sh|cmd|powershell)?\n(.*?)\n```', solution, re.DOTALL) + if code_block_match: + command = code_block_match.group(1).strip() + # Copy command to clipboard for convenience + try: + pyperclip.copy(command) + print("\nCommand copied to clipboard. Press Ctrl+V to paste it.") + except: + pass + except KeyboardInterrupt: + spinner.stop() + print("\nOperation cancelled by user.") + except Exception as e: + spinner.stop() + print(f"\nAn error occurred: {str(e)}") if __name__ == "__main__": main()
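The new `main()` pulls the first fenced command out of the model's reply before copying it to the clipboard. A small sketch of that extraction step, using the same regular expression as the patch; `extract_first_command`, `FENCE`, and `sample` are illustrative names, not part of the patch.

```python
import re

FENCE = "`" * 3  # three backticks, spelled out so the example stays readable inside the diff

def extract_first_command(solution_text):
    """Return the first fenced shell command in the reply, or None for plain prose.
    Mirrors the clipboard-extraction regex used in main()."""
    if not solution_text:  # get_llm_response may return None on an empty response
        return None
    pattern = FENCE + r"(?:bash|shell|sh|cmd|powershell)?\n(.*?)\n" + FENCE
    match = re.search(pattern, solution_text, re.DOTALL)
    return match.group(1).strip() if match else None

sample = "Permissions look wrong.\n" + FENCE + "bash\nchmod 644 config.yml\n" + FENCE
print(extract_first_command(sample))         # -> chmod 644 config.yml
print(extract_first_command("plain prose"))  # -> None
```

Returning None for prose-only replies keeps the clipboard step optional, which matches the try/except guard around `pyperclip.copy` in the new `main()`.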