
Commit c6eea98 (parent: 655ce2e)

fix BerriAI#9692. Keep cache key stable during mutation

A) Return a copy from strict-key removal, so that removing keys does not break cache keys.
B) Fix an issue in the existing cache-key stabilizer, which did not store a stable key across request/response when the request had no litellm_params.
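For context, a minimal sketch of the failure mode being fixed (the hashing below is an illustration, not litellm's actual key derivation): if the cache key is recomputed from the request kwargs at response time, any in-place mutation of those kwargs in between yields a different key than the one the entry was stored under.

import hashlib
import json

def naive_cache_key(**kwargs) -> str:
    # derive the key by hashing the serialized kwargs; any mutation changes the digest
    return hashlib.sha256(json.dumps(kwargs, sort_keys=True).encode()).hexdigest()

kwargs = {"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Hello, world!"}]}
key_at_request_time = naive_cache_key(**kwargs)

# e.g. a provider transform edits the messages in place while the request is handled
kwargs["messages"][0]["content"] = "mutated"
key_at_response_time = naive_cache_key(**kwargs)

assert key_at_request_time != key_at_response_time  # the stored entry can no longer be found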

5 files changed: +150 -10 lines changed
litellm/caching/caching.py (+6, -4)

@@ -330,10 +330,9 @@ def _get_preset_cache_key_from_kwargs(self, **kwargs) -> Optional[str]:
         """
         Get the preset cache key from kwargs["litellm_params"]
 
-        We use _get_preset_cache_keys for two reasons
+        Set after the cache key is first calculated, so it does not change between request and response time
+        in case the implementation mutates the original objects (this also avoids duplicate key calculations)
 
-        1. optional params like max_tokens, get transformed for bedrock -> max_new_tokens
-        2. avoid doing duplicate / repeated work
         """
         if kwargs:
             if "litellm_params" in kwargs:

@@ -346,7 +345,10 @@ def _set_preset_cache_key_in_kwargs(self, preset_cache_key: str, **kwargs) -> None:
 
         This is used to avoid doing duplicate / repeated work
 
-        Placed in kwargs["litellm_params"]
+        Placed in kwargs["litellm_params"].
+
+        Note! The request must already have a `litellm_params` key for this to work,
+        since mutating the **kwargs splat object here does not mutate the caller's original object.
         """
         if kwargs:
             if "litellm_params" in kwargs:

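To make the mechanism these docstrings describe concrete, here is a rough sketch of the preset-key pattern (names simplified; this is not the actual Cache implementation): the first computed key is stashed in kwargs["litellm_params"], and later lookups return the stored value, so in-place mutation of the request no longer changes the key between request and response time.

from typing import Any, Callable, Dict, Optional

def get_or_set_preset_cache_key(compute_key: Callable[..., str], **kwargs: Any) -> str:
    litellm_params: Optional[Dict[str, Any]] = kwargs.get("litellm_params")
    preset = (litellm_params or {}).get("preset_cache_key")
    if preset is not None:
        return preset  # reuse the key computed at request time
    key = compute_key(**kwargs)
    if litellm_params is not None:
        # storing the key only works when the caller passed a litellm_params dict:
        # mutating the **kwargs splat itself is invisible to the caller, which is
        # why wrapper_async (below) now seeds kwargs["litellm_params"] = {}
        litellm_params["preset_cache_key"] = key
    return key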
litellm/utils.py (+20, -6)

@@ -1251,6 +1251,10 @@ async def wrapper_async(*args, **kwargs): # noqa: PLR0915
         if "litellm_call_id" not in kwargs:
             kwargs["litellm_call_id"] = str(uuid.uuid4())
 
+        # set up litellm_params, so that keys can be added (e.g. for tracking cache keys)
+        if "litellm_params" not in kwargs:
+            kwargs["litellm_params"] = {}
+
         model: Optional[str] = args[0] if len(args) > 0 else kwargs.get("model", None)
         is_completion_with_fallbacks = kwargs.get("fallbacks") is not None
 

@@ -2770,23 +2774,33 @@ def _remove_additional_properties(schema):
 
 def _remove_strict_from_schema(schema):
     """
-    Relevant Issues: https://github.com/BerriAI/litellm/issues/6136, https://github.com/BerriAI/litellm/issues/6088
+    Recursively removes 'strict' from schema. Returns a copy, so as not to break cache keys (callers should use the return value).
+
+    Relevant Issues: https://github.com/BerriAI/litellm/issues/6136, https://github.com/BerriAI/litellm/issues/6088
     """
+    maybe_copy = None  # make a copy to not break cache keys https://github.com/BerriAI/litellm/issues/9692
    if isinstance(schema, dict):
         # Remove the 'additionalProperties' key if it exists and is set to False
         if "strict" in schema:
-            del schema["strict"]
+            maybe_copy = schema.copy()
+            del maybe_copy["strict"]
 
         # Recursively process all dictionary values
         for key, value in schema.items():
-            _remove_strict_from_schema(value)
+            result = _remove_strict_from_schema(value)
+            if result is not value:
+                maybe_copy = maybe_copy or schema.copy()
+                maybe_copy[key] = result
 
     elif isinstance(schema, list):
         # Recursively process all items in the list
-        for item in schema:
-            _remove_strict_from_schema(item)
+        for i, item in enumerate(schema):
+            result = _remove_strict_from_schema(item)
+            if result is not item:
+                maybe_copy = maybe_copy or list(schema)
+                maybe_copy[i] = result
 
-    return schema
+    return maybe_copy or schema
 
 
 def _remove_unsupported_params(

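A quick demonstration of the copy-on-write semantics above (assuming a litellm checkout that includes this commit): the input schema is left untouched, and subtrees that contained no 'strict' key are shared with the copy rather than duplicated.

from litellm.utils import _remove_strict_from_schema

schema = {
    "type": "object",
    "strict": True,
    "properties": {"name": {"type": "string"}},
}
cleaned = _remove_strict_from_schema(schema)

assert "strict" in schema       # the original is not mutated...
assert "strict" not in cleaned  # ...while the returned copy has the key removed
# the untouched "properties" subtree is shared, not copied
assert cleaned["properties"] is schema["properties"]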
tests/litellm_utils_tests/test_utils.py (+105)

@@ -1,4 +1,5 @@
 import copy
+import json
 import sys
 import time
 from datetime import datetime

@@ -1890,6 +1891,43 @@ async def test_function(**kwargs):
         == "gpt-4o-mini"
     )
 
+@pytest.mark.asyncio
+async def test_cache_key_stability_with_mutation(monkeypatch):
+    from litellm.utils import client
+    import asyncio
+    from litellm.caching import Cache
+
+    # Set up in-memory cache
+    cache = Cache()
+    monkeypatch.setattr(litellm, "cache", cache)
+
+    # Create mock original function
+    mock_original = AsyncMock()
+
+    def side_effect(**kwargs):
+        print(f"kwargs: {kwargs}")
+        return litellm.ModelResponse(
+            model="vertex_ai/gemini-2.0-flash"
+        )
+    mock_original.side_effect = side_effect
+
+    # Apply decorator
+    @client
+    async def acompletion(**kwargs):
+        kwargs["messages"][0]["content"] = "mutated"
+        return await mock_original(**kwargs)
+
+    # Test kwargs
+    test_kwargs = {"model": "vertex_ai/gemini-2.0-flash", "messages": [{"role": "user", "content": "Hello, world!"}]}
+    original_kwargs = copy.deepcopy(test_kwargs)
+
+    # Call decorated function; the second, unmutated call must hit the cache
+    await acompletion(**test_kwargs)
+    await asyncio.sleep(0.01)
+    await acompletion(**original_kwargs)
+
+    mock_original.assert_called_once()
+
 
 def test_dict_to_response_format_helper():
     from litellm.llms.base_llm.base_utils import _dict_to_response_format_helper

@@ -2102,3 +2140,70 @@ def test_get_provider_audio_transcription_config():
     config = ProviderConfigManager.get_provider_audio_transcription_config(
         model="whisper-1", provider=provider
     )
+
+def test_remove_strict_from_schema():
+    from litellm.utils import _remove_strict_from_schema
+
+    schema = {  # not necessarily a realistic JSON schema, just one full of 'strict' keys
+        "$schema": "http://json-schema.org/draft-07/schema#",
+        "type": "object",
+        "strict": True,
+        "definitions": {
+            "address": {
+                "type": "object",
+                "properties": {
+                    "street": {"type": "string"},
+                    "city": {"type": "string"}
+                },
+                "required": ["street", "city"],
+                "strict": True
+            }
+        },
+        "properties": {
+            "name": {
+                "type": "string",
+                "strict": True
+            },
+            "age": {
+                "type": "integer"
+            },
+            "address": {
+                "$ref": "#/definitions/address"
+            },
+            "tags": {
+                "type": "array",
+                "items": {"type": "string"},
+                "strict": True
+            },
+            "contacts": {
+                "type": "array",
+                "items": {
+                    "oneOf": [
+                        {"type": "string"},
+                        {
+                            "type": "array",
+                            "items": {
+                                "type": "object",
+                                "strict": True,
+                                "properties": {
+                                    "value": {"type": "string"}
+                                },
+                                "required": ["value"]
+                            }
+                        }
+                    ],
+                    "strict": True
+                }
+            }
+        }
+    }
+    original_schema = copy.deepcopy(schema)
+    cleaned = _remove_strict_from_schema(schema)
+    assert "strict" not in json.dumps(cleaned)
+    # schema should be unchanged (copy instead of mutate),
+    # otherwise cache keys break
+    # https://github.com/BerriAI/litellm/issues/9692
+    assert cleaned != original_schema
+    assert schema == original_schema
+
+

tests/local_testing/test_caching.py (+1)

@@ -2608,3 +2608,4 @@ def test_caching_with_reasoning_content():
     print(f"response 2: {response_2.model_dump_json(indent=4)}")
     assert response_2._hidden_params["cache_hit"] == True
     assert response_2.choices[0].message.reasoning_content is not None
+

tests/local_testing/test_unit_test_caching.py (+18)

@@ -251,3 +251,21 @@ def test_generate_streaming_content():
     assert chunk_count > 1
 
     print(f"Number of chunks: {chunk_count}")
+
+def test_caching_stable_with_mutation():
+    """
+    Test that the cache key stays stable when kwargs are mutated after the first calculation
+    """
+    litellm.cache = Cache()
+    kwargs = {
+        "model": "gpt-3.5-turbo",
+        "messages": [{"role": "user", "content": "Hello, world!"}],
+        "temperature": 0.7,
+        "litellm_params": {},
+    }
+    cache_key = litellm.cache.get_cache_key(**kwargs)
+
+    # mutate kwargs; the preset key stored in litellm_params keeps the key stable
+    kwargs["temperature"] = 0.8
+    cache_key_2 = litellm.cache.get_cache_key(**kwargs)
+    assert cache_key == cache_key_2
