
Commit 9aa4df7

fix(python): Properly respect max_concurrency for aevaluate (#1613)
Co-authored-by: Bagatur <[email protected]>
Parent commit: 0999d18

File tree

2 files changed: +111 additions, -23 deletions
  python/langsmith/evaluation/_arunner.py
  python/tests/evaluation/test_evaluation.py


python/langsmith/evaluation/_arunner.py

Lines changed: 35 additions & 23 deletions
@@ -778,31 +778,43 @@ async def awith_predictions_and_evaluators(
         """
         evaluators = _resolve_evaluators(evaluators)

-        if not hasattr(self, "_evaluator_executor"):
-            self._evaluator_executor = cf.ThreadPoolExecutor(max_workers=4)
+        if not hasattr(self, "_evaluation_feedback_executor"):
+            self._evaluation_feedback_executor = cf.ThreadPoolExecutor(max_workers=4)
+
+        traceable_target = _ensure_async_traceable(target)
+
+        async def process_example(example: schemas.Example):
+            # Yield the coroutine to be awaited later
+            pred = await _aforward(
+                traceable_target,
+                self._get_example_with_readers(example),
+                self.experiment_name,
+                self._metadata,
+                self.client,
+                _include_attachments(target),
+            )
+            example, run = pred["example"], pred["run"]
+            result = await self._arun_evaluators(
+                evaluators,
+                {
+                    "run": run,
+                    "example": example,
+                    "evaluation_results": {"results": []},
+                },
+                feedback_executor=self._evaluation_feedback_executor,
+            )
+            return result

         async def process_examples():
             """Create a single task per example.

             That task is to run the target function and all the evaluators
             sequentially.
             """
-            async for pred in self._apredict(
-                target,
-                max_concurrency=max_concurrency,
-                include_attachments=_include_attachments(target),
-            ):
-                example, run = pred["example"], pred["run"]
-                result = self._arun_evaluators(
-                    evaluators,
-                    {
-                        "run": run,
-                        "example": example,
-                        "evaluation_results": {"results": []},
-                    },
-                    executor=self._evaluator_executor,
-                )
-                yield result
+            async for example in await self.aget_examples():
+                yield process_example(example)
+
+            await self._aend()

         # Run the per-example tasks with max-concurrency
         # This guarantees that max_concurrency is the upper limit
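The core of the fix is visible in this hunk: each example is wrapped in a single coroutine (process_example) that runs the target and then its evaluators back to back, and process_examples only yields those coroutines; they are awaited later by the concurrency-bounded runner referenced in the trailing comment, so max_concurrency now caps target calls and evaluations together rather than predictions alone. Below is a minimal, self-contained sketch of that pattern using an asyncio.Semaphore; it is an illustration of the idea only, not the langsmith aitertools implementation, and every name in it is a stand-in.

# Minimal sketch (illustration only): bound concurrency over yielded coroutines
# with a semaphore. Not the langsmith aitertools implementation.
import asyncio
from typing import AsyncIterator, Awaitable, TypeVar

T = TypeVar("T")


async def bounded_aiter(
    n: int, coros: AsyncIterator[Awaitable[T]]
) -> AsyncIterator[T]:
    semaphore = asyncio.Semaphore(n)

    async def run(coro: Awaitable[T]) -> T:
        async with semaphore:  # at most n coroutines are awaited at once
            return await coro

    tasks = [asyncio.create_task(run(c)) async for c in coros]
    for task in tasks:  # yield results in input order
        yield await task


async def main() -> None:
    async def process_example(i: int) -> int:
        await asyncio.sleep(0.1)  # stands in for the target call plus evaluators
        return i

    async def process_examples() -> AsyncIterator[Awaitable[int]]:
        for i in range(10):
            yield process_example(i)  # yield the coroutine, don't await it yet

    async for result in bounded_aiter(3, process_examples()):
        print(result)


asyncio.run(main())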
@@ -944,13 +956,13 @@ async def _ascore(
         evaluators: Sequence[RunEvaluator],
         max_concurrency: Optional[int] = None,
     ) -> AsyncIterator[ExperimentResultRow]:
-        with cf.ThreadPoolExecutor(max_workers=4) as executor:
+        with cf.ThreadPoolExecutor(max_workers=4) as feedback_executor:

             async def score_all():
                 async for current_results in self.aget_results():
                     # Yield the coroutine to be awaited later in aiter_with_concurrency
                     yield self._arun_evaluators(
-                        evaluators, current_results, executor=executor
+                        evaluators, current_results, feedback_executor=feedback_executor
                     )

             async for result in aitertools.aiter_with_concurrency(
@@ -962,7 +974,7 @@ async def _arun_evaluators(
         self,
         evaluators: Sequence[RunEvaluator],
         current_results: ExperimentResultRow,
-        executor: cf.ThreadPoolExecutor,
+        feedback_executor: cf.ThreadPoolExecutor,
     ) -> ExperimentResultRow:
         current_context = rh.get_tracing_context()
         metadata = {
@@ -996,7 +1008,7 @@ async def _run_single_evaluator(evaluator: RunEvaluator):

                 if self._upload_results:
                     self.client._log_evaluation_feedback(
-                        evaluator_response, run=run, _executor=executor
+                        evaluator_response, run=run, _executor=feedback_executor
                     )
                 return selected_results
             except Exception as e:
@@ -1019,7 +1031,7 @@ async def _run_single_evaluator(evaluator: RunEvaluator):
                     )
                     if self._upload_results:
                         self.client._log_evaluation_feedback(
-                            error_response, run=run, _executor=feedback_executor
+                            error_response, run=run, _executor=feedback_executor
                         )
                     return selected_results
                 except Exception as e2:
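The remaining hunks are a rename (executor becomes feedback_executor), making it explicit that this thread pool exists only to log evaluation feedback (see the _log_evaluation_feedback calls) without blocking the event loop. As a generic illustration of that pattern, not langsmith's _log_evaluation_feedback, and with a hypothetical blocking post_feedback call standing in for the real upload:

# Sketch (illustration only): offload blocking feedback uploads to a thread pool
# so the asyncio event loop can keep scheduling other evaluations.
import asyncio
import concurrent.futures as cf
import time


def post_feedback(score: float) -> None:
    # Hypothetical stand-in for a blocking HTTP request that uploads feedback.
    time.sleep(0.2)


async def log_feedback(executor: cf.ThreadPoolExecutor, score: float) -> None:
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(executor, post_feedback, score)


async def main() -> None:
    with cf.ThreadPoolExecutor(max_workers=4) as feedback_executor:
        # Four uploads run in the pool concurrently; the event loop is never blocked.
        await asyncio.gather(
            *(log_feedback(feedback_executor, s) for s in (0.1, 0.5, 0.9, 1.0))
        )


asyncio.run(main())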

python/tests/evaluation/test_evaluation.py

Lines changed: 76 additions & 0 deletions
@@ -472,3 +472,79 @@ async def predict(inputs: dict):
         predict,
         data=(_ for _ in range(0)),
     )
+
+
+async def test_aevaluate_large_dataset_and_concurrency():
+    client = Client()
+    _ = client.clone_public_dataset(
+        "https://smith.langchain.com/public/2bbf4a10-c3d5-4868-9e96-400df97fed69/d"
+    )
+    dataset_name = "Evaluate Examples"
+
+    async def mock_chat_completion(*, model, messages):
+        await asyncio.sleep(1)
+        return {
+            "role": "assistant",
+            "content": "Still thinking...",
+        }
+
+    def simulate_conversation_turn(*, existing, model_response):
+        return existing + [
+            model_response,
+            {"role": "human", "content": "Think harder!"},
+        ]
+
+    # Will be traced by default
+    async def target(inputs: dict) -> dict:
+        messages = [
+            {
+                "role": "system",
+                "content": "Come up with a math equation that solves the puzzle.",
+            },
+            # This dataset has inputs as a dict with a "statement" key
+            {"role": "user", "content": "foo"},
+        ]
+        res = await mock_chat_completion(model="gpt-4o-mini", messages=messages)
+        messages = simulate_conversation_turn(existing=messages, model_response=res)
+
+        return {"equation": res}
+
+    async def mock_evaluator_chat_completion(*, model, messages):
+        await asyncio.sleep(2)
+        return {
+            "role": "assistant",
+            "content": str(0.5),
+        }
+
+    async def mock_correctness_evaluator(outputs: dict, reference_outputs: dict):
+        messages = [
+            {"role": "system", "content": "Assign a score to the following output."},
+            {
+                "role": "user",
+                "content": f"""
+                Actual: {outputs["equation"]}
+                """,
+            },
+        ]
+        res = await mock_evaluator_chat_completion(model="o3-mini", messages=messages)
+        return {
+            "key": "correctness",
+            "score": float(res["content"]),
+            "comment": "The answer was a good attempt, but incorrect.",
+        }
+
+    client = Client()
+
+    start = time.time()
+
+    await client.aevaluate(
+        target,
+        data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
+        evaluators=[
+            mock_correctness_evaluator,
+        ],
+        max_concurrency=3,
+    )
+
+    finish_time = time.time()
+    assert (finish_time - start) <= 8.5
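For intuition on the 8.5 s bound: the mocked target sleeps 1 s and the mocked evaluator sleeps 2 s, so each example costs roughly 3 s end to end. Assuming, purely for illustration, six examples in the cloned dataset, max_concurrency=3 processes them in two waves of about 3 s each (roughly 6 s total), whereas a fully serial run of the same examples would take several times longer; the assertion therefore fails unless target calls and evaluations for different examples actually overlap up to the configured limit.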
