chore(llmobs): fix flaky evaluator runner log test (#13118)

lievan · lievan · web-flow · commit 712dfc2b681e · 2025-04-08T21:16:34.000Z
Increase the interval of the evaluator runner used in the buffer limit test to try to make the test less flaky. Previously the test used the `active_evaluator_runner` fixture, which has an interval of 0.01. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) Co-authored-by: lievan <evan.li@datadoqhq.com>
diff --git a/tests/llmobs/conftest.py b/tests/llmobs/conftest.py
@@ -97,6 +97,7 @@ def mock_writer_logs():
 def mock_evaluator_logs():
     with mock.patch("ddtrace.llmobs._evaluators.runner.logger") as m:
         yield m
+        m.reset_mock()
 
 
 @pytest.fixture
diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py
@@ -11,7 +11,6 @@
 from ddtrace.trace import Span
 from tests.llmobs._utils import DummyEvaluator
 from tests.llmobs._utils import _dummy_evaluator_eval_metric_event
-from tests.utils import flaky
 from tests.utils import override_env
 from tests.utils import override_global_config
 
@@ -31,10 +30,12 @@ def test_evaluator_runner_start(mock_evaluator_logs, active_evaluator_runner):
     mock_evaluator_logs.debug.assert_has_calls([mock.call("started %r", "EvaluatorRunner")])
 
 
-@flaky(1744053478)
-def test_evaluator_runner_buffer_limit(mock_evaluator_logs, active_evaluator_runner):
+def test_evaluator_runner_buffer_limit(mock_evaluator_logs):
+    evaluator_runner = EvaluatorRunner(interval=1, llmobs_service=mock.MagicMock())
+    evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=mock.MagicMock()))
+    evaluator_runner.start()
     for _ in range(1001):
-        active_evaluator_runner.enqueue({}, DUMMY_SPAN)
+        evaluator_runner.enqueue({}, DUMMY_SPAN)
     mock_evaluator_logs.warning.assert_called_with(
         "%r event buffer full (limit is %d), dropping event", "EvaluatorRunner", 1000
     )