Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 6a78ede

Browse files
authored
Improve reactor_tick_time metric (#11724)
The existing implementation of the `python_twisted_reactor_tick_time` metric is pretty useless, because it *only* measures the time taken to execute timed calls and callbacks from threads. That neglects everything that happens off the back of I/O, which is obviously quite a lot for us. To improve this, I've hooked into a different place in the reactor - in particular, where it calls `epoll`. That call is the only place it should wait for something to happen - the rest of the loop *should* be quick. I've also removed `python_twisted_reactor_pending_calls`, because I don't believe anyone ever looks at it, and it's a nuisance to populate.
1 parent 6b241f5 commit 6a78ede

File tree

5 files changed

+86
-109
lines changed

5 files changed

+86
-109
lines changed

changelog.d/11724.misc

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Improve accuracy of `python_twisted_reactor_tick_time` prometheus metric.

changelog.d/11724.removal

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Remove `python_twisted_reactor_pending_calls` prometheus metric.

contrib/prometheus/consoles/synapse.html

-16
Original file line numberDiff line numberDiff line change
@@ -92,22 +92,6 @@ <h3>Average reactor tick time</h3>
9292
})
9393
</script>
9494

95-
<h3>Pending calls per tick</h3>
96-
<div id="reactor_pending_calls"></div>
97-
<script>
98-
new PromConsole.Graph({
99-
node: document.querySelector("#reactor_pending_calls"),
100-
expr: "rate(python_twisted_reactor_pending_calls_sum[30s]) / rate(python_twisted_reactor_pending_calls_count[30s])",
101-
name: "[[job]]-[[index]]",
102-
min: 0,
103-
renderer: "line",
104-
height: 150,
105-
yAxisFormatter: PromConsole.NumberFormatter.humanize,
106-
yHoverFormatter: PromConsole.NumberFormatter.humanize,
107-
yTitle: "Pending Calls"
108-
})
109-
</script>
110-
11195
<h1>Storage</h1>
11296

11397
<h3>Queries</h3>

synapse/metrics/__init__.py

+1-93
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,12 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
import functools
1615
import itertools
1716
import logging
1817
import os
1918
import platform
2019
import threading
21-
import time
2220
from typing import (
23-
Any,
2421
Callable,
2522
Dict,
2623
Generic,
@@ -33,7 +30,6 @@
3330
Type,
3431
TypeVar,
3532
Union,
36-
cast,
3733
)
3834

3935
import attr
@@ -44,11 +40,9 @@
4440
GaugeMetricFamily,
4541
)
4642

47-
from twisted.internet import reactor
48-
from twisted.internet.base import ReactorBase
4943
from twisted.python.threadpool import ThreadPool
5044

51-
import synapse
45+
import synapse.metrics._reactor_metrics
5246
from synapse.metrics._exposition import (
5347
MetricsResource,
5448
generate_latest,
@@ -368,21 +362,6 @@ def collect(self) -> Iterable[Metric]:
368362
REGISTRY.register(CPUMetrics())
369363

370364

371-
#
372-
# Twisted reactor metrics
373-
#
374-
375-
tick_time = Histogram(
376-
"python_twisted_reactor_tick_time",
377-
"Tick time of the Twisted reactor (sec)",
378-
buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5],
379-
)
380-
pending_calls_metric = Histogram(
381-
"python_twisted_reactor_pending_calls",
382-
"Pending calls",
383-
buckets=[1, 2, 5, 10, 25, 50, 100, 250, 500, 1000],
384-
)
385-
386365
#
387366
# Federation Metrics
388367
#
@@ -434,8 +413,6 @@ def collect(self) -> Iterable[Metric]:
434413
" ".join([platform.system(), platform.release()]),
435414
).set(1)
436415

437-
last_ticked = time.time()
438-
439416
# 3PID send info
440417
threepid_send_requests = Histogram(
441418
"synapse_threepid_send_requests_with_tries",
@@ -483,75 +460,6 @@ def register_threadpool(name: str, threadpool: ThreadPool) -> None:
483460
)
484461

485462

486-
class ReactorLastSeenMetric:
487-
def collect(self) -> Iterable[Metric]:
488-
cm = GaugeMetricFamily(
489-
"python_twisted_reactor_last_seen",
490-
"Seconds since the Twisted reactor was last seen",
491-
)
492-
cm.add_metric([], time.time() - last_ticked)
493-
yield cm
494-
495-
496-
REGISTRY.register(ReactorLastSeenMetric())
497-
498-
F = TypeVar("F", bound=Callable[..., Any])
499-
500-
501-
def runUntilCurrentTimer(reactor: ReactorBase, func: F) -> F:
502-
@functools.wraps(func)
503-
def f(*args: Any, **kwargs: Any) -> Any:
504-
now = reactor.seconds()
505-
num_pending = 0
506-
507-
# _newTimedCalls is one long list of *all* pending calls. Below loop
508-
# is based off of impl of reactor.runUntilCurrent
509-
for delayed_call in reactor._newTimedCalls:
510-
if delayed_call.time > now:
511-
break
512-
513-
if delayed_call.delayed_time > 0:
514-
continue
515-
516-
num_pending += 1
517-
518-
num_pending += len(reactor.threadCallQueue)
519-
start = time.time()
520-
ret = func(*args, **kwargs)
521-
end = time.time()
522-
523-
# record the amount of wallclock time spent running pending calls.
524-
# This is a proxy for the actual amount of time between reactor polls,
525-
# since about 25% of time is actually spent running things triggered by
526-
# I/O events, but that is harder to capture without rewriting half the
527-
# reactor.
528-
tick_time.observe(end - start)
529-
pending_calls_metric.observe(num_pending)
530-
531-
# Update the time we last ticked, for the metric to test whether
532-
# Synapse's reactor has frozen
533-
global last_ticked
534-
last_ticked = end
535-
536-
return ret
537-
538-
return cast(F, f)
539-
540-
541-
try:
542-
# Ensure the reactor has all the attributes we expect
543-
reactor.seconds # type: ignore
544-
reactor.runUntilCurrent # type: ignore
545-
reactor._newTimedCalls # type: ignore
546-
reactor.threadCallQueue # type: ignore
547-
548-
# runUntilCurrent is called when we have pending calls. It is called once
549-
# per iteratation after fd polling.
550-
reactor.runUntilCurrent = runUntilCurrentTimer(reactor, reactor.runUntilCurrent) # type: ignore
551-
except AttributeError:
552-
pass
553-
554-
555463
__all__ = [
556464
"MetricsResource",
557465
"generate_latest",

synapse/metrics/_reactor_metrics.py

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# Copyright 2022 The Matrix.org Foundation C.I.C.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import select
16+
import time
17+
from typing import Any, Iterable, List, Tuple
18+
19+
from prometheus_client import Histogram, Metric
20+
from prometheus_client.core import REGISTRY, GaugeMetricFamily
21+
22+
from twisted.internet import reactor
23+
24+
#
25+
# Twisted reactor metrics
26+
#
27+
28+
tick_time = Histogram(
29+
"python_twisted_reactor_tick_time",
30+
"Tick time of the Twisted reactor (sec)",
31+
buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5],
32+
)
33+
34+
35+
class EpollWrapper:
36+
"""a wrapper for an epoll object which records the time between polls"""
37+
38+
def __init__(self, poller: "select.epoll"):
39+
self.last_polled = time.time()
40+
self._poller = poller
41+
42+
def poll(self, *args, **kwargs) -> List[Tuple[int, int]]: # type: ignore[no-untyped-def]
43+
# record the time since poll() was last called. This gives a good proxy for
44+
# how long it takes to run everything in the reactor - ie, how long anything
45+
# waiting for the next tick will have to wait.
46+
tick_time.observe(time.time() - self.last_polled)
47+
48+
ret = self._poller.poll(*args, **kwargs)
49+
50+
self.last_polled = time.time()
51+
return ret
52+
53+
def __getattr__(self, item: str) -> Any:
54+
return getattr(self._poller, item)
55+
56+
57+
class ReactorLastSeenMetric:
58+
def __init__(self, epoll_wrapper: EpollWrapper):
59+
self._epoll_wrapper = epoll_wrapper
60+
61+
def collect(self) -> Iterable[Metric]:
62+
cm = GaugeMetricFamily(
63+
"python_twisted_reactor_last_seen",
64+
"Seconds since the Twisted reactor was last seen",
65+
)
66+
cm.add_metric([], time.time() - self._epoll_wrapper.last_polled)
67+
yield cm
68+
69+
70+
try:
71+
# if the reactor has a `_poller` attribute, which is an `epoll` object
72+
# (ie, it's an EPollReactor), we wrap the `epoll` with a thing that will
73+
# measure the time between ticks
74+
from select import epoll
75+
76+
poller = reactor._poller # type: ignore[attr-defined]
77+
except (AttributeError, ImportError):
78+
pass
79+
else:
80+
if isinstance(poller, epoll):
81+
poller = EpollWrapper(poller)
82+
reactor._poller = poller # type: ignore[attr-defined]
83+
REGISTRY.register(ReactorLastSeenMetric(poller))

0 commit comments

Comments
 (0)