Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit be4250c

Browse files
reivilibreDavid Robertson
and
David Robertson
authored
Add experimental configuration option to allow disabling legacy Prometheus metric names. (#13540)
Co-authored-by: David Robertson <[email protected]>
1 parent 2e2040c commit be4250c

File tree

9 files changed

+150
-21
lines changed

9 files changed

+150
-21
lines changed

changelog.d/13540.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add experimental configuration option to allow disabling legacy Prometheus metric names.

synapse/app/_base.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -266,15 +266,48 @@ async def wrapper() -> None:
266266
reactor.callWhenRunning(lambda: defer.ensureDeferred(wrapper()))
267267

268268

269-
def listen_metrics(bind_addresses: Iterable[str], port: int) -> None:
269+
def listen_metrics(
270+
bind_addresses: Iterable[str], port: int, enable_legacy_metric_names: bool
271+
) -> None:
270272
"""
271273
Start Prometheus metrics server.
272274
"""
273-
from synapse.metrics import RegistryProxy, start_http_server
275+
from prometheus_client import start_http_server as start_http_server_prometheus
276+
277+
from synapse.metrics import (
278+
RegistryProxy,
279+
start_http_server as start_http_server_legacy,
280+
)
274281

275282
for host in bind_addresses:
276283
logger.info("Starting metrics listener on %s:%d", host, port)
277-
start_http_server(port, addr=host, registry=RegistryProxy)
284+
if enable_legacy_metric_names:
285+
start_http_server_legacy(port, addr=host, registry=RegistryProxy)
286+
else:
287+
_set_prometheus_client_use_created_metrics(False)
288+
start_http_server_prometheus(port, addr=host, registry=RegistryProxy)
289+
290+
291+
def _set_prometheus_client_use_created_metrics(new_value: bool) -> None:
292+
"""
293+
Sets whether prometheus_client should expose `_created`-suffixed metrics for
294+
all gauges, histograms and summaries.
295+
There is no programmatic way to disable this without poking at internals;
296+
the proper way is to use an environment variable which prometheus_client
297+
loads at import time.
298+
299+
The motivation for disabling these `_created` metrics is that they're
300+
a waste of space as they're not useful but they take up space in Prometheus.
301+
"""
302+
303+
import prometheus_client.metrics
304+
305+
if hasattr(prometheus_client.metrics, "_use_created"):
306+
prometheus_client.metrics._use_created = new_value
307+
else:
308+
logger.error(
309+
"Can't disable `_created` metrics in prometheus_client (brittle hack broken?)"
310+
)
278311

279312

280313
def listen_manhole(

synapse/app/generic_worker.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,11 @@ def start_listening(self) -> None:
412412
"enable_metrics is not True!"
413413
)
414414
else:
415-
_base.listen_metrics(listener.bind_addresses, listener.port)
415+
_base.listen_metrics(
416+
listener.bind_addresses,
417+
listener.port,
418+
enable_legacy_metric_names=self.config.metrics.enable_legacy_metrics,
419+
)
416420
else:
417421
logger.warning("Unsupported listener type: %s", listener.type)
418422

synapse/app/homeserver.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,11 @@ def start_listening(self) -> None:
307307
"enable_metrics is not True!"
308308
)
309309
else:
310-
_base.listen_metrics(listener.bind_addresses, listener.port)
310+
_base.listen_metrics(
311+
listener.bind_addresses,
312+
listener.port,
313+
enable_legacy_metric_names=self.config.metrics.enable_legacy_metrics,
314+
)
311315
else:
312316
# this shouldn't happen, as the listener type should have been checked
313317
# during parsing

synapse/config/metrics.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,35 @@ class MetricsConfig(Config):
4242

4343
def read_config(self, config: JsonDict, **kwargs: Any) -> None:
4444
self.enable_metrics = config.get("enable_metrics", False)
45+
46+
"""
47+
### `enable_legacy_metrics` (experimental)
48+
49+
**Experimental: this option may be removed or have its behaviour
50+
changed at any time, with no notice.**
51+
52+
Set to `true` to publish both legacy and non-legacy Prometheus metric names,
53+
or to `false` to only publish non-legacy Prometheus metric names.
54+
Defaults to `true`. Has no effect if `enable_metrics` is `false`.
55+
56+
Legacy metric names include:
57+
- metrics containing colons in the name, such as `synapse_util_caches_response_cache:hits`, because colons are supposed to be reserved for user-defined recording rules;
58+
- counters that don't end with the `_total` suffix, such as `synapse_federation_client_sent_edus`, therefore not adhering to the OpenMetrics standard.
59+
60+
These legacy metric names are unconventional and not compliant with OpenMetrics standards.
61+
They are included for backwards compatibility.
62+
63+
Example configuration:
64+
```yaml
65+
enable_legacy_metrics: false
66+
```
67+
68+
See https://github.com/matrix-org/synapse/issues/11106 for context.
69+
70+
*Since v1.67.0.*
71+
"""
72+
self.enable_legacy_metrics = config.get("enable_legacy_metrics", True)
73+
4574
self.report_stats = config.get("report_stats", None)
4675
self.report_stats_endpoint = config.get(
4776
"report_stats_endpoint", "https://matrix.org/report-usage-stats/push"

synapse/metrics/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,12 @@
4646

4747
# This module is imported for its side effects; flake8 needn't warn that it's unused.
4848
import synapse.metrics._reactor_metrics # noqa: F401
49-
from synapse.metrics._exposition import (
49+
from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager
50+
from synapse.metrics._legacy_exposition import (
5051
MetricsResource,
5152
generate_latest,
5253
start_http_server,
5354
)
54-
from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager
5555
from synapse.metrics._types import Collector
5656
from synapse.util import SYNAPSE_VERSION
5757

synapse/metrics/_exposition.py renamed to synapse/metrics/_legacy_exposition.py

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,27 @@ def sample_line(line: Sample, name: str) -> str:
8080
return "{}{} {}{}\n".format(name, labelstr, floatToGoString(line.value), timestamp)
8181

8282

83+
# Mapping from new metric names to legacy metric names.
84+
# We translate these back to their old names when exposing them through our
85+
# legacy vendored exporter.
86+
# Only this legacy exposition module applies these name changes.
87+
LEGACY_METRIC_NAMES = {
88+
"synapse_util_caches_cache_hits": "synapse_util_caches_cache:hits",
89+
"synapse_util_caches_cache_size": "synapse_util_caches_cache:size",
90+
"synapse_util_caches_cache_evicted_size": "synapse_util_caches_cache:evicted_size",
91+
"synapse_util_caches_cache_total": "synapse_util_caches_cache:total",
92+
"synapse_util_caches_response_cache_size": "synapse_util_caches_response_cache:size",
93+
"synapse_util_caches_response_cache_hits": "synapse_util_caches_response_cache:hits",
94+
"synapse_util_caches_response_cache_evicted_size": "synapse_util_caches_response_cache:evicted_size",
95+
"synapse_util_caches_response_cache_total": "synapse_util_caches_response_cache:total",
96+
}
97+
98+
8399
def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> bytes:
100+
"""
101+
Generate metrics in legacy format. Modern metrics are generated directly
102+
by prometheus-client.
103+
"""
84104

85105
# Trigger the cache metrics to be rescraped, which updates the common
86106
# metrics but do not produce metrics themselves
@@ -94,7 +114,8 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
94114
# No samples, don't bother.
95115
continue
96116

97-
mname = metric.name
117+
# Translate to legacy metric name if it has one.
118+
mname = LEGACY_METRIC_NAMES.get(metric.name, metric.name)
98119
mnewname = metric.name
99120
mtype = metric.type
100121

@@ -124,7 +145,7 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
124145
om_samples: Dict[str, List[str]] = {}
125146
for s in metric.samples:
126147
for suffix in ["_created", "_gsum", "_gcount"]:
127-
if s.name == metric.name + suffix:
148+
if s.name == mname + suffix:
128149
# OpenMetrics specific sample, put in a gauge at the end.
129150
# (these come from gaugehistograms which don't get renamed,
130151
# so no need to faff with mnewname)
@@ -140,12 +161,12 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
140161
if emit_help:
141162
output.append(
142163
"# HELP {}{} {}\n".format(
143-
metric.name,
164+
mname,
144165
suffix,
145166
metric.documentation.replace("\\", r"\\").replace("\n", r"\n"),
146167
)
147168
)
148-
output.append(f"# TYPE {metric.name}{suffix} gauge\n")
169+
output.append(f"# TYPE {mname}{suffix} gauge\n")
149170
output.extend(lines)
150171

151172
# Get rid of the weird colon things while we're at it
@@ -170,11 +191,12 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
170191
# Get rid of the OpenMetrics specific samples (we should already have
171192
# dealt with them above anyway.)
172193
for suffix in ["_created", "_gsum", "_gcount"]:
173-
if s.name == metric.name + suffix:
194+
if s.name == mname + suffix:
174195
break
175196
else:
197+
sample_name = LEGACY_METRIC_NAMES.get(s.name, s.name)
176198
output.append(
177-
sample_line(s, s.name.replace(":total", "").replace(":", "_"))
199+
sample_line(s, sample_name.replace(":total", "").replace(":", "_"))
178200
)
179201

180202
return "".join(output).encode("utf-8")

synapse/util/caches/__init__.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,23 +34,23 @@
3434
caches_by_name: Dict[str, Sized] = {}
3535
collectors_by_name: Dict[str, "CacheMetric"] = {}
3636

37-
cache_size = Gauge("synapse_util_caches_cache:size", "", ["name"])
38-
cache_hits = Gauge("synapse_util_caches_cache:hits", "", ["name"])
39-
cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name", "reason"])
40-
cache_total = Gauge("synapse_util_caches_cache:total", "", ["name"])
37+
cache_size = Gauge("synapse_util_caches_cache_size", "", ["name"])
38+
cache_hits = Gauge("synapse_util_caches_cache_hits", "", ["name"])
39+
cache_evicted = Gauge("synapse_util_caches_cache_evicted_size", "", ["name", "reason"])
40+
cache_total = Gauge("synapse_util_caches_cache_total", "", ["name"])
4141
cache_max_size = Gauge("synapse_util_caches_cache_max_size", "", ["name"])
4242
cache_memory_usage = Gauge(
4343
"synapse_util_caches_cache_size_bytes",
4444
"Estimated memory usage of the caches",
4545
["name"],
4646
)
4747

48-
response_cache_size = Gauge("synapse_util_caches_response_cache:size", "", ["name"])
49-
response_cache_hits = Gauge("synapse_util_caches_response_cache:hits", "", ["name"])
48+
response_cache_size = Gauge("synapse_util_caches_response_cache_size", "", ["name"])
49+
response_cache_hits = Gauge("synapse_util_caches_response_cache_hits", "", ["name"])
5050
response_cache_evicted = Gauge(
51-
"synapse_util_caches_response_cache:evicted_size", "", ["name", "reason"]
51+
"synapse_util_caches_response_cache_evicted_size", "", ["name", "reason"]
5252
)
53-
response_cache_total = Gauge("synapse_util_caches_response_cache:total", "", ["name"])
53+
response_cache_total = Gauge("synapse_util_caches_response_cache_total", "", ["name"])
5454

5555

5656
class EvictionReason(Enum):

tests/test_metrics.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,16 @@
1212
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
15+
try:
16+
from importlib import metadata
17+
except ImportError:
18+
import importlib_metadata as metadata # type: ignore[no-redef]
1519

20+
from unittest.mock import patch
21+
22+
from pkg_resources import parse_version
23+
24+
from synapse.app._base import _set_prometheus_client_use_created_metrics
1625
from synapse.metrics import REGISTRY, InFlightGauge, generate_latest
1726
from synapse.util.caches.deferred_cache import DeferredCache
1827

@@ -162,3 +171,30 @@ def test_cache_metric(self):
162171

163172
self.assertEqual(items["synapse_util_caches_cache_size"], "1.0")
164173
self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")
174+
175+
176+
class PrometheusMetricsHackTestCase(unittest.HomeserverTestCase):
177+
if parse_version(metadata.version("prometheus_client")) < parse_version("0.14.0"):
178+
skip = "prometheus-client too old"
179+
180+
def test_created_metrics_disabled(self) -> None:
181+
"""
182+
Tests that a brittle hack, to disable `_created` metrics, works.
183+
This involves poking at the internals of prometheus-client.
184+
It's not the end of the world if this doesn't work.
185+
186+
This test gives us a way to notice if prometheus-client changes
187+
their internals.
188+
"""
189+
import prometheus_client.metrics
190+
191+
PRIVATE_FLAG_NAME = "_use_created"
192+
193+
# By default, the pesky `_created` metrics are enabled.
194+
# Check this assumption is still valid.
195+
self.assertTrue(getattr(prometheus_client.metrics, PRIVATE_FLAG_NAME))
196+
197+
with patch("prometheus_client.metrics") as mock:
198+
setattr(mock, PRIVATE_FLAG_NAME, True)
199+
_set_prometheus_client_use_created_metrics(False)
200+
self.assertFalse(getattr(mock, PRIVATE_FLAG_NAME, False))

0 commit comments

Comments
 (0)