Skip to content

Commit cf5ceb1

Browse files
committed
report memory and cpu metrics to prometheus
1 parent dd2d21c commit cf5ceb1

File tree

4 files changed

+151
-109
lines changed

4 files changed

+151
-109
lines changed

nbresuse/__init__.py

Lines changed: 8 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -1,100 +1,15 @@
11
import os
2-
import json
3-
import psutil
42
from traitlets import Bool, Float, Int, Union, default
53
from traitlets.config import Configurable
6-
from notebook.utils import url_path_join
7-
from notebook.base.handlers import IPythonHandler
8-
from tornado import web
4+
from tornado import ioloop
5+
from nbresuse.prometheus import PrometheusHandler
96

107
try:
118
# Traitlets >= 4.3.3
129
from traitlets import Callable
1310
except ImportError:
1411
from .utils import Callable
1512

16-
from concurrent.futures import ThreadPoolExecutor
17-
from tornado.concurrent import run_on_executor
18-
19-
class MetricsHandler(IPythonHandler):
20-
def initialize(self):
21-
super().initialize()
22-
self.cpu_percent = 0
23-
24-
# https://www.tornadoweb.org/en/stable/concurrent.html#tornado.concurrent.run_on_executor
25-
self.executor = ThreadPoolExecutor(max_workers=10)
26-
27-
self.cpu_count = psutil.cpu_count()
28-
29-
@run_on_executor
30-
def update_cpu_percent(self, all_processes):
31-
32-
def get_cpu_percent(p):
33-
try:
34-
return p.cpu_percent(interval=0.05)
35-
# Avoid littering logs with stack traces complaining
36-
# about dead processes having no CPU usage
37-
except:
38-
return 0
39-
40-
return sum([get_cpu_percent(p) for p in all_processes])
41-
42-
@web.authenticated
43-
async def get(self):
44-
"""
45-
Calculate and return current resource usage metrics
46-
"""
47-
config = self.settings['nbresuse_display_config']
48-
cur_process = psutil.Process()
49-
all_processes = [cur_process] + cur_process.children(recursive=True)
50-
limits = {}
51-
52-
# Get memory information
53-
rss = sum([p.memory_info().rss for p in all_processes])
54-
55-
if callable(config.mem_limit):
56-
mem_limit = config.mem_limit(rss=rss)
57-
else: # mem_limit is an Int
58-
mem_limit = config.mem_limit
59-
60-
# A better approach would use cpu_affinity to account for the
61-
# fact that the number of logical CPUs in the system is not
62-
# necessarily the same as the number of CPUs the process
63-
# can actually use. But cpu_affinity isn't available for OS X.
64-
cpu_count = psutil.cpu_count()
65-
66-
if config.track_cpu_percent:
67-
self.cpu_percent = await self.update_cpu_percent(all_processes)
68-
69-
if config.mem_limit != 0:
70-
limits['memory'] = {
71-
'rss': mem_limit
72-
}
73-
if config.mem_warning_threshold != 0:
74-
limits['memory']['warn'] = (mem_limit - rss) < (mem_limit * config.mem_warning_threshold)
75-
76-
# Optionally get CPU information
77-
if config.track_cpu_percent:
78-
self.cpu_percent = await self.update_cpu_percent(all_processes)
79-
80-
if config.cpu_limit != 0:
81-
limits['cpu'] = {
82-
'cpu': config.cpu_limit
83-
}
84-
if config.cpu_warning_threshold != 0:
85-
limits['cpu']['warn'] = (config.cpu_limit - self.cpu_percent) < (config.cpu_limit * config.cpu_warning_threshold)
86-
87-
metrics = {
88-
'rss': rss,
89-
'limits': limits,
90-
}
91-
if config.track_cpu_percent:
92-
metrics.update(cpu_percent=self.cpu_percent,
93-
cpu_count=self.cpu_count)
94-
95-
self.log.debug("NBResuse metrics: %s", metrics)
96-
self.write(json.dumps(metrics))
97-
9813

9914
def _jupyter_server_extension_paths():
10015
"""
@@ -104,6 +19,7 @@ def _jupyter_server_extension_paths():
10419
'module': 'nbresuse',
10520
}]
10621

22+
10723
def _jupyter_nbextension_paths():
10824
"""
10925
Set up the notebook extension for displaying metrics
@@ -115,6 +31,7 @@ def _jupyter_nbextension_paths():
11531
"require": "nbresuse/main"
11632
}]
11733

34+
11835
class ResourceUseDisplay(Configurable):
11936
"""
12037
Holds server-side configuration for nbresuse
@@ -151,7 +68,7 @@ def _mem_limit_default(self):
15168
return int(os.environ.get('MEM_LIMIT', 0))
15269

15370
track_cpu_percent = Bool(
154-
default_value=False,
71+
default_value=True,
15572
help="""
15673
Set to True in order to enable reporting of CPU usage statistics.
15774
"""
@@ -186,11 +103,12 @@ def _mem_limit_default(self):
186103
def _cpu_limit_default(self):
187104
return float(os.environ.get('CPU_LIMIT', 0))
188105

106+
189107
def load_jupyter_server_extension(nbapp):
190108
"""
191109
Called during notebook start
192110
"""
193111
resuseconfig = ResourceUseDisplay(parent=nbapp)
194112
nbapp.web_app.settings['nbresuse_display_config'] = resuseconfig
195-
route_pattern = url_path_join(nbapp.web_app.settings['base_url'], '/metrics')
196-
nbapp.web_app.add_handlers('.*', [(route_pattern, MetricsHandler)])
113+
callback = ioloop.PeriodicCallback(PrometheusHandler(nbapp), 1000)
114+
callback.start()

nbresuse/metrics.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import psutil
2+
from typing import NamedTuple
3+
4+
5+
class MemoryMetrics(NamedTuple):
    """Immutable snapshot of memory figures, in bytes.

    Produced by ``memory_metrics()``; fields are positional, in this order.
    """

    # Summed resident set size of the sampled processes.
    current_memory: int
    # Upper bound the current usage is reported against.
    max_memory: int
8+
9+
10+
class CPUMetrics(NamedTuple):
    """Immutable snapshot of CPU figures.

    Produced by ``cpu_metrics()``; fields are positional, in this order.
    """

    # Number of CPUs as reported by psutil.cpu_count().
    cpu_count: int
    # Sum of the per-process CPU percentages of the sampled processes.
    cpu_usage: float
13+
14+
15+
def memory_metrics() -> MemoryMetrics:
    """Return memory usage of this process and all of its descendants.

    Returns:
        MemoryMetrics whose ``current_memory`` is the summed RSS (bytes) of
        the current process plus its recursive children, and whose
        ``max_memory`` is the total physical memory reported by
        ``psutil.virtual_memory()``.
    """
    cur_process = psutil.Process()
    all_processes = [cur_process] + cur_process.children(recursive=True)

    def get_rss(p):
        # A child may exit (or become inaccessible) between being listed and
        # being sampled; count it as 0 instead of failing the whole sample,
        # mirroring how cpu_metrics() treats dead processes.
        try:
            return p.memory_info().rss
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            return 0

    rss = sum(get_rss(p) for p in all_processes)
    virtual_memory = psutil.virtual_memory()

    return MemoryMetrics(
        rss,
        virtual_memory.total
    )
26+
27+
28+
def cpu_metrics() -> CPUMetrics:
    """Return CPU usage of this process and all of its descendants.

    Each process is sampled with ``cpu_percent(interval=0.05)``, which blocks
    for the sampling interval, so the call takes roughly 0.05s per process.

    Returns:
        CPUMetrics with ``cpu_count`` from ``psutil.cpu_count()`` and
        ``cpu_usage`` as the sum of the per-process CPU percentages.
    """
    cur_process = psutil.Process()
    all_processes = [cur_process] + cur_process.children(recursive=True)

    cpu_count = psutil.cpu_count()

    def get_cpu_percent(p):
        try:
            return p.cpu_percent(interval=0.05)
        # Avoid littering logs with stack traces complaining
        # about dead processes having no CPU usage. Only psutil's own
        # errors are swallowed so KeyboardInterrupt/SystemExit still
        # propagate (a bare ``except:`` would hide them).
        except psutil.Error:
            return 0

    cpu_percent = sum(get_cpu_percent(p) for p in all_processes)

    return CPUMetrics(
        cpu_count,
        cpu_percent
    )

nbresuse/prometheus.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from tornado import gen
2+
from notebook.notebookapp import NotebookApp
3+
from nbresuse.metrics import CPUMetrics, MemoryMetrics, cpu_metrics, memory_metrics
4+
from typing import Union
5+
6+
from prometheus_client import Gauge
7+
8+
try:
9+
# Traitlets >= 4.3.3
10+
from traitlets import Callable
11+
except ImportError:
12+
from .utils import Callable
13+
14+
# Module-level Prometheus gauges, all declared without labels.
# NOTE(review): the help strings say "counter" although these are Gauge
# objects; the text is left untouched because it is emitted at runtime.

# Values written by PrometheusHandler from MemoryMetrics.
TOTAL_MEMORY_USAGE = Gauge('total_memory_usage', 'counter for total memory usage', [])
MAX_MEMORY_USAGE = Gauge('max_memory_usage', 'counter for max memory usage', [])

# Values written by PrometheusHandler from CPUMetrics.
TOTAL_CPU_USAGE = Gauge('total_cpu_usage', 'counter for total cpu usage', [])
MAX_CPU_USAGE = Gauge('max_cpu_usage', 'counter for max cpu usage', [])
37+
38+
39+
class PrometheusHandler(Callable):
    """Callable that samples resource usage and publishes it to the gauges.

    Each call updates the module-level memory gauges and, when the
    ``track_cpu_percent`` setting is enabled, the CPU gauges as well.
    """

    def __init__(self, nbapp: NotebookApp):
        """Capture the nbresuse configuration from the notebook application.

        Args:
            nbapp: Notebook application whose ``web_app.settings`` already
                contains the ``'nbresuse_display_config'`` entry.
        """
        super().__init__()
        self.config = nbapp.web_app.settings['nbresuse_display_config']
        self.session_manager = nbapp.session_manager

    @gen.coroutine
    def __call__(self, *args, **kwargs):
        """Sample memory (and optionally CPU) usage and set the gauges.

        Positional and keyword arguments are accepted and ignored.
        """
        # Imported locally so this block does not depend on a file-level
        # import being added alongside it.
        from tornado.ioloop import IOLoop

        memory = memory_metrics()
        TOTAL_MEMORY_USAGE.set(memory.current_memory)
        MAX_MEMORY_USAGE.set(memory.max_memory)
        if self.config.track_cpu_percent:
            # cpu_metrics() blocks for its sampling interval once per
            # process; run it on the default executor so the IOLoop keeps
            # serving requests while the sample is taken.
            cpu = yield IOLoop.current().run_in_executor(None, cpu_metrics)
            TOTAL_CPU_USAGE.set(cpu.cpu_usage)
            MAX_CPU_USAGE.set(cpu.cpu_count)

nbresuse/static/main.js

Lines changed: 44 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
define(['jquery', 'base/js/utils'], function ($, utils) {
1+
define([
2+
'jquery',
3+
'base/js/utils'
4+
], function ($, utils) {
25
function setupDOM() {
36
$('#maintoolbar-container').append(
47
$('<div>').attr('id', 'nbresuse-display')
@@ -20,32 +23,54 @@ define(['jquery', 'base/js/utils'], function ($, utils) {
2023
);
2124
}
2225

26+
/**
 * Format a byte count as a short human-readable string, e.g. "1.5 kB".
 *
 * Uses 1024-based units and at most one decimal place (a trailing ".0" is
 * dropped). Zero, negative and non-finite inputs are rendered as "0 B";
 * values beyond the largest unit are expressed in that largest unit.
 *
 * @param {number} size - size in bytes
 * @returns {string} formatted size with unit suffix
 */
function humanFileSize(size) {
    var units = ['B', 'kB', 'MB', 'GB', 'TB'];
    // Math.log(0) is -Infinity and Math.log of a negative number is NaN,
    // either of which would produce "NaN undefined".
    if (!isFinite(size) || size <= 0) {
        return '0 B';
    }
    var i = Math.floor( Math.log(size) / Math.log(1024) );
    // Clamp so fractional sizes (< 1) and huge sizes still map to a unit.
    i = Math.min(Math.max(i, 0), units.length - 1);
    return ( size / Math.pow(1024, i) ).toFixed(1) * 1 + ' ' + units[i];
}
30+
31+
32+
/**
 * Extract samples of one metric from Prometheus text-format output.
 *
 * Each match is a RegExp exec() array where index 1 is the raw label string
 * (empty string when the sample has no labels) and index 2 is the rest of
 * the line after the separating space (the sample value).
 *
 * @param {string} metric_name - exact metric name to look for
 * @param {string} text - metrics payload to search
 * @param {boolean} [multiple=false] - return every match instead of the first
 * @returns {Array|null} first match or null when `multiple` is false,
 *     otherwise an array of all matches (possibly empty)
 */
function metric(metric_name, text, multiple=false) {
    // Escape the name so characters such as "." are matched literally.
    var escaped = metric_name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // The name must be followed directly by an optional {labels} group and a
    // space; otherwise a longer metric sharing the prefix (e.g. "<name>_created")
    // would be reported as this one.
    var regex = new RegExp("^" + escaped + "(?:\\{([^}]*)\\})? (.*)$", "gm");
    var matches = [];
    var match;

    while ((match = regex.exec(text)) !== null) {
        // Keep index 1 a string even when no label group was present.
        match[1] = match[1] || '';
        matches.push(match);
    }

    if (multiple) {
        return matches;
    }
    return matches.length > 0 ? matches[0] : null;
}
52+
2353
var displayMetrics = function() {
2454
if (document.hidden) {
2555
// Don't poll when nobody is looking
2656
return;
2757
}
28-
$.getJSON(utils.get_body_data('baseUrl') + 'metrics', function(data) {
29-
// FIXME: Proper setups for MB and GB. MB should have 0 things
30-
// after the ., but GB should have 2.
31-
var display = Math.round(data['rss'] / (1024 * 1024));
58+
$.ajax({
59+
url: utils.get_body_data('baseUrl') + 'metrics',
60+
success: function(data) {
61+
let totalMemoryUsage = metric("total_memory_usage", data);
62+
let maxMemoryUsage = metric("max_memory_usage", data);
3263

33-
var limits = data['limits'];
34-
if ('memory' in limits) {
35-
if ('rss' in limits['memory']) {
36-
display += " / " + Math.round(limits['memory']['rss'] / (1024 * 1024));
37-
}
38-
if (limits['memory']['warn']) {
39-
$('#nbresuse-display').addClass('nbresuse-warn');
40-
} else {
41-
$('#nbresuse-display').removeClass('nbresuse-warn');
42-
}
43-
}
44-
if (data['limits']['memory'] !== null) {
64+
if (!totalMemoryUsage || !maxMemoryUsage)
65+
return;
66+
totalMemoryUsage = humanFileSize(parseFloat(totalMemoryUsage[2]));
67+
maxMemoryUsage = humanFileSize(parseFloat(maxMemoryUsage[2]));
68+
69+
var display = totalMemoryUsage + "/" + maxMemoryUsage;
70+
$('#nbresuse-mem').text(display);
4571
}
46-
$('#nbresuse-mem').text(display + ' MB');
4772
});
48-
}
73+
};
4974

5075
var load_ipython_extension = function () {
5176
setupDOM();

0 commit comments

Comments
 (0)