Skip to content

Commit 282174b

Browse files
authored
webserver's healthcheck monitors and diagnoses slow callbacks as unhealthy (#1406)
* Added incidents in service-library * Added aiodebug to requirements * Enhanced webserver healthcheck entrypoint - monitors slow callbacks * Upgrades storage reqs
1 parent d2b3e6b commit 282174b

File tree

15 files changed

+416
-41
lines changed

15 files changed

+416
-41
lines changed

packages/service-library/requirements/_base.in

+1
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,4 @@ prometheus_client # TODO: add as optional service-library[monitoring]
1818
tenacity
1919
attrs
2020
trafaret
21+
aiodebug

packages/service-library/requirements/_base.txt

+21-16
Original file line numberDiff line numberDiff line change
@@ -2,33 +2,38 @@
22
# This file is autogenerated by pip-compile
33
# To update, run:
44
#
5-
# pip-compile --output-file=_base.txt _base.in
5+
# pip-compile --build-isolation _base.in
66
#
7-
aiohttp==3.6.2 # via -r requirements/_base.in, aiozipkin
8-
aiopg[sa]==1.0.0 # via -r requirements/_base.in
9-
aiozipkin==0.6.0 # via -r requirements/_base.in
7+
aiodebug==1.1.2 # via -r _base.in
8+
aiohttp==3.6.2 # via -r _base.in, aiozipkin
9+
aiopg[sa]==1.0.0 # via -r _base.in
10+
aiozipkin==0.6.0 # via -r _base.in
1011
async-timeout==3.0.1 # via aiohttp
11-
attrs==19.3.0 # via -r requirements/_base.in, aiohttp, jsonschema, openapi-core
12+
attrs==19.3.0 # via -r _base.in, aiohttp, jsonschema, openapi-core
1213
chardet==3.0.4 # via aiohttp
13-
idna==2.8 # via yarl
14+
idna-ssl==1.1.0 # via aiohttp
15+
idna==2.8 # via idna-ssl, yarl
16+
importlib-metadata==1.5.0 # via jsonschema
1417
isodate==0.6.0 # via openapi-core
15-
jsonschema==3.2.0 # via -r requirements/_base.in, openapi-spec-validator
18+
jsonschema==3.2.0 # via -r _base.in, openapi-spec-validator
1619
lazy-object-proxy==1.4.1 # via openapi-core
1720
multidict==4.5.2 # via aiohttp, yarl
18-
openapi-core==0.12.0 # via -r requirements/_base.in
21+
openapi-core==0.12.0 # via -r _base.in
1922
openapi-spec-validator==0.2.7 # via openapi-core
20-
prometheus-client==0.7.1 # via -r requirements/_base.in
21-
psycopg2-binary==2.8.4 # via -r requirements/_base.in, aiopg, sqlalchemy
23+
prometheus-client==0.7.1 # via -r _base.in
24+
psycopg2-binary==2.8.4 # via -r _base.in, aiopg, sqlalchemy
2225
pyrsistent==0.15.2 # via jsonschema
23-
pyyaml==5.3 # via -r requirements/_base.in, openapi-spec-validator
26+
pyyaml==5.3 # via -r _base.in, openapi-spec-validator
2427
six==1.12.0 # via isodate, jsonschema, openapi-core, openapi-spec-validator, pyrsistent, tenacity
25-
sqlalchemy[postgresql_psycopg2binary]==1.3.4 # via -r requirements/_base.in, aiopg
28+
sqlalchemy[postgresql_psycopg2binary]==1.3.4 # via -r _base.in, aiopg
2629
strict-rfc3339==0.7 # via openapi-core
27-
tenacity==6.1.0 # via -r requirements/_base.in
28-
trafaret==2.0.2 # via -r requirements/_base.in
29-
ujson==2.0.2 # via -r requirements/_base.in
30-
werkzeug==1.0.0 # via -r requirements/_base.in
30+
tenacity==6.1.0 # via -r _base.in
31+
trafaret==2.0.2 # via -r _base.in
32+
typing-extensions==3.7.4.1 # via aiohttp
33+
ujson==2.0.2 # via -r _base.in
34+
werkzeug==1.0.0 # via -r _base.in
3135
yarl==1.3.0 # via aiohttp
36+
zipp==3.1.0 # via importlib-metadata
3237

3338
# The following packages are considered to be unsafe in a requirements file:
3439
# setuptools

packages/service-library/requirements/_test.txt

+6-5
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
# This file is autogenerated by pip-compile
33
# To update, run:
44
#
5-
# pip-compile --build-isolation --output-file=_test.txt _test.in
5+
# pip-compile --build-isolation _test.in
66
#
7+
aiodebug==1.1.2 # via -r _base.txt
78
aiohttp==3.6.2 # via -r _base.txt, aiozipkin, pytest-aiohttp
89
aiopg[sa]==1.0.0 # via -r _base.txt
910
aiozipkin==0.6.0 # via -r _base.txt
@@ -15,9 +16,9 @@ chardet==3.0.4 # via -r _base.txt, aiohttp, requests
1516
coverage==5.0.3 # via -r _test.in, coveralls, pytest-cov
1617
coveralls==1.11.1 # via -r _test.in
1718
docopt==0.6.2 # via coveralls
18-
idna-ssl==1.1.0 # via aiohttp
19+
idna-ssl==1.1.0 # via -r _base.txt, aiohttp
1920
idna==2.8 # via -r _base.txt, idna-ssl, requests, yarl
20-
importlib-metadata==1.5.0 # via jsonschema, pluggy, pytest
21+
importlib-metadata==1.5.0 # via -r _base.txt, jsonschema, pluggy, pytest
2122
isodate==0.6.0 # via -r _base.txt, openapi-core
2223
isort==4.3.21 # via pylint
2324
jsonschema==3.2.0 # via -r _base.txt, openapi-spec-validator
@@ -52,14 +53,14 @@ tenacity==6.1.0 # via -r _base.txt
5253
termcolor==1.1.0 # via pytest-sugar
5354
trafaret==2.0.2 # via -r _base.txt
5455
typed-ast==1.4.1 # via astroid
55-
typing-extensions==3.7.4.1 # via aiohttp
56+
typing-extensions==3.7.4.1 # via -r _base.txt, aiohttp
5657
ujson==2.0.2 # via -r _base.txt
5758
urllib3==1.25.8 # via requests
5859
wcwidth==0.1.8 # via pytest
5960
werkzeug==1.0.0 # via -r _base.txt
6061
wrapt==1.11.2 # via astroid
6162
yarl==1.3.0 # via -r _base.txt, aiohttp
62-
zipp==3.1.0 # via importlib-metadata
63+
zipp==3.1.0 # via -r _base.txt, importlib-metadata
6364

6465
# The following packages are considered to be unsafe in a requirements file:
6566
# setuptools
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
from typing import Any, Callable, Generic, List, Optional, TypeVar
2+
3+
import attr
4+
5+
6+
# UTILS ---
7+
8+
ItemT = TypeVar("ItemT")
9+
10+
11+
@attr.s(auto_attribs=True)
class LimitedOrderedStack(Generic[ItemT]):
    """Bounded container that retains only the highest-ranked items.

    Items are kept sorted in descending order (by ``order_by`` when given,
    otherwise by the items' own ordering) and, once more than ``max_size``
    items are held, the lowest-ranked one is discarded.

    Intended as a base class for incident registries: a long-running app
    may produce an endless stream of incidents, but only the most relevant
    ones are worth keeping with limited resources.
    """

    # maximum number of items retained after an append
    max_size: int = 100
    # optional key function used to rank items (None -> natural ordering)
    order_by: Optional[Callable[[ItemT], Any]] = None

    # retained items, highest-ranked first
    _items: List[ItemT] = attr.ib(init=False, default=attr.Factory(list))
    # total number of appended items, including the ones dropped later
    _hits: int = attr.ib(init=False, default=0)

    def __len__(self):
        # also drives truthiness since no __bool__ is defined
        return len(self._items)

    @property
    def hits(self):
        """Number of times ``append`` was called."""
        return self._hits

    @property
    def max_item(self) -> Optional[ItemT]:
        """Highest-ranked retained item, or None when empty."""
        return self._items[0] if self._items else None

    @property
    def min_item(self) -> Optional[ItemT]:
        """Lowest-ranked retained item, or None when empty."""
        return self._items[-1] if self._items else None

    def append(self, item: ItemT):
        """Insert ``item``, re-rank, and drop the lowest item on overflow."""
        self._items.append(item)
        self._hits += 1

        # without order_by, ranking relies on the __lt__ defined in ItemT
        ranked = sorted(self._items, key=self.order_by, reverse=True)
        self._items = ranked
        if len(ranked) > self.max_size:
            ranked.pop()  # min is dropped
57+
58+
59+
# INCIDENT ISSUES ---
60+
61+
62+
@attr.s(auto_attribs=True)
class BaseIncident:
    """Common base for incident records; carries a human-readable message."""

    msg: str
65+
66+
67+
@attr.s(auto_attribs=True)
class SlowCallback(BaseIncident):
    """Incident describing a callback that took ``delay_secs`` seconds."""

    delay_secs: float
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
2+
import asyncio.events
3+
import time
4+
from asyncio.base_events import _format_handle
5+
from typing import List
6+
7+
from .incidents import SlowCallback
8+
9+
10+
def enable(slow_duration_secs: float, incidents: List[SlowCallback]) -> None:
    """Report an incident for every slow event-loop callback.

    Adapted from aiodebug.

    Patches ``asyncio.events.Handle._run`` so that each callback taking
    ``slow_duration_secs`` seconds or more to run is appended to
    ``incidents`` as a ``SlowCallback`` and logged as a warning.
    """
    # pylint: disable=protected-access
    from aiodebug.logging_compat import get_logger

    logger = get_logger(__name__)
    # keep the current implementation so the wrapper can delegate to it
    wrapped_run = asyncio.events.Handle._run

    def instrumented(self):
        started_at = time.monotonic()
        result = wrapped_run(self)
        elapsed = time.monotonic() - started_at

        if elapsed < slow_duration_secs:
            return result

        handle_description = _format_handle(self)
        incidents.append(SlowCallback(msg=handle_description, delay_secs=elapsed))
        logger.warning("Executing %s took %.3f seconds", handle_description, elapsed)
        return result

    asyncio.events.Handle._run = instrumented
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# pylint:disable=unused-variable
2+
# pylint:disable=unused-argument
3+
# pylint:disable=redefined-outer-name
4+
5+
import asyncio
6+
import time
7+
8+
import pytest
9+
10+
from servicelib.aiopg_utils import DatabaseError, postgres_service_retry_policy_kwargs, retry
11+
from servicelib import monitor_slow_callbacks
12+
13+
14+
async def slow_task(delay):
    """Coroutine that deliberately blocks the event loop for ``delay`` seconds."""
    # blocking sleep on purpose: the callback running this step must be slow
    time.sleep(delay)
16+
17+
18+
@retry(**postgres_service_retry_policy_kwargs)
async def fails_to_reach_pg_db():
    """Coroutine that always raises ``DatabaseError`` under the retry policy."""
    raise DatabaseError
21+
22+
23+
@pytest.fixture
def incidents_manager(loop):
    """Enable slow-callback monitoring and schedule tasks that trigger it.

    Yields a dict with the list collecting slow-callback incidents under
    ``'slow_callback'`` and a placeholder (``None``) for the postgres
    responsiveness incidents.

    ``monitor_slow_callbacks.enable`` patches ``asyncio.events.Handle._run``
    globally, so the previous implementation is saved here and restored on
    teardown; otherwise the patch would leak into later tests and wrappers
    would nest each time the fixture is used.
    """
    # pylint: disable=protected-access
    original_run = asyncio.events.Handle._run

    incidents = []
    monitor_slow_callbacks.enable(slow_duration_secs=0.2, incidents=incidents)

    # three tasks that block longer than the 0.2 secs threshold
    f1a = asyncio.ensure_future(slow_task(0.3), loop=loop)
    f1b = asyncio.ensure_future(slow_task(0.3), loop=loop)
    f1c = asyncio.ensure_future(slow_task(0.4), loop=loop)

    incidents_pg = None  # aiopg_utils.monitor_pg_responsiveness.enable()
    f2 = asyncio.ensure_future(fails_to_reach_pg_db(), loop=loop)

    yield {'slow_callback': incidents, 'posgres_responsive': incidents_pg}

    # teardown: undo the global monkey-patch installed by enable()
    asyncio.events.Handle._run = original_run
36+
37+
38+
39+
async def test_slow_task_incident(incidents_manager):
    """The three slow tasks scheduled by the fixture are each reported once."""
    # give the scheduled tasks time to run
    await asyncio.sleep(2)

    slow_callbacks = incidents_manager['slow_callback']
    assert len(slow_callbacks) == 3

    longest_delay = max(incident.delay_secs for incident in slow_callbacks)
    assert longest_delay < 0.5
45+
46+
47+
@pytest.mark.skip(reason="TODO: Design under development")
def test_non_responsive_incident(incidents_manager):
    """Placeholder for the non-responsive incident test (currently skipped)."""
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# pylint:disable=unused-variable
2+
# pylint:disable=unused-argument
3+
# pylint:disable=redefined-outer-name
4+
# pylint: disable=protected-access
5+
import operator
6+
7+
import attr
8+
9+
from servicelib.incidents import BaseIncident, LimitedOrderedStack
10+
11+
12+
def test_limited_ordered_stack():
    """A stack of ints keeps only the ``max_size`` largest, in descending order."""

    class IntsRegistry(LimitedOrderedStack[int]):
        pass

    registry = IntsRegistry(max_size=2)

    assert not registry

    for value in (1, 5):
        registry.append(value)
    assert registry._items == [5, 1]

    for value in (3, 21):
        registry.append(value)

    assert registry._items == [21, 5]

    assert registry.max_item == 21
    assert registry.min_item == 5
    assert len(registry) == registry.max_size
32+
33+
34+
def test_incidents_stack():
    """Incidents are ranked by ``gravity`` and only the top ``max_size`` are kept."""

    @attr.s(auto_attribs=True)
    class TestIncident(BaseIncident):
        gravity: int

    class IncidentsRegistry(LimitedOrderedStack[TestIncident]):
        pass

    by_gravity = operator.attrgetter("gravity")
    incidents = IncidentsRegistry(max_size=2, order_by=by_gravity)

    assert not incidents  # __len__ == 0

    foo = TestIncident("foo", 0)
    bar = TestIncident("bar", 3)
    zoo = TestIncident("zoo", 4)

    for incident in (foo, bar, zoo):
        incidents.append(incident)

    assert incidents  # __len__ != 0
    assert len(incidents) == 2
    assert len(incidents) == incidents.max_size
    # every append counts as a hit, even for items dropped afterwards
    assert incidents.hits == 3

    assert incidents.max_item is zoo
    assert incidents.min_item is bar

    kuu = TestIncident("kuu", 22)
    incidents.append(kuu)

    assert incidents.max_item is kuu
    assert len(incidents) == 2
    assert incidents.hits == 4

services/docker-compose.yml

+1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ services:
6666
- SWARM_STACK_NAME=${SWARM_STACK_NAME:-simcore}
6767
- WEBSERVER_MONITORING_ENABLED=1
6868
- WEBSERVER_LOGLEVEL=${LOG_LEVEL:-WARNING}
69+
- WEBSERVER_DIAGNOSTICS_MAX_DELAY_SECS=30
6970
env_file:
7071
- ../.env
7172
depends_on:

services/storage/requirements/_base.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
# This file is autogenerated by pip-compile
33
# To update, run:
44
#
5-
# pip-compile --output-file=_base.txt _base.in
5+
# pip-compile --build-isolation _base.in
66
#
77
aioboto3==6.4.1 # via -r _base.in
88
aiobotocore[boto3]==0.10.4 # via aioboto3
9+
aiodebug==1.1.2 # via -r ../../../packages/service-library/requirements/_base.in
910
aiofiles==0.4.0 # via -r _base.in
1011
aiohttp==3.6.2 # via -r ../../../packages/service-library/requirements/_base.in, -r _base.in, aiobotocore, aiozipkin
1112
aiopg[sa]==1.0.0 # via -r ../../../packages/service-library/requirements/_base.in, -r _base.in

services/storage/requirements/_test.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
# This file is autogenerated by pip-compile
33
# To update, run:
44
#
5-
# pip-compile --build-isolation --output-file=_test.txt _test.in
5+
# pip-compile --build-isolation _test.in
66
#
77
aioboto3==6.4.1 # via -r _base.txt
88
aiobotocore[boto3]==0.10.4 # via -r _base.txt, aioboto3
9+
aiodebug==1.1.2 # via -r _base.txt
910
aiofiles==0.4.0 # via -r _base.txt
1011
aiohttp==3.6.2 # via -r _base.txt, aiobotocore, aiozipkin, pytest-aiohttp
1112
aiopg[sa]==1.0.0 # via -r _base.txt

services/web/server/src/simcore_service_webserver/application.py

+3
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from typing import Dict
77

88
from aiohttp import web
9+
910
from servicelib.application import create_safe_application
1011
from servicelib.application_setup import ModuleCategory, app_module_setup
1112
from servicelib.monitoring import setup_monitoring
@@ -15,6 +16,7 @@
1516
from .catalog import setup_catalog
1617
from .computation import setup_computation
1718
from .db import setup_db
19+
from .diagnostics import setup_diagnostics
1820
from .director import setup_director
1921
from .email import setup_email
2022
from .login import setup_login
@@ -57,6 +59,7 @@ def create_application(config: Dict) -> web.Application:
5759

5860
# TODO: create dependency mechanism
5961
# and compute setup order https://github.com/ITISFoundation/osparc-simcore/issues/1142
62+
setup_diagnostics(app)
6063
setup_app_monitoring(app)
6164
setup_app_tracing(app)
6265
setup_statics(app)

0 commit comments

Comments
 (0)