Skip to content

webserver's healthcheck monitors and diagnoses slow callbacks as unhealthy #1406

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Mar 26, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/service-library/requirements/_base.in
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ prometheus_client # TODO: add as optional service-library[monitoring]
tenacity
attrs
trafaret
aiodebug
37 changes: 21 additions & 16 deletions packages/service-library/requirements/_base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,38 @@
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --output-file=_base.txt _base.in
# pip-compile --build-isolation _base.in
#
aiohttp==3.6.2 # via -r requirements/_base.in, aiozipkin
aiopg[sa]==1.0.0 # via -r requirements/_base.in
aiozipkin==0.6.0 # via -r requirements/_base.in
aiodebug==1.1.2 # via -r _base.in
aiohttp==3.6.2 # via -r _base.in, aiozipkin
aiopg[sa]==1.0.0 # via -r _base.in
aiozipkin==0.6.0 # via -r _base.in
async-timeout==3.0.1 # via aiohttp
attrs==19.3.0 # via -r requirements/_base.in, aiohttp, jsonschema, openapi-core
attrs==19.3.0 # via -r _base.in, aiohttp, jsonschema, openapi-core
chardet==3.0.4 # via aiohttp
idna==2.8 # via yarl
idna-ssl==1.1.0 # via aiohttp
idna==2.8 # via idna-ssl, yarl
importlib-metadata==1.5.0 # via jsonschema
isodate==0.6.0 # via openapi-core
jsonschema==3.2.0 # via -r requirements/_base.in, openapi-spec-validator
jsonschema==3.2.0 # via -r _base.in, openapi-spec-validator
lazy-object-proxy==1.4.1 # via openapi-core
multidict==4.5.2 # via aiohttp, yarl
openapi-core==0.12.0 # via -r requirements/_base.in
openapi-core==0.12.0 # via -r _base.in
openapi-spec-validator==0.2.7 # via openapi-core
prometheus-client==0.7.1 # via -r requirements/_base.in
psycopg2-binary==2.8.4 # via -r requirements/_base.in, aiopg, sqlalchemy
prometheus-client==0.7.1 # via -r _base.in
psycopg2-binary==2.8.4 # via -r _base.in, aiopg, sqlalchemy
pyrsistent==0.15.2 # via jsonschema
pyyaml==5.3 # via -r requirements/_base.in, openapi-spec-validator
pyyaml==5.3 # via -r _base.in, openapi-spec-validator
six==1.12.0 # via isodate, jsonschema, openapi-core, openapi-spec-validator, pyrsistent, tenacity
sqlalchemy[postgresql_psycopg2binary]==1.3.4 # via -r requirements/_base.in, aiopg
sqlalchemy[postgresql_psycopg2binary]==1.3.4 # via -r _base.in, aiopg
strict-rfc3339==0.7 # via openapi-core
tenacity==6.1.0 # via -r requirements/_base.in
trafaret==2.0.2 # via -r requirements/_base.in
ujson==2.0.2 # via -r requirements/_base.in
werkzeug==1.0.0 # via -r requirements/_base.in
tenacity==6.1.0 # via -r _base.in
trafaret==2.0.2 # via -r _base.in
typing-extensions==3.7.4.1 # via aiohttp
ujson==2.0.2 # via -r _base.in
werkzeug==1.0.0 # via -r _base.in
yarl==1.3.0 # via aiohttp
zipp==3.1.0 # via importlib-metadata

# The following packages are considered to be unsafe in a requirements file:
# setuptools
11 changes: 6 additions & 5 deletions packages/service-library/requirements/_test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --build-isolation --output-file=_test.txt _test.in
# pip-compile --build-isolation _test.in
#
aiodebug==1.1.2 # via -r _base.txt
aiohttp==3.6.2 # via -r _base.txt, aiozipkin, pytest-aiohttp
aiopg[sa]==1.0.0 # via -r _base.txt
aiozipkin==0.6.0 # via -r _base.txt
Expand All @@ -15,9 +16,9 @@ chardet==3.0.4 # via -r _base.txt, aiohttp, requests
coverage==5.0.3 # via -r _test.in, coveralls, pytest-cov
coveralls==1.11.1 # via -r _test.in
docopt==0.6.2 # via coveralls
idna-ssl==1.1.0 # via aiohttp
idna-ssl==1.1.0 # via -r _base.txt, aiohttp
idna==2.8 # via -r _base.txt, idna-ssl, requests, yarl
importlib-metadata==1.5.0 # via jsonschema, pluggy, pytest
importlib-metadata==1.5.0 # via -r _base.txt, jsonschema, pluggy, pytest
isodate==0.6.0 # via -r _base.txt, openapi-core
isort==4.3.21 # via pylint
jsonschema==3.2.0 # via -r _base.txt, openapi-spec-validator
Expand Down Expand Up @@ -52,14 +53,14 @@ tenacity==6.1.0 # via -r _base.txt
termcolor==1.1.0 # via pytest-sugar
trafaret==2.0.2 # via -r _base.txt
typed-ast==1.4.1 # via astroid
typing-extensions==3.7.4.1 # via aiohttp
typing-extensions==3.7.4.1 # via -r _base.txt, aiohttp
ujson==2.0.2 # via -r _base.txt
urllib3==1.25.8 # via requests
wcwidth==0.1.8 # via pytest
werkzeug==1.0.0 # via -r _base.txt
wrapt==1.11.2 # via astroid
yarl==1.3.0 # via -r _base.txt, aiohttp
zipp==3.1.0 # via importlib-metadata
zipp==3.1.0 # via -r _base.txt, importlib-metadata

# The following packages are considered to be unsafe in a requirements file:
# setuptools
69 changes: 69 additions & 0 deletions packages/service-library/src/servicelib/incidents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from typing import Any, Callable, Generic, List, Optional, TypeVar

import attr


# UTILS ---

ItemT = TypeVar("ItemT")


@attr.s(auto_attribs=True)
class LimitedOrderedStack(Generic[ItemT]):
    """Bounded container that retains only the highest-ranked items.

    Items are kept in descending order and, as soon as more than
    ``max_size`` of them are held, the lowest-ranked one is discarded.

    Can serve as base class for an incident registry: a running app may
    produce an endless stream of incidents over time, and with limited
    resources only the most relevant ones are worth keeping.
    """

    # Maximum number of items retained
    max_size: int = 100
    # Optional key function used to rank items; when None the items are
    # compared directly with each other
    order_by: Optional[Callable[[ItemT], Any]] = None

    _items: List[ItemT] = attr.ib(init=False, factory=list)
    _hits: int = attr.ib(init=False, default=0)

    def __len__(self):
        # also drives truthiness, since no __bool__ is defined
        return len(self._items)

    @property
    def hits(self):
        """Number of ``append`` calls so far, dropped items included."""
        return self._hits

    @property
    def max_item(self) -> Optional[ItemT]:
        """Highest-ranked item held, or None when empty."""
        return self._items[0] if self._items else None

    @property
    def min_item(self) -> Optional[ItemT]:
        """Lowest-ranked item held, or None when empty."""
        return self._items[-1] if self._items else None

    def append(self, item: ItemT):
        """Insert ``item`` and drop the lowest-ranked one if over capacity."""
        self._items.append(item)
        self._hits += 1

        # descending order: ranked by order_by when set, otherwise by the
        # items' own comparison operators
        ranked = sorted(self._items, key=self.order_by, reverse=True)
        self._items = ranked
        if len(ranked) > self.max_size:
            ranked.pop()  # tail holds the minimum


# INCIDENT ISSUES ---


@attr.s(auto_attribs=True)
class BaseIncident:
    """Base record for an incident; subclasses add incident-specific fields."""

    # Text describing the incident
    msg: str


@attr.s(auto_attribs=True)
class SlowCallback(BaseIncident):
    """Incident record carrying the delay associated with a slow callback."""

    # Delay in seconds
    delay_secs: float
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

import asyncio.events
import time
from asyncio.base_events import _format_handle
from typing import List

from .incidents import SlowCallback


def enable(slow_duration_secs: float, incidents: List[SlowCallback]) -> None:
    """Record an incident for every event-loop callback that runs too long.

    Adapted from ``aiodebug``. Replaces ``asyncio.events.Handle._run`` with a
    timing wrapper: whenever a callback takes ``slow_duration_secs`` seconds
    or more to run, a ``SlowCallback`` is appended to ``incidents`` and a
    warning is logged.

    NOTE: the replacement is assigned on the ``Handle`` class itself and this
    function does not restore the previous ``_run``.
    """
    # pylint: disable=protected-access
    from aiodebug.logging_compat import get_logger

    log = get_logger(__name__)
    wrapped_run = asyncio.events.Handle._run

    def instrumented(self):
        started = time.monotonic()
        result = wrapped_run(self)
        elapsed = time.monotonic() - started
        if elapsed >= slow_duration_secs:
            description = _format_handle(self)
            incidents.append(SlowCallback(msg=description, delay_secs=elapsed))
            log.warning("Executing %s took %.3f seconds", description, elapsed)
        return result

    asyncio.events.Handle._run = instrumented
49 changes: 49 additions & 0 deletions packages/service-library/tests/test_incidents_monitoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# pylint:disable=unused-variable
# pylint:disable=unused-argument
# pylint:disable=redefined-outer-name

import asyncio
import time

import pytest

from servicelib.aiopg_utils import DatabaseError, postgres_service_retry_policy_kwargs, retry
from servicelib import monitor_slow_callbacks


async def slow_task(delay):
    """Coroutine that sleeps for ``delay`` seconds using ``time.sleep``.

    ``time.sleep`` is a synchronous call, so the coroutine does not yield
    while sleeping.
    """
    time.sleep(delay)


# NOTE(review): presumably retried according to
# postgres_service_retry_policy_kwargs -- confirm against servicelib.aiopg_utils
@retry(**postgres_service_retry_policy_kwargs)
async def fails_to_reach_pg_db():
    """Coroutine whose body unconditionally raises ``DatabaseError``."""
    raise DatabaseError


@pytest.fixture
def incidents_manager(loop):
    """Enable slow-callback monitoring and schedule tasks that trigger it.

    Schedules three ``slow_task`` coroutines (0.3, 0.3 and 0.4 seconds, all
    above the 0.2 second threshold) plus one ``fails_to_reach_pg_db`` task,
    and yields a dict with the incident containers.

    On teardown the scheduled tasks are cancelled and the original
    ``asyncio.events.Handle._run`` is restored, so that the monkeypatch applied
    by ``monitor_slow_callbacks.enable`` neither leaks into other tests nor
    stacks up each time the fixture is used.
    """
    # pylint: disable=protected-access
    original_run = asyncio.events.Handle._run

    incidents = []
    monitor_slow_callbacks.enable(slow_duration_secs=0.2, incidents=incidents)

    scheduled = [
        asyncio.ensure_future(slow_task(0.3), loop=loop),
        asyncio.ensure_future(slow_task(0.3), loop=loop),
        asyncio.ensure_future(slow_task(0.4), loop=loop),
    ]

    incidents_pg = None  # aiopg_utils.monitor_pg_responsiveness.enable()
    scheduled.append(asyncio.ensure_future(fails_to_reach_pg_db(), loop=loop))

    # NOTE(review): 'posgres_responsive' looks like a misspelling of
    # 'postgres'; the key is kept unchanged since consumers may rely on it
    yield {'slow_callback': incidents, 'posgres_responsive': incidents_pg}

    # teardown: cancel() is a no-op on tasks that already finished
    for future in scheduled:
        future.cancel()
    asyncio.events.Handle._run = original_run



async def test_slow_task_incident(incidents_manager):
    """The three slow tasks scheduled by the fixture are each reported once."""
    # give the scheduled tasks time to run
    await asyncio.sleep(2)

    slow_callbacks = incidents_manager['slow_callback']
    assert len(slow_callbacks) == 3

    longest = max(record.delay_secs for record in slow_callbacks)
    assert longest < 0.5


@pytest.mark.skip(reason="TODO: Design under development")
def test_non_responsive_incident(incidents_manager):
    """Placeholder, currently skipped; no assertions yet."""
    pass
67 changes: 67 additions & 0 deletions packages/service-library/tests/test_incidents_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# pylint:disable=unused-variable
# pylint:disable=unused-argument
# pylint:disable=redefined-outer-name
# pylint: disable=protected-access
import operator

import attr

from servicelib.incidents import BaseIncident, LimitedOrderedStack


def test_limited_ordered_stack():
    """A stack of ints keeps only the ``max_size`` largest, in descending order."""

    class IntsRegistry(LimitedOrderedStack[int]):
        pass

    registry = IntsRegistry(max_size=2)

    # empty container is falsy
    assert not registry

    for value in (1, 5):
        registry.append(value)
    assert registry._items == [5, 1]

    for value in (3, 21):
        registry.append(value)

    assert registry._items == [21, 5]

    assert registry.max_item == 21
    assert registry.min_item == 5
    assert len(registry) == registry.max_size


def test_incidents_stack():
    """Incidents ranked by ``gravity``: only the gravest ``max_size`` are kept."""

    @attr.s(auto_attribs=True)
    class TestIncident(BaseIncident):
        gravity: int

    class IncidentsRegistry(LimitedOrderedStack[TestIncident]):
        pass

    by_gravity = operator.attrgetter("gravity")
    incidents = IncidentsRegistry(max_size=2, order_by=by_gravity)

    assert not incidents  # __len__ == 0

    foo = TestIncident("foo", 0)
    bar = TestIncident("bar", 3)
    zoo = TestIncident("zoo", 4)

    for incident in (foo, bar, zoo):
        incidents.append(incident)

    assert incidents  # __len__ != 0
    assert len(incidents) == 2
    assert len(incidents) == incidents.max_size
    # hits counts every append, including the dropped item
    assert incidents.hits == 3

    assert incidents.max_item is zoo
    assert incidents.min_item is bar

    kuu = TestIncident("kuu", 22)
    incidents.append(kuu)

    assert incidents.max_item is kuu
    assert len(incidents) == 2
    assert incidents.hits == 4
1 change: 1 addition & 0 deletions services/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ services:
- SWARM_STACK_NAME=${SWARM_STACK_NAME:-simcore}
- WEBSERVER_MONITORING_ENABLED=1
- WEBSERVER_LOGLEVEL=${LOG_LEVEL:-WARNING}
- WEBSERVER_DIAGNOSTICS_MAX_DELAY_SECS=30
env_file:
- ../.env
depends_on:
Expand Down
3 changes: 2 additions & 1 deletion services/storage/requirements/_base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --output-file=_base.txt _base.in
# pip-compile --build-isolation _base.in
#
aioboto3==6.4.1 # via -r _base.in
aiobotocore[boto3]==0.10.4 # via aioboto3
aiodebug==1.1.2 # via -r ../../../packages/service-library/requirements/_base.in
aiofiles==0.4.0 # via -r _base.in
aiohttp==3.6.2 # via -r ../../../packages/service-library/requirements/_base.in, -r _base.in, aiobotocore, aiozipkin
aiopg[sa]==1.0.0 # via -r ../../../packages/service-library/requirements/_base.in, -r _base.in
Expand Down
3 changes: 2 additions & 1 deletion services/storage/requirements/_test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --build-isolation --output-file=_test.txt _test.in
# pip-compile --build-isolation _test.in
#
aioboto3==6.4.1 # via -r _base.txt
aiobotocore[boto3]==0.10.4 # via -r _base.txt, aioboto3
aiodebug==1.1.2 # via -r _base.txt
aiofiles==0.4.0 # via -r _base.txt
aiohttp==3.6.2 # via -r _base.txt, aiobotocore, aiozipkin, pytest-aiohttp
aiopg[sa]==1.0.0 # via -r _base.txt
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Dict

from aiohttp import web

from servicelib.application import create_safe_application
from servicelib.application_setup import ModuleCategory, app_module_setup
from servicelib.monitoring import setup_monitoring
Expand All @@ -15,6 +16,7 @@
from .catalog import setup_catalog
from .computation import setup_computation
from .db import setup_db
from .diagnostics import setup_diagnostics
from .director import setup_director
from .email import setup_email
from .login import setup_login
Expand Down Expand Up @@ -57,6 +59,7 @@ def create_application(config: Dict) -> web.Application:

# TODO: create dependency mechanism
# and compute setup order https://github.com/ITISFoundation/osparc-simcore/issues/1142
setup_diagnostics(app)
setup_app_monitoring(app)
setup_app_tracing(app)
setup_statics(app)
Expand Down
Loading