Skip to content

Commit a957016

Browse files
authored
[maintenance] diagnose long response latency (#1421)
Monitors webserver's traffic and diagnoses when the average latency of the last requests is over a given threshold. At that point, the diagnostics raises HeathError which flags the service as unhealthy (i.e. healthy entrypoint returns 503). The swarm healthcheck mechanism can decide to restart it.
1 parent a7fac9b commit a957016

38 files changed

+920
-234
lines changed

Makefile

+2-2
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,10 @@ SWARM_HOSTS = $(shell docker node ls --format="{{.Hostname}}" 2>$(if $(IS_WIN),N
7575
.PHONY: build build-nc rebuild build-devel build-devel-nc build-devel-kit build-devel-x build-cache build-cache-kit build-cache-x build-cache-nc build-kit build-x
7676

7777
define _docker_compose_build
78-
export BUILD_TARGET=$(if $(findstring -devel,$@),development,$(if $(findstring -cache,$@),cache,production)); \
78+
export BUILD_TARGET=$(if $(findstring -devel,$@),development,$(if $(findstring -cache,$@),cache,production));\
7979
$(if $(findstring -x,$@),\
8080
pushd services; docker buildx bake --file docker-compose-build.yml; popd;,\
81-
docker-compose -f services/docker-compose-build.yml build $(if $(findstring -nc,$@),--no-cache,) --parallel;\
81+
docker-compose -f services/docker-compose-build.yml build $(if $(findstring -nc,$@),--no-cache,) $(if $(target),,--parallel)\
8282
)
8383
endef
8484

api/specs/webserver/openapi-diagnostics.yaml

+16-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,22 @@ paths:
33
get:
44
tags:
55
- maintenance
6-
summary: Service health check
6+
summary: run check
7+
operationId: check_running
8+
responses:
9+
"200":
10+
description: Service information
11+
content:
12+
application/json:
13+
schema:
14+
$ref: "./components/schemas/health_check.yaml#/HealthCheckEnveloped"
15+
default:
16+
$ref: "#/components/responses/DefaultErrorResponse"
17+
/health:
18+
get:
19+
tags:
20+
- maintenance
21+
summary: health check
722
operationId: check_health
823
responses:
924
"200":

api/specs/webserver/openapi.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ paths:
4949
# DIAGNOSTICS ---------------------------------------------------------
5050
/:
5151
$ref: "./openapi-diagnostics.yaml#/paths/~1"
52+
53+
/health:
54+
$ref: "./openapi-diagnostics.yaml#/paths/~1health"
5255

5356
/check/{action}:
5457
$ref: "./openapi-diagnostics.yaml#/paths/~1check~1{action}"

packages/postgres-database/requirements/_test.in

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
-r _migration.txt
77

88
# fixtures
9-
pyaml
9+
pyyaml
1010
aiopg[sa]
1111
faker
1212

packages/postgres-database/requirements/_test.txt

+7-4
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# This file is autogenerated by pip-compile
33
# To update, run:
44
#
5-
# pip-compile --build-isolation --output-file=_test.txt _test.in
5+
# pip-compile requirements/_test.in
66
#
77
aiohttp==3.6.2 # via pytest-aiohttp
88
aiopg[sa]==1.0.0 # via -r requirements/_test.in
@@ -21,7 +21,9 @@ coveralls==1.11.1 # via -r requirements/_test.in
2121
docker==4.2.0 # via -r requirements/_migration.txt
2222
docopt==0.6.2 # via coveralls
2323
faker==4.0.2 # via -r requirements/_test.in
24+
idna-ssl==1.1.0 # via aiohttp
2425
idna==2.8 # via -r requirements/_migration.txt, requests, yarl
26+
importlib-metadata==1.6.0 # via pluggy, pytest
2527
isort==4.3.21 # via pylint
2628
lazy-object-proxy==1.4.3 # via astroid
2729
mako==1.0.12 # via -r requirements/_migration.txt, alembic
@@ -35,7 +37,6 @@ pip-tools==4.5.1 # via -r requirements/../../../requirements.txt
3537
pluggy==0.13.1 # via pytest
3638
psycopg2-binary==2.8.4 # via -r requirements/_migration.txt, aiopg, sqlalchemy
3739
py==1.8.1 # via pytest
38-
pyaml==20.3.1 # via -r requirements/_test.in
3940
pylint==2.4.4 # via -r requirements/../../../requirements.txt, -r requirements/_test.in
4041
pyparsing==2.4.6 # via packaging
4142
pytest-aiohttp==0.3.0 # via -r requirements/_test.in
@@ -46,7 +47,7 @@ pytest-runner==5.2 # via -r requirements/_test.in
4647
pytest==5.3.5 # via -r requirements/_test.in, pytest-aiohttp, pytest-cov, pytest-instafail
4748
python-dateutil==2.8.0 # via -r requirements/_migration.txt, alembic, faker
4849
python-editor==1.0.4 # via -r requirements/_migration.txt, alembic
49-
pyyaml==5.3 # via pyaml
50+
pyyaml==5.3.1 # via -r requirements/_test.in
5051
regex==2020.2.20 # via black
5152
requests==2.22.0 # via -r requirements/_migration.txt, coveralls, docker
5253
rope==0.16.0 # via -r requirements/../../../requirements.txt
@@ -55,9 +56,11 @@ sqlalchemy[postgresql_psycopg2binary]==1.3.5 # via -r requirements/_migration.t
5556
tenacity==6.1.0 # via -r requirements/_migration.txt
5657
text-unidecode==1.3 # via faker
5758
toml==0.10.0 # via black
58-
typed-ast==1.4.1 # via black
59+
typed-ast==1.4.1 # via astroid, black
60+
typing-extensions==3.7.4.2 # via aiohttp
5961
urllib3==1.25.8 # via -r requirements/_migration.txt, requests
6062
wcwidth==0.1.8 # via pytest
6163
websocket-client==0.56.0 # via -r requirements/_migration.txt, docker
6264
wrapt==1.11.2 # via astroid
6365
yarl==1.4.2 # via -r requirements/_migration.txt, aiohttp
66+
zipp==3.1.0 # via importlib-metadata

packages/pytest-simcore/src/pytest_simcore/helpers/utils_assert.py

+3
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
async def assert_status(
1111
response: web.Response, expected_cls: web.HTTPException, expected_msg: str = None
1212
):
13+
"""
14+
Asserts for enveloped responses
15+
"""
1316
data, error = unwrap_envelope(await response.json())
1417
assert response.status == expected_cls.status_code, (data, error)
1518

packages/service-library/src/servicelib/application_setup.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def app_module_setup(
4545
4646
See packages/service-library/tests/test_application_setup.py
4747
48-
:param module_name: typically __name__ (automaticaly removes '.__init__')
48+
:param module_name: typically __name__
4949
:param depends: list of module_names that must be called first, defaults to None
5050
:param config_section: explicit configuration section, defaults to None (i.e. the name of the module, or last entry of the name if dotted)
5151
:param config_enabled: option in config to enable, defaults to None which is '$(module-section).enabled' (config_section and config_enabled are mutually exclusive)
@@ -122,7 +122,7 @@ def _get(cfg_, parts):
122122
is_enabled = _get(cfg, config_enabled.split("."))
123123
except KeyError as ee:
124124
raise ApplicationSetupError(
125-
f"Cannot find '{config_enabled}' in app config at [ {ee} ]"
125+
f"Cannot find required option '{config_enabled}' in app config's section '{ee}'"
126126
)
127127

128128
if not is_enabled:

packages/service-library/src/servicelib/monitoring.py

+30-16
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
3-
UNDER DEVELOPMENT for issue #784
3+
UNDER DEVELOPMENT for issue #784 (see web/server/diagnostics_monitoring.py)
44
55
Based on https://github.com/amitsaha/aiohttp-prometheus
66
@@ -20,6 +20,14 @@
2020
log = logging.getLogger(__name__)
2121

2222

23+
async def metrics_handler(_request: web.Request):
24+
# TODO: prometheus_client.generate_latest blocking! no asyhc solutin?
25+
# TODO: prometheus_client access to a singleton registry! can be instead created and pass to every metric wrapper
26+
resp = web.Response(body=prometheus_client.generate_latest())
27+
resp.content_type = CONTENT_TYPE_LATEST
28+
return resp
29+
30+
2331
def middleware_factory(app_name):
2432
@web.middleware
2533
async def middleware_handler(request: web.Request, handler):
@@ -68,27 +76,33 @@ async def middleware_handler(request: web.Request, handler):
6876

6977
return resp
7078

71-
middleware_handler.__middleware_name__ = __name__
79+
middleware_handler.__middleware_name__ = __name__ # SEE check_outermost_middleware
7280
return middleware_handler
7381

7482

75-
async def metrics(_request):
76-
# TODO: NOT async!
77-
# prometheus_client access to a singleton registry!
78-
resp = web.Response(body=prometheus_client.generate_latest())
79-
resp.content_type = CONTENT_TYPE_LATEST
80-
return resp
83+
async def check_outermost_middleware(
84+
app: web.Application, *, log_failure: bool = True
85+
) -> bool:
86+
try:
87+
ok = app.middlewares[0].__middleware_name__ == __name__
88+
except (IndexError, AttributeError):
89+
ok = False
90+
91+
if not ok and log_failure:
8192

93+
def _view(m) -> str:
94+
try:
95+
return f"{m.__middleware_name__} [{m}]"
96+
except AttributeError:
97+
return str(m)
8298

83-
async def check_outermost_middleware(app: web.Application):
84-
m = app.middlewares[0]
85-
ok = m and hasattr(m, "__middleware_name__") and m.__middleware_name__ == __name__
86-
if not ok:
87-
# TODO: name all middleware and list middleware in log
8899
log.critical(
89-
"Monitoring middleware expected in the outermost layer."
90-
"TIP: Check setup order"
100+
"Monitoring middleware expected in the outermost layer. "
101+
"Middleware stack: %s. "
102+
"TIP: Check setup order",
103+
[_view(m) for m in app.middlewares],
91104
)
105+
return ok
92106

93107

94108
def setup_monitoring(app: web.Application, app_name: str):
@@ -118,7 +132,7 @@ def setup_monitoring(app: web.Application, app_name: str):
118132
app.middlewares.insert(0, middleware_factory(app_name))
119133

120134
# FIXME: this in the front-end has to be protected!
121-
app.router.add_get("/metrics", metrics)
135+
app.router.add_get("/metrics", metrics_handler)
122136

123137
# Checks that middleware is in the outermost layer
124138
app.on_startup.append(check_outermost_middleware)

packages/service-library/src/servicelib/rest_middlewares.py

+46-24
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
)
2020
from .rest_utils import EnvelopeFactory
2121
from .rest_validators import OpenApiValidator
22+
from .utils import is_production_environ
2223

2324
DEFAULT_API_VERSION = "v0"
2425

@@ -31,26 +32,34 @@ def is_api_request(request: web.Request, api_version: str) -> bool:
3132
return request.path.startswith(base_path)
3233

3334

34-
def _process_and_raise_unexpected_error(request: web.BaseRequest, err: Exception):
35-
# FIXME: send info + trace to client ONLY in debug mode!!!
36-
resp = create_error_response(
37-
[err,], "Unexpected Server error", web.HTTPInternalServerError
38-
)
39-
40-
logger.exception(
41-
'Unexpected server error "%s" from access: %s "%s %s". Responding with status %s',
42-
type(err),
43-
request.remote,
44-
request.method,
45-
request.path,
46-
resp.status,
47-
)
48-
raise resp
49-
35+
def error_middleware_factory(
36+
api_version: str = DEFAULT_API_VERSION, log_exceptions=True
37+
):
38+
_is_prod: bool = is_production_environ()
39+
40+
def _process_and_raise_unexpected_error(request: web.BaseRequest, err: Exception):
41+
resp = create_error_response(
42+
[err,],
43+
"Unexpected Server error",
44+
web.HTTPInternalServerError,
45+
skip_internal_error_details=_is_prod,
46+
)
47+
48+
if log_exceptions:
49+
logger.error(
50+
'Unexpected server error "%s" from access: %s "%s %s". Responding with status %s',
51+
type(err),
52+
request.remote,
53+
request.method,
54+
request.path,
55+
resp.status,
56+
exc_info=err,
57+
stack_info=True,
58+
)
59+
raise resp
5060

51-
def error_middleware_factory(api_version: str = DEFAULT_API_VERSION):
5261
@web.middleware
53-
async def _middleware(request: web.Request, handler):
62+
async def _middleware_handler(request: web.Request, handler):
5463
"""
5564
Ensure all error raised are properly enveloped and json responses
5665
"""
@@ -98,12 +107,15 @@ async def _middleware(request: web.Request, handler):
98107
except Exception as err: # pylint: disable=broad-except
99108
_process_and_raise_unexpected_error(request, err)
100109

101-
return _middleware
110+
# adds identifier (mostly for debugging)
111+
_middleware_handler.__middleware_name__ = f"{__name__}.error_{api_version}"
112+
113+
return _middleware_handler
102114

103115

104116
def validate_middleware_factory(api_version: str = DEFAULT_API_VERSION):
105117
@web.middleware
106-
async def _middleware(request: web.Request, handler):
118+
async def _middleware_handler(request: web.Request, handler):
107119
"""
108120
Validates requests against openapi specs and extracts body, params, etc ...
109121
Validate response against openapi specs
@@ -141,12 +153,17 @@ async def _middleware(request: web.Request, handler):
141153

142154
return response
143155

144-
return _middleware
156+
# adds identifier (mostly for debugging)
157+
_middleware_handler.__middleware_name__ = f"{__name__}.validate_{api_version}"
158+
159+
return _middleware_handler
145160

146161

147162
def envelope_middleware_factory(api_version: str = DEFAULT_API_VERSION):
163+
_is_prod: bool = is_production_environ()
164+
148165
@web.middleware
149-
async def _middleware(request: web.Request, handler):
166+
async def _middleware_handler(request: web.Request, handler):
150167
"""
151168
Ensures all responses are enveloped as {'data': .. , 'error', ...} in json
152169
"""
@@ -156,13 +173,18 @@ async def _middleware(request: web.Request, handler):
156173
resp = await handler(request)
157174

158175
if not isinstance(resp, web.Response):
159-
response = create_data_response(data=resp)
176+
response = create_data_response(
177+
data=resp, skip_internal_error_details=_is_prod,
178+
)
160179
else:
161180
# Enforced by user. Should check it is json?
162181
response = resp
163182
return response
164183

165-
return _middleware
184+
# adds identifier (mostly for debugging)
185+
_middleware_handler.__middleware_name__ = f"{__name__}.envelope_{api_version}"
186+
187+
return _middleware_handler
166188

167189

168190
def append_rest_middlewares(

0 commit comments

Comments
 (0)