Skip to content

Commit 30f5e89

Browse files
author
Pedro Crespo
committed
Diagnostics plugin: tests and fixes
1 parent d8f2622 commit 30f5e89

File tree

5 files changed

+137
-86
lines changed

5 files changed

+137
-86
lines changed

services/web/server/src/simcore_service_webserver/diagnostics_core.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,9 @@ def observe(self, delay: float):
4747
fifo.pop(0)
4848

4949
def value(self) -> float:
50-
return statistics.mean(self.last_delays)
51-
50+
if self.last_delays:
51+
return statistics.mean(self.last_delays)
52+
return 0
5253

5354
def assert_healthy_app(app: web.Application) -> None:
5455
""" Diagnostics function that determins whether

services/web/server/src/simcore_service_webserver/diagnostics_entrypoints.py

+4-7
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717

1818

1919
async def check_health(request: web.Request):
20-
2120
# diagnostics of incidents
2221
try:
2322
assert_healthy_app(request.app)
@@ -50,13 +49,11 @@ async def get_diagnostics(request: web.Request):
5049
return web.json_response(data)
5150

5251

53-
def create_routes(specs: openapi.Spec) -> List[web.RouteDef]:
54-
# TODO: consider the case in which server creates routes for both v0 and v1!!!
55-
# TODO: should this be taken from servers instead?
56-
# TODO: routing will be done automatically using operation_id/tags, etc...
57-
52+
def create_rest_routes(specs: openapi.Spec) -> List[web.RouteDef]:
53+
# NOTE: these are routes with paths starting with v0/*
54+
5855
routes = []
59-
base_path = openapi.get_base_path(specs)
56+
base_path: str = openapi.get_base_path(specs)
6057

6158
path, handle = "/", check_health
6259
operation_id = specs.paths[path].operations["get"].operation_id

services/web/server/src/simcore_service_webserver/diagnostics_monitoring.py

+47-23
Original file line numberDiff line numberDiff line change
@@ -33,30 +33,49 @@ async def metrics_handler(request: web.Request):
3333

3434
def middleware_factory(app_name: str) -> Coroutine:
3535
@web.middleware
36-
async def middleware_handler(request: web.Request, handler):
36+
async def _middleware_handler(request: web.Request, handler):
3737
try:
3838
request[kSTART_TIME] = time.time()
3939
request.app[kREQUEST_IN_PROGRESS].labels(
4040
app_name, request.path, request.method
4141
).inc()
4242

4343
resp = await handler(request)
44-
unhandled_exception = None
44+
log_exception = None
45+
46+
assert isinstance(resp, web.Response), "Forgot envelope middleware?" # nsec
47+
48+
except web.HTTPServerError as exc:
49+
# Transforms exception into response object and log exception
50+
resp = exc
51+
log_exception = exc
4552

4653
except web.HTTPException as exc:
47-
# Transforms exception into response object
54+
# Transforms non-HTTPServerError exceptions into response object
4855
resp = exc
49-
unhandled_exception = None
56+
log_exception = None
5057

5158
except Exception as exc: # pylint: disable=broad-except
5259
# Transforms unhandled exceptions into responses with status 500
5360
# NOTE: Prevents issue #1025
5461
resp = web.HTTPInternalServerError(reason=str(exc))
55-
unhandled_exception = exc
62+
log_exception = exc
5663

5764
finally:
65+
assert isinstance( # nsec
66+
resp, web.Response # nsec
67+
), "Forgot envelope middleware or transformation?" # nsec
68+
5869
resp_time_secs: float = time.time() - request[kSTART_TIME]
5970

71+
exc_name = ""
72+
if log_exception:
73+
exc_name: str = log_exception.__class__.__name__
74+
75+
# Probes request latency
76+
request.app[kLATENCY_PROBE].observe(resp_time_secs)
77+
78+
# prometheus probes
6079
request.app[kREQUEST_LATENCY].labels(app_name, request.path).observe(
6180
resp_time_secs
6281
)
@@ -65,36 +84,29 @@ async def middleware_handler(request: web.Request, handler):
6584
app_name, request.path, request.method
6685
).dec()
6786

68-
exc_name: str = unhandled_exception.__class__.__name__ if unhandled_exception else ""
69-
7087
request.app[kREQUEST_COUNT].labels(
7188
app_name, request.method, request.path, resp.status, exc_name
7289
).inc()
7390

74-
if unhandled_exception:
75-
# NOTE: all access to API (i.e. and not other paths as /socket, /x, etc)
76-
# shall return web.HTTPErrors since processed by error_middleware_factory
77-
log.exception(
91+
if log_exception:
92+
log.error(
7893
'Unexpected server error "%s" from access: %s "%s %s" done in %3.2f secs. Responding with status %s',
79-
type(unhandled_exception),
94+
type(log_exception),
8095
request.remote,
8196
request.method,
8297
request.path,
8398
resp_time_secs,
8499
resp.status,
100+
exc_info=log_exception,
101+
stack_info=True,
85102
)
86103

87-
# Probes for on-the-fly stats ---
88-
# NOTE: might implement in the future some kind of statistical accumulator
89-
# to perform incremental calculations on the fly
90-
91-
# Probes request latency
92-
request.app[kLATENCY_PROBE].observe(resp_time_secs)
93-
94104
return resp
95105

96-
middleware_handler.__middleware_name__ = f"{__name__}.{app_name}"
97-
return middleware_handler
106+
# adds identifier
107+
_middleware_handler.__middleware_name__ = f"{__name__}.monitor_{app_name}"
108+
109+
return _middleware_handler
98110

99111

100112
def setup_monitoring(app: web.Application):
@@ -129,8 +141,20 @@ def setup_monitoring(app: web.Application):
129141
app[kLATENCY_PROBE] = DelayWindowProbe()
130142

131143
# WARNING: ensure ERROR middleware is over this one
132-
assert len(app.middlewares) >= 1 # nosec
133-
app.middlewares.append(middleware_factory("simcore_service_webserver"))
144+
#
145+
# non-API request/response (e.g /metrics, /x/* ...)
146+
# |
147+
# API request/response (/v0/*) |
148+
# | |
149+
# | |
150+
# v |
151+
# ===== monitoring-middleware =====
152+
# == rest-error-middlewarer ==== |
153+
# == ... == |
154+
# == rest-envelope-middleware == v
155+
#
156+
#
157+
app.middlewares.insert(0, middleware_factory("simcore_service_webserver"))
134158

135159
# TODO: in production, it should only be accessible to backend services
136160
app.router.add_get("/metrics", metrics_handler)

services/web/server/src/simcore_service_webserver/diagnostics_plugin.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,28 @@
66
from aiohttp import web
77

88
from servicelib import monitor_slow_callbacks
9+
from servicelib.application_setup import ModuleCategory, app_module_setup
910

1011
from .diagnostics_core import (
1112
IncidentsRegistry,
1213
kINCIDENTS_REGISTRY,
1314
kMAX_AVG_RESP_LATENCY,
1415
kMAX_TASK_DELAY,
1516
)
16-
from .diagnostics_entrypoints import create_routes
17+
from .diagnostics_entrypoints import create_rest_routes
1718
from .diagnostics_monitoring import setup_monitoring
1819
from .rest import APP_OPENAPI_SPECS_KEY
1920

2021
log = logging.getLogger(__name__)
2122

2223

24+
@app_module_setup(
25+
__name__,
26+
ModuleCategory.ADDON,
27+
depends=["simcore_service_webserver.rest"],
28+
config_section="diagnostics",
29+
logger=log,
30+
)
2331
def setup_diagnostics(
2432
app: web.Application,
2533
*,
@@ -64,9 +72,9 @@ def setup_diagnostics(
6472
app[kMAX_AVG_RESP_LATENCY] = max_avg_response_latency
6573
log.info("max_avg_response_latency = %3.2f secs ", max_avg_response_latency)
6674

67-
#
68-
# TODO: redesign ... to convoluted!!
69-
registry = IncidentsRegistry(order_by=attrgetter("delay_secs"))
75+
#
76+
# TODO: redesign ... too convoluted!!
77+
registry = IncidentsRegistry(order_by=attrgetter("delay_secs"))
7078
app[kINCIDENTS_REGISTRY] = registry
7179

7280
monitor_slow_callbacks.enable(max_task_delay, registry)
@@ -75,5 +83,5 @@ def setup_diagnostics(
7583
setup_monitoring(app)
7684

7785
# adds other diagnostic routes: healthcheck, etc
78-
routes = create_routes(specs=app[APP_OPENAPI_SPECS_KEY])
86+
routes = create_rest_routes(specs=app[APP_OPENAPI_SPECS_KEY])
7987
app.router.add_routes(routes)

0 commit comments

Comments
 (0)