From 83d29d48110eb30b9a992b68d76fccaa8759fe1a Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Mon, 29 Aug 2022 11:58:05 +0000 Subject: [PATCH 1/8] [ops] Meta Overview/server: Fix unit of "API Request Error rate" to be reqps --- .../dashboards/components/meta-overview.json | 144 +++++++-- .../meta/dashboards/components/server.json | 297 ++++++++++++++++-- 2 files changed, 394 insertions(+), 47 deletions(-) diff --git a/operations/observability/mixins/meta/dashboards/components/meta-overview.json b/operations/observability/mixins/meta/dashboards/components/meta-overview.json index 9487be370c7d23..840a6264bce334 100644 --- a/operations/observability/mixins/meta/dashboards/components/meta-overview.json +++ b/operations/observability/mixins/meta/dashboards/components/meta-overview.json @@ -3,7 +3,10 @@ "list": [ { "builtIn": 1, - "datasource": "-- Grafana --", + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", @@ -21,12 +24,15 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "iteration": 1646144275104, "links": [], "liveNow": false, "panels": [ { "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, @@ -35,6 +41,15 @@ }, "id": 18, "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], "title": "General overview", "type": "row" }, @@ -48,6 +63,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -110,7 +127,8 @@ "max" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -120,6 +138,9 @@ "pluginVersion": "8.2.2", "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": 
"sum(rate(gitpod_server_api_calls_total{cluster=~\"$cluster\"}[5m])) by (cluster, method)", "interval": "", @@ -133,6 +154,7 @@ }, { "datasource": { + "type": "prometheus", "uid": "$datasource" }, "fieldConfig": { @@ -141,7 +163,8 @@ "mode": "palette-classic" }, "custom": { - "axisLabel": "", + "axisCenteredZero": false, + "axisColorMode": "text", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -170,7 +193,6 @@ }, "decimals": 2, "mappings": [], - "max": 1, "min": 0, "thresholds": { "mode": "absolute", @@ -185,7 +207,7 @@ } ] }, - "unit": "percentunit" + "unit": "reqps" }, "overrides": [] }, @@ -204,7 +226,8 @@ "max" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -214,11 +237,16 @@ "pluginVersion": "8.2.2", "targets": [ { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", "exemplar": true, "expr": "sum(\n rate(gitpod_server_api_calls_total{cluster=~\"$cluster\", statusCode!~\"2..|429\"}[5m])\n) by (method)", "interval": "", "legendFormat": "{{method}}", "queryType": "randomWalk", + "range": true, "refId": "A" } ], @@ -236,6 +264,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -294,7 +324,8 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "multi", @@ -303,6 +334,9 @@ }, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "sum(server_websocket_connection_count{cluster=~\"$cluster\"}) by (cluster, clientType)", "interval": "", @@ -324,6 +358,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -379,7 +415,8 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + 
"placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -413,6 +450,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -486,7 +525,8 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -531,6 +571,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -587,7 +629,8 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -596,6 +639,9 @@ }, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "sum (gitpod_version_info{cluster=~\"$cluster\"}) by (cluster, gitpod_version)", "interval": "", @@ -608,6 +654,10 @@ }, { "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, @@ -616,6 +666,15 @@ }, "id": 20, "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], "title": "External metrics", "type": "row" }, @@ -630,6 +689,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -685,7 +746,8 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -710,6 +772,10 @@ }, { "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, @@ -718,6 +784,15 @@ }, "id": 22, "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], "title": "Messagebus 
health", "type": "row" }, @@ -731,6 +806,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -793,7 +870,8 @@ "max" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -803,6 +881,9 @@ "pluginVersion": "8.2.2", "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "sum(\n rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", container!=\"POD\", pod=~\"messagebus.*\"}[1m])\n) by (pod, cluster, node)", "interval": "", @@ -824,6 +905,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -885,7 +968,8 @@ "max" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -895,6 +979,9 @@ "pluginVersion": "8.2.2", "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "sum(container_memory_working_set_bytes{cluster=~\"$cluster\", container!=\"POD\", container!=\"\", pod=~\"messagebus.*\"}) by (pod, cluster, node)", "interval": "", @@ -916,6 +1003,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -975,7 +1064,8 @@ "lastNotNull" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -985,6 +1075,9 @@ "pluginVersion": "8.2.2", "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "sum (\n rate(container_network_receive_bytes_total{container!=\"POD\", cluster=~\"$cluster\", pod=~\"messagebus.*\"}[1m])\n) by (pod, cluster, node)", "interval": "", @@ -993,6 +1086,9 @@ "refId": "A" }, { + "datasource": { + 
"uid": "$datasource" + }, "exemplar": true, "expr": "sum (\n rate(container_network_transmit_bytes_total{container!=\"POD\", cluster=~\"$cluster\", pod=~\"messagebus.*\"}[1m])\n) by (pod, cluster, node)", "interval": "", @@ -1015,6 +1111,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1074,7 +1172,8 @@ "lastNotNull" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -1084,6 +1183,9 @@ "pluginVersion": "8.2.2", "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "kube_pod_container_status_running{cluster=~\"$cluster\", pod=~\"messagebus.*\"} == 1 ", "interval": "", @@ -1092,6 +1194,9 @@ "refId": "A" }, { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "(\n sum by (pod) (kube_pod_container_status_terminated{cluster=~\"$cluster\", pod=~\"messagebus.*\"}) == 1\n) * on(pod) group_left(reason) (\n sum by (pod, reason) (kube_pod_container_status_terminated_reason{cluster=~\"$cluster\", pod=~\"messagebus.*\"}) == 1\n)", "interval": "", @@ -1100,6 +1205,9 @@ "refId": "B" }, { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "(\n sum by (pod) (kube_pod_container_status_waiting{cluster=~\"$cluster\", pod=~\"messagebus.*\"}) == 1\n) * on(pod) group_left(reason) (\n sum by (pod, reason) (kube_pod_container_status_waiting_reason{cluster=~\"$cluster\", pod=~\"messagebus.*\"}) == 1\n)", "interval": "", @@ -1112,7 +1220,7 @@ "type": "timeseries" } ], - "schemaVersion": 35, + "schemaVersion": 37, "style": "dark", "tags": [ "gitpod-mixin" @@ -1176,4 +1284,4 @@ "uid": "Gj5DE-O7k", "version": 1, "weekStart": "" -} \ No newline at end of file +} diff --git a/operations/observability/mixins/meta/dashboards/components/server.json b/operations/observability/mixins/meta/dashboards/components/server.json index 
21bd404e0e2bfd..6c997c60d12a79 100644 --- a/operations/observability/mixins/meta/dashboards/components/server.json +++ b/operations/observability/mixins/meta/dashboards/components/server.json @@ -3,7 +3,10 @@ "list": [ { "builtIn": 1, - "datasource": "-- Grafana --", + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", @@ -21,12 +24,15 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, - "iteration": 1643631956016, "links": [], "liveNow": false, "panels": [ { "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, @@ -35,6 +41,15 @@ }, "id": 42, "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], "title": "server Metrics", "type": "row" }, @@ -76,7 +91,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.4", + "pluginVersion": "9.1.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -86,6 +101,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "sum(rate(gitpod_server_api_calls_total{cluster=~\"$cluster\", pod=~\"$pod\", method=~\"$method\"}[5m])) by (method)", "interval": "", @@ -164,7 +182,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.4", + "pluginVersion": "9.1.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -174,6 +192,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "sum(\n rate(gitpod_server_api_calls_total{cluster=~\"$cluster\", pod=~\"$pod\", method=~\"$method\", statusCode!~\"2..|429\"}[5m])\n) by (method)\n/\nsum(\n rate(gitpod_server_api_calls_total{cluster=~\"$cluster\", pod=~\"$pod\", method=~\"$method\"}[5m])\n) by (method)", "interval": "", "legendFormat": "{{method}}", @@ -197,14 +218,15 @@ }, "yaxes": [ { + "$$hashKey": "object:62", 
"decimals": 2, - "format": "percentunit", + "format": "reqps", "logBase": 1, - "max": "1", "min": "0", "show": true }, { + "$$hashKey": "object:63", "format": "short", "logBase": 1, "show": true @@ -226,6 +248,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -285,10 +309,12 @@ "legend": { "calcs": [], "displayMode": "table", - "placement": "right" + "placement": "right", + "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "pluginVersion": "8.3.3", @@ -351,6 +377,21 @@ "uid": "${datasource}" }, "description": "A heatmap of the API requests duration.", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, "gridPos": { "h": 8, "w": 12, @@ -364,7 +405,43 @@ "legend": { "show": false }, - "pluginVersion": "8.3.3", + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "scale": "exponential", + "scheme": "Greens", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "9.1.0", "reverseYBuckets": false, "targets": [ { @@ -408,6 +485,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -470,10 +549,12 @@ "max" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { - "mode": 
"single" + "mode": "single", + "sort": "none" } }, "pluginVersion": "8.3.1", @@ -505,6 +586,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -560,14 +643,19 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "sum(rate(gitpod_server_api_connections_closed_total{cluster=~\"$cluster\", pod=~\"$pod\"}[5m])) by (cluster, pod)", "hide": false, @@ -590,6 +678,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -645,14 +735,19 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "sum(rate(gitpod_server_api_connections_total{cluster=~\"$cluster\", pod=~\"$pod\"}[5m])) by (cluster, pod)", "interval": "", @@ -699,7 +794,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.4", + "pluginVersion": "9.1.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -781,6 +876,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -839,14 +936,19 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { - "mode": "multi" + "mode": "multi", + "sort": "none" } }, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "sum 
(server_websocket_connection_count{cluster=~\"$cluster\", pod=~\"$pod\"}) by (cluster, pod)", "interval": "", @@ -868,6 +970,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -926,14 +1030,19 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { - "mode": "multi" + "mode": "multi", + "sort": "none" } }, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "sum(server_websocket_connection_count{cluster=~\"$cluster\", pod=~\"$pod\"}) by (cluster, clientType)", "interval": "", @@ -954,6 +1063,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1009,14 +1120,19 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "sum(rate(gitpod_server_api_calls_user_total{cluster=~\"$cluster\", method=~\"ts.*\"}[5m])) by (cluster, method) * 60", "interval": "", @@ -1037,6 +1153,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1093,14 +1211,19 @@ "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "sum (gitpod_version_info{}) by (cluster, gitpod_version)", "interval": "", @@ -1121,6 +1244,8 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + 
"axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1176,14 +1301,19 @@ "legend": { "calcs": [], "displayMode": "table", - "placement": "right" + "placement": "right", + "showLegend": true }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "exemplar": true, "expr": "kube_pod_container_info{cluster=~\"$cluster\", pod=~\"$pod\", image=~\".+\", container=\"server\"}", "interval": "", @@ -1197,6 +1327,10 @@ }, { "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, @@ -1250,6 +1384,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "process_resident_memory_bytes{cluster=~\"$cluster\", pod=~\"$pod\", job=\"server\"}", "interval": "", "legendFormat": "{{cluster}} - {{pod}}", @@ -1339,6 +1476,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "rate(process_cpu_seconds_total{cluster=~\"$cluster\", job=\"server\", pod=~\"$pod\"}[1m])", "interval": "", "intervalFactor": 2, @@ -1434,6 +1574,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "nodejs_eventloop_lag_seconds{cluster=~\"$cluster\", job=\"server\", pod=~\"$pod\"}", "interval": "", "legendFormat": "{{cluster}} - {{pod}}", @@ -1522,6 +1665,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "nodejs_active_handles_total{cluster=~\"$cluster\", job=\"server\", pod=~\"$pod\"}", "interval": "", "legendFormat": "{{cluster}} - {{pod}}", @@ -1560,11 +1706,24 @@ } } ], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], "title": "Node.js Runtime Metrics", "type": "row" }, { "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, @@ -1573,6 +1732,15 @@ }, 
"id": 50, "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], "title": "HTTP Metrics", "type": "row" }, @@ -1743,6 +1911,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "histogram_quantile(0.99, \n sum(rate(gitpod_server_http_request_duration_seconds_bucket{cluster=~\"$cluster\", route=~\"$http_route\"}[5m])) by (cluster, le)\n )", "interval": "", "legendFormat": "{{cluster}} - 99th percentile", @@ -1750,6 +1921,9 @@ "refId": "A" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "histogram_quantile(0.95, \n sum(rate(gitpod_server_http_request_duration_seconds_bucket{cluster=~\"$cluster\", route=~\"$http_route\"}[5m])) by (cluster, le)\n )", "hide": false, "interval": "", @@ -1758,6 +1932,9 @@ "refId": "B" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "histogram_quantile(0.50, \n sum(rate(gitpod_server_http_request_duration_seconds_bucket{cluster=~\"$cluster\", route=~\"$http_route\"}[5m])) by (cluster, le)\n )", "hide": false, "interval": "", @@ -1766,6 +1943,9 @@ "refId": "C" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "sum(rate(gitpod_server_http_request_duration_seconds_sum{cluster=~\"$cluster\", route=~\"$http_route\"}[5m])) by (cluster)\n/\nsum(rate(gitpod_server_http_request_duration_seconds_count{cluster=~\"$cluster\", route=~\"$http_route\"}[5m])) by (cluster)", "hide": false, "interval": "", @@ -1806,6 +1986,10 @@ }, { "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, @@ -1859,6 +2043,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "sum(container_memory_working_set_bytes{container!=\"POD\", container!=\"\", cluster=~\"$cluster\", node=~\"$node\", pod=~\"$pod\"}) by (pod, cluster, node)", "interval": "", "legendFormat": "{{cluster}} - {{node}} - {{pod}}", @@ -1945,6 +2132,9 @@ "steppedLine": 
false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "sum(\n rate(container_cpu_usage_seconds_total{container!=\"POD\", cluster=~\"$cluster\", node=~\"$node\", pod=~\"$pod\"}[1m])\n) by (pod, cluster, node)", "interval": "", "legendFormat": "{{cluster}} - {{node}} - {{pod}} - Cores being used", @@ -2057,7 +2247,8 @@ "max" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single" @@ -2126,6 +2317,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "sum (\n rate(container_network_receive_bytes_total{container!=\"POD\", cluster=~\"$cluster\", node=~\"$node\", pod=~\"$pod\"}[1m])\n) by (pod, cluster, node)", "interval": "", "legendFormat": "{{cluster}} - {{node}} - {{pod}} - Received", @@ -2133,6 +2327,9 @@ "refId": "A" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "sum (\n rate(container_network_transmit_bytes_total{container!=\"POD\", cluster=~\"$cluster\", node=~\"$node\", pod=~\"$pod\"}[1m])\n) by (pod, cluster, node)", "interval": "", "legendFormat": "{{cluster}} - {{node}} - {{pod}} - Transmitted", @@ -2217,6 +2414,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "sum (\n rate(container_network_receive_packets_dropped_total{container!=\"POD\", pod!=\"\", cluster=~\"$cluster\", node=~\"$node\", pod=~\"$pod\"}[1m])\n) by (pod, cluster, node)", "interval": "", "legendFormat": "{{cluster}} - {{node}} - {{pod}} - Receive", @@ -2224,6 +2424,9 @@ "refId": "A" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "sum (\n rate(container_network_transmit_packets_dropped_total{container!=\"POD\", pod!=\"\", cluster=~\"$cluster\", node=~\"$node\", pod=~\"$pod\"}[1m])\n) by (pod, cluster, node)", "interval": "", "legendFormat": "{{cluster}} - {{node}} - {{pod}} - Transmit", @@ -2308,6 +2511,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + 
}, "expr": "sum (\n rate(container_network_receive_errors_total{container!=\"POD\", pod!=\"\", cluster=~\"$cluster\", node=~\"$node\", pod=~\"$pod\"}[1m])\n) by (pod, cluster, node)", "interval": "", "legendFormat": "{{cluster}} - {{node}} - {{pod}} - Received", @@ -2315,6 +2521,9 @@ "refId": "A" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "sum (\n rate(container_network_transmit_errors_total{container!=\"POD\", pod!=\"\", cluster=~\"$cluster\", node=~\"$node\", pod=~\"$pod\"}[1m])\n) by (pod, cluster, node)", "interval": "", "legendFormat": "{{cluster}} - {{node}} - {{pod}} - Transmitted", @@ -2401,6 +2610,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "rate(kube_pod_container_status_restarts_total{cluster=~\"$cluster\", pod=~\"$pod\"}[1m])", "interval": "", "legendFormat": "{{cluster}} - {{kubernetes_pod_node_name}} - {{pod}} ", @@ -2487,6 +2699,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "kube_pod_container_status_running{cluster=~\"$cluster\", pod=~\"$pod\"} == 1 ", "interval": "", "legendFormat": "{{pod}} - RUNNING", @@ -2494,6 +2709,9 @@ "refId": "A" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "(\n sum by (pod) (kube_pod_container_status_terminated{cluster=~\"$cluster\", pod=~\"$pod\"}) == 1\n) * on(pod) group_left(reason) (\n sum by (pod, reason) (kube_pod_container_status_terminated_reason{cluster=~\"$cluster\", pod=~\"$pod\"}) == 1\n)", "interval": "", "legendFormat": "{{pod}} - TERMINATED -> {{reason}}", @@ -2501,6 +2719,9 @@ "refId": "B" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "(\n sum by (pod) (kube_pod_container_status_waiting{cluster=~\"$cluster\", pod=~\"$pod\"}) == 1\n) * on(pod) group_left(reason) (\n sum by (pod, reason) (kube_pod_container_status_waiting_reason{cluster=~\"$cluster\", pod=~\"$pod\"}) == 1\n)", "interval": "", "legendFormat": "{{pod}} - WAITING -> {{reason}}", @@ -2585,6 +2806,9 @@ 
"steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "kube_deployment_spec_replicas{cluster=~\"$cluster\", deployment=\"server\"}", "interval": "", "legendFormat": "{{cluster}} - {{deployment}} - Desired", @@ -2592,6 +2816,9 @@ "refId": "C" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "kube_deployment_status_replicas_available{cluster=~\"$cluster\", deployment=\"server\"}", "interval": "", "legendFormat": "{{cluster}} - {{deployment}} - Available replicas", @@ -2599,6 +2826,9 @@ "refId": "A" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "kube_deployment_status_replicas_unavailable{cluster=~\"$cluster\", deployment=\"server\"}", "interval": "", "legendFormat": "{{cluster}} - {{deployment}} - Unvailable replicas", @@ -2638,12 +2868,21 @@ } } ], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], "title": "Pod Metrics", "type": "row" } ], "refresh": false, - "schemaVersion": 34, + "schemaVersion": 37, "style": "dark", "tags": [ "gitpod-mixin" @@ -2841,4 +3080,4 @@ "uid": "server", "version": 1, "weekStart": "" -} \ No newline at end of file +} From edfc822c3cd8f5d66eda1de7c77fb54124bc3423 Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Tue, 30 Aug 2022 14:02:40 +0000 Subject: [PATCH 2/8] [ops] WebApp: Internal alert on JSON RPC error rates --- .../rules/components/server/alerts.libsonnet | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet index 4ec0f9812208f8..7afe606b80ade4 100644 --- a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet +++ b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet @@ -49,6 +49,23 @@ description: 'Server cannot start workspace instances on workspace clusters.', }, }, + // Rollout 
alerts + { + alert: 'JsonRpcApiErrorRates', + // Reasoning: the values are taken from past data + expr: 'sum (rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) / sum(rate(gitpod_server_api_calls_total[5m])) > 0.04', + 'for': '5m', + labels: { + // sent to the team internal channel until we fine tuned it + severity: 'warning', + team: 'webapp' + }, + annotations: { + runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodApiErrorRate.md', + summary: 'The error rate of the JSON RPC API is high. Investigation required.', + description: 'JSON RPC API error rate high', + }, + }, ], }, ], From e01c30b52320b0947b494431df62b76f9e608891 Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Tue, 30 Aug 2022 14:23:28 +0000 Subject: [PATCH 3/8] [ops] WebApp: high websocket connection rate --- .../rules/components/server/alerts.libsonnet | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet index 7afe606b80ade4..b28be14f2cce5c 100644 --- a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet +++ b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet @@ -66,6 +66,22 @@ description: 'JSON RPC API error rate high', }, }, + { + alert: 'WebsocketConnectionRateHigh', + // Reasoning: the values are taken from past data + expr: 'sum(rate(server_websocket_connection_count[2m])) > 30', + 'for': '5m', + labels: { + // sent to the team internal channel until we fine tuned it + severity: 'warning', + team: 'webapp' + }, + annotations: { + runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionRateHigh.md', + summary: 'The websocket connection rate is higher than usual. 
Investigation required.', + description: 'Websocket connection rate high', + }, + }, ], }, ], From a94af2e2580b0b5c3c63bc6326ed5a169bd269e9 Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Tue, 30 Aug 2022 14:31:25 +0000 Subject: [PATCH 4/8] [ops] WebApp: alert on messagebus not running --- .../meta/rules/components/server/alerts.libsonnet | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet index b28be14f2cce5c..b9a849668681a6 100644 --- a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet +++ b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet @@ -82,6 +82,21 @@ description: 'Websocket connection rate high', }, }, + { + alert: 'MessagebusNotRunning', + expr: 'up{job="messagebus"} < 1', + 'for': '30s', + labels: { + // sent to the team internal channel until we fine tuned it + severity: 'warning', + team: 'webapp' + }, + annotations: { + runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/MessagebusNotRunning.md', + summary: 'The messagebus pod is not running. 
Investigation required.', + description: 'Messagebus pod not running', + }, + }, ], }, ], From 7f4684f6f2cbb2e1ea937ea510e073b6c05b282e Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Tue, 30 Aug 2022 14:35:14 +0000 Subject: [PATCH 5/8] [ops] WebApp: alert if db-sync is not running --- .../rules/components/server/alerts.libsonnet | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet index b9a849668681a6..950a11b9f5fca1 100644 --- a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet +++ b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet @@ -82,6 +82,25 @@ description: 'Websocket connection rate high', }, }, + /** + * TODO(gpl) This will be true for US all the time. Can we exclude that cluster somehow? + * { + * alert: 'db-sync not running', + * expr: 'sum (kube_pod_status_phase{pod=~"db-sync.*"}) by (pod) < 1', + * 'for': '5m', + * labels: { + * // sent to the team internal channel until we fine tuned it + * severity: 'warning', + * team: 'webapp' + * }, + * annotations: { + * runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/DbSyncNotRunning.md', + * summary: 'The db-sync pod is not running. 
Investigation required.', + * description: 'db-sync pod not running', + * }, + * }, + * + */ { alert: 'MessagebusNotRunning', expr: 'up{job="messagebus"} < 1', From e899ee95bffd1ca86899c0c0d9d4063c28f0e3f6 Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Tue, 30 Aug 2022 14:52:52 +0000 Subject: [PATCH 6/8] [ops] WebApp: Alerts on excessive RAM and CPU usage --- .../rules/components/server/alerts.libsonnet | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet index 950a11b9f5fca1..7bb23b79438280 100644 --- a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet +++ b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet @@ -116,6 +116,38 @@ description: 'Messagebus pod not running', }, }, + { + alert: 'WebAppServicesHighMemoryUsage', + // Reasoning: high rates of RAM consumption should only be temporary. Values based on past data (around 5-10 is constant) + expr: 'sum(rate(container_memory_working_set_bytes{container!="POD", node=~".*", pod=~"(server|ws-manager-bridge|usage)-.*"}[30m])) by (pod, node) > 10000000', + 'for': '15m', + labels: { + // sent to the team internal channel until we fine tuned it + severity: 'warning', + team: 'webapp' + }, + annotations: { + runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesHighMemoryUsage.md', + summary: 'WebApp services consume excessive amounts of memory. Investigation required.', + description: 'WebApp Services excessive memory usage', + }, + }, + { + alert: 'WebAppServicesHighCPUUsage', + // Reasoning: high rates of CPU consumption should only be temporary. 
+ expr: 'sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node) > 0.80', + 'for': '10m', + labels: { + // sent to the team internal channel until we fine tuned it + severity: 'warning', + team: 'webapp' + }, + annotations: { + runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesHighCPUUsage.md', + summary: 'WebApp services consume excessive amounts of CPU. Investigation required.', + description: 'WebApp Services excessive CPU USAGE', + }, + }, ], }, ], From 8f29e34f58895217903fb2f243aced77cdb932ac Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Wed, 31 Aug 2022 11:26:53 +0000 Subject: [PATCH 7/8] [ops] WebApp: Alert on services crashlooping --- .../meta/rules/components/server/alerts.libsonnet | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet index 7bb23b79438280..3b570dea21646a 100644 --- a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet +++ b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet @@ -148,6 +148,21 @@ description: 'WebApp Services excessive CPU USAGE', }, }, + { + alert: 'WebAppServicesCrashlooping', + expr: 'rate(kube_pod_container_status_restarts_total{pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[15m]) * 60 * 5 > 0', + 'for': '15m', + labels: { + // sent to the team internal channel until we fine tuned it + severity: 'warning', + team: 'webapp' + }, + annotations: { + runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesCrashlooping.md', + summary: 'Pod is crash looping.', + description: 
'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes', + }, + }, ], }, ], From bdf673b430b3efece0b6978637866f3137eac33e Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Wed, 31 Aug 2022 13:32:38 +0000 Subject: [PATCH 8/8] [ops] WebApp: review comments --- .../mixins/meta/rules/components/server/alerts.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet index 3b570dea21646a..a1ea19122b2938 100644 --- a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet +++ b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet @@ -104,7 +104,7 @@ { alert: 'MessagebusNotRunning', expr: 'up{job="messagebus"} < 1', - 'for': '30s', + 'for': '2m', labels: { // sent to the team internal channel until we fine tuned it severity: 'warning', @@ -112,7 +112,7 @@ }, annotations: { runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/MessagebusNotRunning.md', - summary: 'The messagebus pod is not running. Investigation required.', + summary: 'The messagebus pod is not running. Workspace information is not being correctly propagated into web app clusters. Investigation required.', description: 'Messagebus pod not running', }, }, @@ -145,7 +145,7 @@ annotations: { runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesHighCPUUsage.md', summary: 'WebApp services consume excessive amounts of CPU. Investigation required.', - description: 'WebApp Services excessive CPU USAGE', + description: 'WebApp Services excessive CPU usage', }, }, {