Skip to content

Commit 25cba5e

Browse files
committed
initial commit
0 parents  commit 25cba5e

File tree

661 files changed

+97214
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

661 files changed

+97214
-0
lines changed

LICENSE

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2021 Gitpod GmbH
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

Makefile

+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# Makefile for the observability repository: installs the jsonnet tooling and
# drives formatting, linting, manifest generation, deployment and e2e tests.

# Directory the tools listed in hack/tools.go are built into. $(CURDIR) is used
# instead of $(shell pwd) so that no shell is forked each time it is expanded.
BIN_DIR ?= $(CURDIR)/tmp/bin
JB_BIN := $(BIN_DIR)/jb
GOJSONTOYAML_BIN := $(BIN_DIR)/gojsontoyaml
JSONNET_BIN := $(BIN_DIR)/jsonnet
JSONNETFMT_BIN := $(BIN_DIR)/jsonnetfmt
TOOLING := $(JSONNETFMT_BIN) $(JSONNET_BIN) $(GOJSONTOYAML_BIN) $(JB_BIN)

JSONNET_FMT := $(JSONNETFMT_BIN) -n 2 --max-blank-lines 2 --string-style s --comment-style s

# Every recipe line runs in its own shell, so an `export PATH=...` placed inside
# a recipe is lost as soon as that line finishes. Exporting from make itself
# makes the locally built tools visible to every recipe and the scripts they call.
export PATH := $(PATH):$(BIN_DIR)

.PHONY: all
all: setup-workspace fmt lint generate

# Delete files marked in .gitignore
.PHONY: clean
clean:
	git clean -Xfd .

# Installs the command-line tools with `go get` (into Go's own bin directory).
# NOTE(review): the version suffix on the promtool line ('[email protected]')
# looks mangled; restore the intended `promtool@<version>` pin.
.PHONY: setup-workspace
setup-workspace:
	go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb
	go get github.com/brancz/gojsontoyaml
	go get github.com/google/go-jsonnet/cmd/jsonnet
	go get github.com/google/go-jsonnet/cmd/jsonnetfmt
	GO111MODULE=on go get github.com/prometheus/prometheus/cmd/[email protected]

# Runs hack/generate.sh (per the README: renders the YAML manifests and Grafana
# dashboards from the jsonnet templates).
.PHONY: generate
generate: $(JSONNET_BIN)
	./hack/generate.sh

# Same script, invoked with `-e CI`.
.PHONY: generate-ci
generate-ci: $(JSONNET_BIN)
	./hack/generate.sh -e CI

# Reformat every *.libsonnet / *.jsonnet file in place, skipping vendor/.
.PHONY: fmt
fmt: $(JSONNETFMT_BIN)
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		xargs -n 1 -- $(JSONNET_FMT) -i

# Print a diff for every file that is not correctly formatted, skipping vendor/.
# The status is accumulated across all files: a bare `while` loop would only
# report the exit status of the last file checked, hiding earlier failures.
.PHONY: lint
lint: $(JSONNETFMT_BIN)
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
	{ \
		status=0; \
		while IFS= read -r f; do \
			$(JSONNET_FMT) "$$f" | diff -u "$$f" - || status=1; \
		done; \
		exit $$status; \
	}

.PHONY: promtool-lint
promtool-lint:
	promtool check rules monitoring-satellite/manifests/ci_prometheus_rules.yaml

$(BIN_DIR):
	mkdir -p $@

# $(BIN_DIR) is an order-only prerequisite: it must exist, but its timestamp
# (which changes whenever a file is written into it) must not trigger rebuilds.
$(TOOLING): | $(BIN_DIR)
	@echo Installing tools from tools.go
	@cd hack && grep _ tools.go | awk -F'"' '{print $$2}' | xargs -tI % go build -modfile=go.mod -o $(BIN_DIR) %

.PHONY: update
update: $(JB_BIN)
	$(JB_BIN) update

.PHONY: deploy-satellite
deploy-satellite: generate
	./hack/prepare-kind.sh
	./hack/deploy-satellite.sh

# -count=1 keeps `go test` from reusing cached results.
.PHONY: test-e2e
test-e2e:
	@cd tests/e2e && go test -timeout 55m -v . -count=1

README.md

+122
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
# Observability
2+
3+
[![Build Status](https://github.com/gitpod-com/observability/workflows/ci/badge.svg)](https://github.com/gitpod-com/observability/actions)
4+
[![Slack](https://img.shields.io/badge/join%20slack-%23observability-brightgreen.svg)](https://gitpod.slack.com/archives/C01KGM9D8LE)
5+
[![Gitpod ready-to-code](https://img.shields.io/badge/Gitpod-ready--to--code-908a85?logo=gitpod)](https://gitpod.io/#https://github.com/gitpod-com/observability)
6+
7+
Set of Jsonnet files used to deploy customized [monitoring-satellites](#monitoring-satellite) and [monitoring-centrals](#monitoring-central) into different clusters.
8+
9+
## Table of contents
10+
11+
- [Applications](#applications)
12+
- [Monitoring-satellite](#monitoring-satellite)
13+
- [Monitoring-Central](#monitoring-central)
14+
- [Workflows](#workflows)
15+
- [Development](#development)
16+
- [CI](#ci)
17+
- [Deployment](#deployment)
18+
19+
## Applications
20+
21+
### Monitoring-satellite
22+
23+
Monitoring-satellite is an application responsible for collecting observability signals from kubernetes clusters. Components included in monitoring-satellite:
24+
25+
* [Prometheus-Operator](https://github.com/prometheus-operator/prometheus-operator)
26+
* [Prometheus](https://github.com/prometheus/prometheus)
27+
* [Alertmanager](https://github.com/prometheus/alertmanager)
28+
* [Node-exporter](https://github.com/prometheus/node_exporter)
29+
* [Kube-State-Metrics](https://github.com/kubernetes/kube-state-metrics)
30+
* [Grafana](https://github.com/grafana/grafana)
31+
* Custom ServiceMonitors for [Gitpod](https://github.com/gitpod-io/gitpod)'s components
32+
33+
Monitoring-satellite can be customized by setting up jsonnet external-variables:
34+
35+
* `namespace` - changes the namespace where monitoring-satellite will be installed
36+
* `cluster_name` - adds an external label named `cluster` to Prometheus. This label is extremely important to differentiate metrics coming from multiple clusters after being stored in monitoring-central.
37+
* `remote_write_url` - When defining this variable with something different from an empty string, Prometheus will send metrics to a Metrics backend, e.g. Thanos or Cortex, through Prometheus' Remote Write Protocol.
38+
* `pagerduty_routing_key` - Used to route critical alerts to pagerduty.
39+
* `slack_webhook_url_critical` - When defining this variable with something different from an empty string, Alertmanager will be configured to route alerts to Slack. **Careful:** When declaring this variable, you should also declare `slack_webhook_url_warning` and `slack_webhook_url_info`, which will route alerts from lower severities to different channels.
40+
* `dns_name` - When defining this variable with something different from an empty string, a set of extra resources will be created to expose Grafana to the internet while keeping it secure. When defining this variable, be careful to also declare `grafana_ingress_node_port`, `gcp_external_ip_address`, `IAP_client_id` and `IAP_client_secret`. The components included are:
41+
* Ingress
42+
* SSL Certificate (Requires certmanager installed in the cluster)
43+
* Google Cloud Backend Config
44+
45+
#### Monitoring-satellite RoadMap
46+
47+
As you can see, Metrics is the only Observability signal being collected by monitoring-satellite right now. To make it a complete Observability signal collector, we'll extend this application to collect:
48+
49+
* `Logs` - With [Promtail](https://grafana.com/docs/loki/latest/clients/promtail/) or [Fluentd](https://www.fluentd.org/)
50+
* `Traces` - With [Jaeger Agent](https://www.jaegertracing.io/docs/1.22/deployment/) or [OpenTelemetry Collector](https://github.com/open-telemetry/opentelemetry-collector)
51+
* `Profiles` - With [ConProf](https://github.com/conprof/conprof)
52+
53+
### Monitoring-Central
54+
55+
Monitoring-central is an application responsible for the long-term storage of the signals collected by multiple monitoring-satellites. Monitoring-central is the best place to analyze data during incidents or for historical trend analysis. Components included in monitoring-central:
56+
57+
* [Grafana](https://github.com/grafana/grafana)
58+
* [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
59+
60+
Monitoring-central can be customized by setting up jsonnet external-variables:
61+
62+
* `dns_name` - When defining this variable with something different from an empty string, a set of extra resources will be created to expose Grafana to the internet while keeping it secure. When defining this variable, be careful to also declare `grafana_ingress_node_port`, `gcp_external_ip_address`, `IAP_client_id` and `IAP_client_secret`. The components included are:
63+
* Ingress
64+
* SSL Certificate (Requires certmanager installed in the cluster)
65+
* Google Cloud Backend Config
66+
67+
#### Monitoring-central RoadMap
68+
69+
Similarly to monitoring-satellite, monitoring-central only supports metric collection right now. To make it a complete Observability signal backend storage, we'll extend this application to store:
70+
71+
* `Logs` - With [Loki](https://github.com/grafana/loki)
72+
* `Traces` - With [Jaeger](https://github.com/jaegertracing/jaeger) or [Tempo](https://github.com/grafana/tempo)
73+
* `Profiles` - With [ConProf](https://github.com/conprof/conprof)
74+
75+
> To accelerate the development of monitoring-central, we are strongly considering teaming up with the Red Hat Monitoring Team to use [Observatorium](https://github.com/observatorium/observatorium) as our storage for all observability signals.
76+
77+
## Workflows
78+
79+
### Development
80+
81+
See [docs/code-design](./docs/code-design.md) for details on our folder structure.
82+
83+
During development we generate YAML files and Grafana dashboards based on our jsonnet templates.
84+
85+
**Notice**: These YAML files are only used during development and CI. For development/ci the entrypoints are `monitoring-*/manifests/*.jsonnet` whereas for ArgoCD the entrypoint is `monitoring-*/main.jsonnet`.
86+
87+
To generate the YAML files and Grafana dashboards run the command below.
88+
89+
```sh
90+
make generate
91+
```
92+
93+
The generated files are placed in `monitoring-*/manifests` - while working on the jsonnet templates it can sometimes be helpful to check out the generated YAML to see if everything looks the way you expected.
94+
95+
If you'd like to test Grafana dashboards during development, you can copy the content of the JSON files located at `components/gitpod/mixin/dashboard_out` and import it to Grafana using the import feature:
96+
97+
![image](https://user-images.githubusercontent.com/24193764/118832120-ba971200-b896-11eb-81aa-840dadecd21b.png)
98+
99+
100+
To make sure that all our jsonnet templates can compile and are correctly formatted run:
101+
102+
```sh
103+
make fmt
104+
```
105+
106+
If you are changing Prometheus rules you can additionally run:
107+
108+
```sh
109+
make promtool-lint
110+
```
111+
112+
### CI
113+
114+
We use GitHub Actions to validate PRs.
115+
116+
### Deployment
117+
118+
To make changes to monitoring-satellites and monitoring-centrals spread across our clusters, simply merge a PR to the `main` branch and ArgoCD will automatically synchronize all deployed applications.
119+
120+
If you want to verify ArgoCD has applied your changes, you can go to [argo-cd.gitpod-io-dev.com](https://argo-cd.gitpod-io-dev.com/) and use the label filter `application=monitoring-satelite` to see the status of all the satellites, or `application=monitoring-central` to see the monitoring centrals.
121+
122+
The ArgoCD applications are configured in [gitpod-com/gitpod](https://github.com/gitpod-com/gitpod) which is also responsible for setting up all the appropriate external-variables.

addons/alerting.libsonnet

+123
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
local criticalReceiver =
2+
if std.extVar('pagerduty_routing_key') != '' then
3+
|||
4+
pagerduty_configs:
5+
- send_resolved: true
6+
routing_key: '%(pagerdutyRoutingKey)s'
7+
||| % {
8+
pagerdutyRoutingKey: std.extVar('pagerduty_routing_key'),
9+
}
10+
else
11+
|||
12+
slack_configs:
13+
- send_resolved: true
14+
api_url: %(slackWebhookUrlCritical)s
15+
channel: '%(slackChannelPrefix)s_critical'
16+
title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}{{ end }}] %(clusterName)s Monitoring'
17+
text: |
18+
{{ range .Alerts }}
19+
**Please take immediate action!**
20+
*Cluster:* {{ .Labels.cluster }}
21+
*Alert:* {{ .Labels.alertname }}
22+
*Description:* {{ .Annotations.description }}
23+
{{ end }}
24+
actions:
25+
- type: button
26+
text: 'Runbook :book:'
27+
url: '{{ .CommonAnnotations.runbook_url }}'
28+
||| % {
29+
clusterName: std.extVar('cluster_name'),
30+
slackWebhookUrlCritical: std.extVar('slack_webhook_url_critical'),
31+
slackChannelPrefix: std.extVar('slack_channel_prefix'),
32+
}
33+
;
34+
35+
{
36+
values+:: {
37+
alertmanager+: {
38+
config: |||
39+
global:
40+
resolve_timeout: 5m
41+
route:
42+
receiver: Black_Hole
43+
group_by: ['...']
44+
routes:
45+
- receiver: CriticalReceiver
46+
match:
47+
severity: critical
48+
- receiver: SlackWarning
49+
match:
50+
severity: warning
51+
- receiver: SlackInfo
52+
match:
53+
severity: info
54+
- receiver: Watchdog
55+
match:
56+
alertname: Watchdog
57+
group_wait: 30s
58+
group_interval: 5m
59+
repeat_interval: 6h
60+
inhibit_rules:
61+
- source_match:
62+
severity: critical
63+
target_match_re:
64+
severity: warning|info
65+
equal:
66+
- alertname
67+
- source_match:
68+
severity: warning
69+
target_match_re:
70+
severity: info
71+
equal:
72+
- alertname
73+
receivers:
74+
- name: Black_Hole
75+
- name: Watchdog
76+
- name: CriticalReceiver
77+
%(criticalReceiver)s
78+
- name: SlackWarning
79+
slack_configs:
80+
- send_resolved: true
81+
api_url: %(slackWebhookUrlWarning)s
82+
channel: '%(slackChannelPrefix)s_warning'
83+
title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}{{ end }}] %(clusterName)s Monitoring'
84+
text: |
85+
{{ range .Alerts }}
86+
**Please take a look when possible**
87+
*Cluster:* {{ .Labels.cluster }}
88+
*Alert:* {{ .Labels.alertname }}
89+
*Description:* {{ .Annotations.description }}
90+
{{ end }}
91+
actions:
92+
- type: button
93+
text: 'Runbook :book:'
94+
url: '{{ .CommonAnnotations.runbook_url }}'
95+
- name: SlackInfo
96+
slack_configs:
97+
- send_resolved: true
98+
api_url: %(slackWebhookUrlInfo)s
99+
channel: '%(slackChannelPrefix)s_info'
100+
title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}{{ end }}] %(clusterName)s Monitoring'
101+
text: |
102+
{{ range .Alerts }}
103+
        **No need for human intervention :slightly_smiling_face:**
104+
*Cluster:* {{ .Labels.cluster }}
105+
*Alert:* {{ .Labels.alertname }}
106+
*Description:* {{ .Annotations.description }}
107+
{{ end }}
108+
actions:
109+
- type: button
110+
text: 'Runbook :book:'
111+
url: '{{ .CommonAnnotations.runbook_url }}'
112+
templates: []
113+
||| % {
114+
clusterName: std.extVar('cluster_name'),
115+
slackWebhookUrlWarning: std.extVar('slack_webhook_url_warning'),
116+
slackWebhookUrlInfo: std.extVar('slack_webhook_url_info'),
117+
slackChannelPrefix: std.extVar('slack_channel_prefix'),
118+
pagerdutyRoutingKey: std.extVar('pagerduty_routing_key'),
119+
criticalReceiver: criticalReceiver,
120+
},
121+
},
122+
},
123+
}

addons/cluster-monitoring.libsonnet

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
// Addon for installations that are responsible for monitoring a whole cluster.
// It evaluates to the contents of the node-affinity addon.
import './node-affinity.libsonnet'
+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Overrides applied when the stack runs on CI.
{
  values+:: {
    // GitHub Actions runners have little compute to spare, so the resource
    // settings of both components are emptied to let their pods come up.
    prometheus+: { resources: {} },
    alertmanager+: { resources: {} },
  },
}

addons/disable-grafana-auth.libsonnet

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// Appends environment variables to Grafana that enable anonymous access with
// the Admin org role and disable the login form.
{
  values+:: {
    grafana+: {
      env+: [
        { name: 'GF_AUTH_ANONYMOUS_ENABLED', value: 'true' },
        { name: 'GF_AUTH_ANONYMOUS_ORG_ROLE', value: 'Admin' },
        { name: 'GF_AUTH_DISABLE_LOGIN_FORM', value: 'true' },
      ],
    },
  },
}

0 commit comments

Comments
 (0)