Skip to content

Commit e28417b

Browse files
authored
Decommission prow-monitoring (openshift#34851)
1 parent d2b5526 commit e28417b

File tree

135 files changed

+29
-17850
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

135 files changed

+29
-17850
lines changed

Makefile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,6 @@ verify-app-ci:
267267

268268
mixins:
269269
$(SKIP_PULL) || $(CONTAINER_ENGINE) pull registry.ci.openshift.org/ci/dashboards-validation:latest
270-
$(CONTAINER_ENGINE) run $(USER) --platform linux/amd64 --user=$(UID) --rm -v "$(CURDIR):/release:z" registry.ci.openshift.org/ci/dashboards-validation:latest make -C /release/clusters/app.ci/prow-monitoring/mixins install all
271270
$(CONTAINER_ENGINE) run $(USER) --platform linux/amd64 --user=$(UID) --rm -v "$(CURDIR):/release:z" registry.ci.openshift.org/ci/dashboards-validation:latest make -C /release/clusters/app.ci/openshift-user-workload-monitoring/mixins install all
272271
.PHONY: mixins
273272

ci-operator/jobs/infra-periodics.yaml

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1436,16 +1436,6 @@ periodics:
14361436
command:
14371437
- applyconfig
14381438
env:
1439-
- name: OPENSHIFT_MONITORING_CI_TOKEN_ON_HIVE
1440-
valueFrom:
1441-
secretKeyRef:
1442-
key: hive-openshift-monitoring-ci-token
1443-
name: hive-openshift-monitoring-credentials
1444-
- name: OPENSHIFT_PROMETHEUS_PASSWORD
1445-
valueFrom:
1446-
secretKeyRef:
1447-
key: prometheus-k8s-basic-auth-password
1448-
name: app-ci-openshift-monitoring-credentials
14491439
- name: SLACK_API_URL
14501440
valueFrom:
14511441
secretKeyRef:
@@ -1456,11 +1446,6 @@ periodics:
14561446
secretKeyRef:
14571447
key: integration_key
14581448
name: pagerduty
1459-
- name: PROMETHEUS_USER_WORKLOAD_TOKEN
1460-
valueFrom:
1461-
secretKeyRef:
1462-
key: sa.prometheus-user-workload.app.ci.token.txt
1463-
name: app-ci-openshift-user-workload-monitoring-credentials
14641449
image: applyconfig:latest
14651450
imagePullPolicy: Always
14661451
name: ""

ci-operator/jobs/openshift/release/openshift-release-master-postsubmits.yaml

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,26 +18,11 @@ postsubmits:
1818
command:
1919
- applyconfig
2020
env:
21-
- name: OPENSHIFT_MONITORING_CI_TOKEN_ON_HIVE
22-
valueFrom:
23-
secretKeyRef:
24-
key: hive-openshift-monitoring-ci-token
25-
name: hive-openshift-monitoring-credentials
26-
- name: OPENSHIFT_PROMETHEUS_PASSWORD
27-
valueFrom:
28-
secretKeyRef:
29-
key: prometheus-k8s-basic-auth-password
30-
name: app-ci-openshift-monitoring-credentials
3121
- name: PAGERDUTY_INTEGRATION_KEY
3222
valueFrom:
3323
secretKeyRef:
3424
key: integration_key
3525
name: pagerduty
36-
- name: PROMETHEUS_USER_WORKLOAD_TOKEN
37-
valueFrom:
38-
secretKeyRef:
39-
key: sa.prometheus-user-workload.app.ci.token.txt
40-
name: app-ci-openshift-user-workload-monitoring-credentials
4126
- name: SLACK_API_URL
4227
valueFrom:
4328
secretKeyRef:

ci-operator/jobs/openshift/release/openshift-release-master-presubmits.yaml

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -99,26 +99,11 @@ presubmits:
9999
env:
100100
- name: HOME
101101
value: /tmp
102-
- name: OPENSHIFT_MONITORING_CI_TOKEN_ON_HIVE
103-
valueFrom:
104-
secretKeyRef:
105-
key: hive-openshift-monitoring-ci-token
106-
name: hive-openshift-monitoring-credentials
107-
- name: OPENSHIFT_PROMETHEUS_PASSWORD
108-
valueFrom:
109-
secretKeyRef:
110-
key: prometheus-k8s-basic-auth-password
111-
name: app-ci-openshift-monitoring-credentials
112102
- name: PAGERDUTY_INTEGRATION_KEY
113103
valueFrom:
114104
secretKeyRef:
115105
key: integration_key
116106
name: pagerduty
117-
- name: PROMETHEUS_USER_WORKLOAD_TOKEN
118-
valueFrom:
119-
secretKeyRef:
120-
key: sa.prometheus-user-workload.app.ci.token.txt
121-
name: app-ci-openshift-user-workload-monitoring-credentials
122107
- name: SLACK_API_URL
123108
valueFrom:
124109
secretKeyRef:
@@ -847,7 +832,7 @@ presubmits:
847832
pj-rehearse.openshift.io/can-be-rehearsed: "true"
848833
name: pull-ci-openshift-release-master-generated-dashboards
849834
rerun_command: /test generated-dashboards
850-
run_if_changed: ^(clusters/app.ci/prow-monitoring/.*)|^(clusters/app.ci/openshift-user-workload-monitoring/.*)
835+
run_if_changed: ^(clusters/app.ci/openshift-user-workload-monitoring/.*)
851836
spec:
852837
containers:
853838
- args:

ci-operator/platform-balance/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Rebalancing tests among platforms
22

3-
If test volume for a given platform exceeds [the Boskos lease capacity][boskos-leases], [`jobs-failing-with-lease-acquire-timeout`](../../clusters/app.ci/prow-monitoring/mixins/prometheus_out/prometheus-prow-rules_prometheusrule.yaml) will fire.
3+
If test volume for a given platform exceeds [the Boskos lease capacity][boskos-leases], [`jobs-failing-with-lease-acquire-timeout`](../../clusters/app.ci/openshift-user-workload-monitoring/mixins/prometheus_out/ci-alerts_prometheusrule.yaml) will fire.
44
Presubmit jobs may be rebalanced to move platform-agnostic jobs to platforms with available capacity.
55
Component teams may mark their presubmit jobs as platform-agnostic by configuring `as` names which exclude the platform slug (e.g. `aws`), whose absence is used as a marker of "this test is platform-agnostic".
66
For example, see [release#10152][release-10152].

clusters/app.ci/assets/dptp-controller-manager.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ metadata:
6262
name: dptp-controller-manager
6363
namespace: ci
6464
spec:
65-
# By default up{job="prow-monitoring/dptp-controller-manager"}.
65+
# By default up{job="*/dptp-controller-manager"}.
6666
# We want up{job="dptp-controller-manager"} instead.
6767
jobLabel: app
6868
selector:

clusters/app.ci/prow-monitoring/build_cop.md renamed to clusters/app.ci/openshift-user-workload-monitoring/build_cop.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
# Build Cop Dashboard in prow-monitoring
1+
# Build Cop Dashboard in ci-monitoring
22

3-
The [build-cop dashboard](https://grafana-prow-monitoring.apps.ci.l2s4.p1.openshiftapps.com/d/6829209d59479d48073d09725ce807fa/build-cop-dashboard?orgId=1) in [prow-monitoring](README.md) is an alternative tool which shows the success rate for various types of Prow jobs in Build Cop reports. The data presented by these dashboards are derived from Prow's state and persist for a month. Every authenticated user of our CI cluster has access to the dashboard.
3+
The [build-cop dashboard](https://grafana-route-ci-grafana.apps.ci.l2s4.p1.openshiftapps.com/d/6829209d59479d48073d09725ce807fa/build-cop-dashboard?orgId=1) in [ci-monitoring](README.md) is an alternative tool which shows the success rate for various types of Prow jobs in Build Cop reports. The data presented by these dashboards are derived from Prow's state and persist for a month. Every authenticated user of our CI cluster has access to the dashboard.
44

55
The Build Cop must keep track of passing rates for a number of job types. Normally, this would be done by viewing a filtered list of jobs in Deck. E.g., [the deck page](https://prow.ci.openshift.org/?job=*-master-e2e-aws) shows `Success rate over time: 3h: 78%, 12h: 81%, 48h: 77%` for job with name `*-master-e2e-aws`. With the dashboard, an overview of all job types can be seen with one panel.
66

clusters/app.ci/openshift-user-workload-monitoring/mixins/_prometheus/dptp_alerts.libsonnet

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
severity: 'critical',
3838
},
3939
annotations: {
40-
message: 'Infrastructure CI job {{ $labels.job_name }} is failing. Investigate the symptoms, assess the urgency and take appropriate action (<https://grafana-prow-monitoring.apps.ci.l2s4.p1.openshiftapps.com/d/%s/dptp-dashboard?orgId=1&fullscreen&viewPanel=4|Grafana Dashboard> | <https://prow.ci.openshift.org/?job={{ $labels.job_name }}|Deck> | <https://github.com/openshift/release/blob/master/docs/dptp-triage-sop/infrastructure-jobs.md#{{ $labels.job_name}}|SOP>).' % $._config.grafanaDashboardIDs['dptp.json'],
40+
message: 'Infrastructure CI job {{ $labels.job_name }} is failing. Investigate the symptoms, assess the urgency and take appropriate action (<https://grafana-route-ci-grafana.apps.ci.l2s4.p1.openshiftapps.com/d/%s/dptp-dashboard?orgId=1&fullscreen&viewPanel=4|Grafana Dashboard> | <https://prow.ci.openshift.org/?job={{ $labels.job_name }}|Deck> | <https://github.com/openshift/release/blob/master/docs/dptp-triage-sop/infrastructure-jobs.md#{{ $labels.job_name}}|SOP>).' % $._config.grafanaDashboardIDs['dptp.json'],
4141
},
4242
},
4343
{

clusters/app.ci/openshift-user-workload-monitoring/mixins/_prometheus/ghproxy_alerts.libsonnet

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
severity: 'critical',
1414
},
1515
annotations: {
16-
message: 'The average size of the pending GH API request queue in ghproxy is {{ $value | humanize }} over the last 5 minutes, which can indicate insufficient proxy throughput. Inspect <https://prometheus-prow-monitoring.apps.ci.l2s4.p1.openshiftapps.com/graph?g0.range_input=1h&g0.end_input=2022-03-23%2016%3A22&g0.expr=sum_over_time(pending_outbound_requests%7Bcontainer%3D%22ghproxy%22%7D%5B5m%5D)%20%2F%20count_over_time(pending_outbound_requests%7Bcontainer%3D%22ghproxy%22%7D%5B5m%5D)%20%3E%20100&g0.tab=0|Prometheus> and if the metric is ramping up, consider whether changing ghproxy throttling parameters may be necessary',
16+
message: 'The average size of the pending GH API request queue in ghproxy is {{ $value | humanize }} over the last 5 minutes, which can indicate insufficient proxy throughput. Inspect <https://console-openshift-console.apps.ci.l2s4.p1.openshiftapps.com/monitoring/alertrules?alerting-rule-name=ghproxy-too-many-pending-alerts|Prometheus> and if the metric is ramping up, consider whether changing ghproxy throttling parameters may be necessary',
1717
},
1818
},
1919
{
@@ -25,7 +25,7 @@
2525
severity: 'warning',
2626
},
2727
annotations: {
28-
message: '{{ $value | humanize }}%% of all requests for {{ $labels.path }} through the GitHub proxy are errorring with code {{ $labels.status }}. Check <https://grafana-prow-monitoring.apps.ci.l2s4.p1.openshiftapps.com/d/%s/github-cache?orgId=1&refresh=1m&fullscreen&viewPanel=9|grafana>' % $._config.grafanaDashboardIDs['ghproxy.json'],
28+
message: '{{ $value | humanize }}%% of all requests for {{ $labels.path }} through the GitHub proxy are erroring with code {{ $labels.status }}. Check <https://grafana-route-ci-grafana.apps.ci.l2s4.p1.openshiftapps.com/d/%s/github-cache?orgId=1&refresh=1m&fullscreen&viewPanel=9|grafana>' % $._config.grafanaDashboardIDs['ghproxy.json'],
2929
},
3030
},
3131
{
@@ -37,7 +37,7 @@
3737
severity: 'warning',
3838
},
3939
annotations: {
40-
message: '{{ $value | humanize }}%% of all API requests through the GitHub proxy are errorring with code {{ $labels.status }}. Check <https://grafana-prow-monitoring.apps.ci.l2s4.p1.openshiftapps.com/d/%s/github-cache?orgId=1&fullscreen&viewPanel=8|grafana>' % $._config.grafanaDashboardIDs['ghproxy.json'],
40+
message: '{{ $value | humanize }}%% of all API requests through the GitHub proxy are erroring with code {{ $labels.status }}. Check <https://grafana-route-ci-grafana.apps.ci.l2s4.p1.openshiftapps.com/d/%s/github-cache?orgId=1&fullscreen&viewPanel=8|grafana>' % $._config.grafanaDashboardIDs['ghproxy.json'],
4141
},
4242
},
4343
{
@@ -50,7 +50,7 @@
5050
severity: 'critical',
5151
},
5252
annotations: {
53-
message: '{{ $labels.login }} may run out of API quota before the next reset. Check the <https://grafana-prow-monitoring.apps.ci.l2s4.p1.openshiftapps.com/d/d72fe8d0400b2912e319b1e95d0ab1b3/github-cache?orgId=1|dashboard>',
53+
message: '{{ $labels.login }} may run out of API quota before the next reset. Check the <https://grafana-route-ci-grafana.apps.ci.l2s4.p1.openshiftapps.com/d/d72fe8d0400b2912e319b1e95d0ab1b3/github-cache?orgId=1|dashboard>',
5454
},
5555
},
5656
{
@@ -63,7 +63,7 @@
6363
severity: 'critical',
6464
},
6565
annotations: {
66-
message: '{{ $labels.token_hash }} may run out of API quota before the next reset. Check the <https://grafana-prow-monitoring.apps.ci.l2s4.p1.openshiftapps.com/d/d72fe8d0400b2912e319b1e95d0ab1b3/github-cache?orgId=1|dashboard>',
66+
message: '{{ $labels.token_hash }} may run out of API quota before the next reset. Check the <https://grafana-route-ci-grafana.apps.ci.l2s4.p1.openshiftapps.com/d/d72fe8d0400b2912e319b1e95d0ab1b3/github-cache?orgId=1|dashboard>',
6767
},
6868
},
6969
{
@@ -77,7 +77,7 @@
7777
},
7878
annotations: {
7979
message: |||
80-
{{ $labels.token_hash }} uses 90% of the available inode (<https://grafana-prow-monitoring.apps.ci.l2s4.p1.openshiftapps.com/d/d72fe8d0400b2912e319b1e95d0ab1b3/github-cache?viewPanel=5&orgId=1|dashboard>)
80+
{{ $labels.token_hash }} uses 90% of the available inode (<https://grafana-route-ci-grafana.apps.ci.l2s4.p1.openshiftapps.com/d/d72fe8d0400b2912e319b1e95d0ab1b3/github-cache?viewPanel=5&orgId=1|dashboard>)
8181
8282
Resolve by pruning the cache inside the ghproxy pod:
8383

clusters/app.ci/openshift-user-workload-monitoring/mixins/_prometheus/prow_alerts.libsonnet

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
severity: 'critical',
2424
},
2525
annotations: {
26-
message: 'The backlog for {{ $labels.name }} is not getting drained. Check <https://prometheus-prow-monitoring.apps.ci.l2s4.p1.openshiftapps.com/graph?g0.range_input=1h&g0.expr=workqueue_depth%7Bname%3D~%22{{ $labels.name }}%22%7D%20%3E%20100&g0.tab=0|Prometheus>'
26+
message: 'The backlog for {{ $labels.name }} is not getting drained. Check <https://console-openshift-console.apps.ci.l2s4.p1.openshiftapps.com/monitoring/alertrules?alerting-rule-name=prow-job-backlog-growing|Prometheus>'
2727
},
2828
},
2929
{

0 commit comments

Comments
 (0)