Skip to content

Commit 4b5dfc4

Browse files
committed
jsonnet: Sync with kube-prometheus
1 parent 2f35619 commit 4b5dfc4

File tree

5 files changed

+415
-53
lines changed

5 files changed

+415
-53
lines changed

assets/control-plane/prometheus-rule.yaml

+357-1
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ spec:
2020
expr: |
2121
increase(kube_pod_container_status_restarts_total{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"}[10m]) > 0
2222
and
23-
sum without (phase) (kube_pod_status_phase{phase!="Running",namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} == 1)
23+
kube_pod_container_status_waiting{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} == 1
2424
for: 15m
2525
labels:
2626
severity: warning
@@ -525,6 +525,338 @@ spec:
525525
for: 15m
526526
labels:
527527
severity: critical
528+
- name: kube-apiserver-burnrate.rules
529+
rules:
530+
# Read (LIST/GET) burn rate over a 1d window, summed per cluster:
# (requests outside the per-scope latency buckets + 5xx responses) / all read requests.
# Latency buckets: le="1" for scope "resource" or empty, le="5" for "namespace", le="40" for "cluster".
# Only the resource-scope term falls back to vector(0) when it yields no series.
- expr: |
531+
(
532+
(
533+
# too slow
534+
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
535+
-
536+
(
537+
(
538+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1d]))
539+
or
540+
vector(0)
541+
)
542+
+
543+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1d]))
544+
+
545+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1d]))
546+
)
547+
)
548+
+
549+
# errors
550+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
551+
)
552+
/
553+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
554+
labels:
555+
verb: read
556+
record: apiserver_request:burnrate1d
557+
# Read (LIST/GET) burn rate over a 1h window, summed per cluster:
# (requests outside the per-scope latency buckets + 5xx responses) / all read requests.
# Latency buckets: le="1" for scope "resource" or empty, le="5" for "namespace", le="40" for "cluster".
# Only the resource-scope term falls back to vector(0) when it yields no series.
- expr: |
558+
(
559+
(
560+
# too slow
561+
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
562+
-
563+
(
564+
(
565+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1h]))
566+
or
567+
vector(0)
568+
)
569+
+
570+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1h]))
571+
+
572+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1h]))
573+
)
574+
)
575+
+
576+
# errors
577+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
578+
)
579+
/
580+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
581+
labels:
582+
verb: read
583+
record: apiserver_request:burnrate1h
584+
# Read (LIST/GET) burn rate over a 2h window, summed per cluster:
# (requests outside the per-scope latency buckets + 5xx responses) / all read requests.
# Latency buckets: le="1" for scope "resource" or empty, le="5" for "namespace", le="40" for "cluster".
# Only the resource-scope term falls back to vector(0) when it yields no series.
- expr: |
585+
(
586+
(
587+
# too slow
588+
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
589+
-
590+
(
591+
(
592+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[2h]))
593+
or
594+
vector(0)
595+
)
596+
+
597+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[2h]))
598+
+
599+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[2h]))
600+
)
601+
)
602+
+
603+
# errors
604+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
605+
)
606+
/
607+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
608+
labels:
609+
verb: read
610+
record: apiserver_request:burnrate2h
611+
# Read (LIST/GET) burn rate over a 30m window, summed per cluster:
# (requests outside the per-scope latency buckets + 5xx responses) / all read requests.
# Latency buckets: le="1" for scope "resource" or empty, le="5" for "namespace", le="40" for "cluster".
# Only the resource-scope term falls back to vector(0) when it yields no series.
- expr: |
612+
(
613+
(
614+
# too slow
615+
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
616+
-
617+
(
618+
(
619+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[30m]))
620+
or
621+
vector(0)
622+
)
623+
+
624+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[30m]))
625+
+
626+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[30m]))
627+
)
628+
)
629+
+
630+
# errors
631+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
632+
)
633+
/
634+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
635+
labels:
636+
verb: read
637+
record: apiserver_request:burnrate30m
638+
# Read (LIST/GET) burn rate over a 3d window, summed per cluster:
# (requests outside the per-scope latency buckets + 5xx responses) / all read requests.
# Latency buckets: le="1" for scope "resource" or empty, le="5" for "namespace", le="40" for "cluster".
# Only the resource-scope term falls back to vector(0) when it yields no series.
- expr: |
639+
(
640+
(
641+
# too slow
642+
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
643+
-
644+
(
645+
(
646+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[3d]))
647+
or
648+
vector(0)
649+
)
650+
+
651+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[3d]))
652+
+
653+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[3d]))
654+
)
655+
)
656+
+
657+
# errors
658+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
659+
)
660+
/
661+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
662+
labels:
663+
verb: read
664+
record: apiserver_request:burnrate3d
665+
# Read (LIST/GET) burn rate over a 5m window, summed per cluster:
# (requests outside the per-scope latency buckets + 5xx responses) / all read requests.
# Latency buckets: le="1" for scope "resource" or empty, le="5" for "namespace", le="40" for "cluster".
# Only the resource-scope term falls back to vector(0) when it yields no series.
- expr: |
666+
(
667+
(
668+
# too slow
669+
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
670+
-
671+
(
672+
(
673+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[5m]))
674+
or
675+
vector(0)
676+
)
677+
+
678+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[5m]))
679+
+
680+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[5m]))
681+
)
682+
)
683+
+
684+
# errors
685+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
686+
)
687+
/
688+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
689+
labels:
690+
verb: read
691+
record: apiserver_request:burnrate5m
692+
# Read (LIST/GET) burn rate over a 6h window, summed per cluster:
# (requests outside the per-scope latency buckets + 5xx responses) / all read requests.
# Latency buckets: le="1" for scope "resource" or empty, le="5" for "namespace", le="40" for "cluster".
# Only the resource-scope term falls back to vector(0) when it yields no series.
- expr: |
693+
(
694+
(
695+
# too slow
696+
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
697+
-
698+
(
699+
(
700+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[6h]))
701+
or
702+
vector(0)
703+
)
704+
+
705+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[6h]))
706+
+
707+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[6h]))
708+
)
709+
)
710+
+
711+
# errors
712+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
713+
)
714+
/
715+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
716+
labels:
717+
verb: read
718+
record: apiserver_request:burnrate6h
719+
# Write (POST/PUT/PATCH/DELETE) burn rate over a 1d window, summed per cluster:
# (requests outside the le="1" latency bucket + 5xx responses) / all write requests.
- expr: |
720+
(
721+
(
722+
# too slow
723+
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
724+
-
725+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
726+
)
727+
+
728+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
729+
)
730+
/
731+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
732+
labels:
733+
verb: write
734+
record: apiserver_request:burnrate1d
735+
# Write (POST/PUT/PATCH/DELETE) burn rate over a 1h window, summed per cluster:
# (requests outside the le="1" latency bucket + 5xx responses) / all write requests.
- expr: |
736+
(
737+
(
738+
# too slow
739+
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
740+
-
741+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
742+
)
743+
+
744+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
745+
)
746+
/
747+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
748+
labels:
749+
verb: write
750+
record: apiserver_request:burnrate1h
751+
# Write (POST/PUT/PATCH/DELETE) burn rate over a 2h window, summed per cluster:
# (requests outside the le="1" latency bucket + 5xx responses) / all write requests.
- expr: |
752+
(
753+
(
754+
# too slow
755+
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
756+
-
757+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
758+
)
759+
+
760+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
761+
)
762+
/
763+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
764+
labels:
765+
verb: write
766+
record: apiserver_request:burnrate2h
767+
# Write (POST/PUT/PATCH/DELETE) burn rate over a 30m window, summed per cluster:
# (requests outside the le="1" latency bucket + 5xx responses) / all write requests.
- expr: |
768+
(
769+
(
770+
# too slow
771+
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
772+
-
773+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
774+
)
775+
+
776+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
777+
)
778+
/
779+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
780+
labels:
781+
verb: write
782+
record: apiserver_request:burnrate30m
783+
# Write (POST/PUT/PATCH/DELETE) burn rate over a 3d window, summed per cluster:
# (requests outside the le="1" latency bucket + 5xx responses) / all write requests.
- expr: |
784+
(
785+
(
786+
# too slow
787+
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
788+
-
789+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
790+
)
791+
+
792+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
793+
)
794+
/
795+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
796+
labels:
797+
verb: write
798+
record: apiserver_request:burnrate3d
799+
# Write (POST/PUT/PATCH/DELETE) burn rate over a 5m window, summed per cluster:
# (requests outside the le="1" latency bucket + 5xx responses) / all write requests.
- expr: |
800+
(
801+
(
802+
# too slow
803+
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
804+
-
805+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
806+
)
807+
+
808+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
809+
)
810+
/
811+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
812+
labels:
813+
verb: write
814+
record: apiserver_request:burnrate5m
815+
# Write (POST/PUT/PATCH/DELETE) burn rate over a 6h window, summed per cluster:
# (requests outside the le="1" latency bucket + 5xx responses) / all write requests.
- expr: |
816+
(
817+
(
818+
# too slow
819+
sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
820+
-
821+
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
822+
)
823+
+
824+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
825+
)
826+
/
827+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
828+
labels:
829+
verb: write
830+
record: apiserver_request:burnrate6h
831+
- name: kube-apiserver-histogram.rules
832+
rules:
833+
# 0.99 quantile of request duration for read verbs (LIST/GET) over 5m,
# grouped by cluster and resource; the trailing `> 0` keeps only positive results.
- expr: |
834+
histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
835+
labels:
836+
quantile: "0.99"
837+
verb: read
838+
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
839+
# 0.99 quantile of request duration for write verbs (POST/PUT/PATCH/DELETE) over 5m,
# grouped by cluster and resource; the trailing `> 0` keeps only positive results.
- expr: |
840+
histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
841+
labels:
842+
quantile: "0.99"
843+
verb: write
844+
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
845+
# 0.99 quantile of request duration over 5m, summed without the instance and pod labels.
# Excludes subresource "log" and the verbs LIST, WATCH, WATCHLIST, DELETECOLLECTION, PROXY and CONNECT.
- expr: |
846+
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
847+
labels:
848+
quantile: "0.99"
849+
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
850+
# 0.9 quantile of request duration over 5m, summed without the instance and pod labels.
# Excludes subresource "log" and the verbs LIST, WATCH, WATCHLIST, DELETECOLLECTION, PROXY and CONNECT.
- expr: |
851+
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
852+
labels:
853+
quantile: "0.9"
854+
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
855+
# 0.5 quantile of request duration over 5m, summed without the instance and pod labels.
# Excludes subresource "log" and the verbs LIST, WATCH, WATCHLIST, DELETECOLLECTION, PROXY and CONNECT.
- expr: |
856+
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
857+
labels:
858+
quantile: "0.5"
859+
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
528860
- name: k8s.rules
529861
rules:
530862
- expr: |
@@ -558,6 +890,12 @@ spec:
558890
max by(namespace, pod, node) (kube_pod_info{node!=""})
559891
)
560892
record: node_namespace_pod_container:container_memory_swap
893+
# Container memory requests, kept only for pods whose phase is Pending or Running.
# The right-hand side is aggregated by (namespace, pod, cluster) so it carries every
# label named in on(namespace, pod, cluster). Aggregating by (namespace, pod) alone
# drops `cluster`, and the join then matches nothing for series that carry that label.
- expr: |
894+
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
895+
group_left() max by (namespace, pod, cluster) (
896+
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
897+
)
898+
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
561899
- expr: |
562900
sum by (namespace, cluster) (
563901
sum by (namespace, pod, cluster) (
@@ -569,6 +907,12 @@ spec:
569907
)
570908
)
571909
record: namespace_memory:kube_pod_container_resource_requests:sum
910+
# Container CPU requests, kept only for pods whose phase is Pending or Running.
# The right-hand side is aggregated by (namespace, pod, cluster) so it carries every
# label named in on(namespace, pod, cluster). Aggregating by (namespace, pod) alone
# drops `cluster`, and the join then matches nothing for series that carry that label.
- expr: |
911+
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
912+
group_left() max by (namespace, pod, cluster) (
913+
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
914+
)
915+
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
572916
- expr: |
573917
sum by (namespace, cluster) (
574918
sum by (namespace, pod, cluster) (
@@ -580,6 +924,12 @@ spec:
580924
)
581925
)
582926
record: namespace_cpu:kube_pod_container_resource_requests:sum
927+
# Container memory limits, kept only for pods whose phase is Pending or Running.
# The right-hand side is aggregated by (namespace, pod, cluster) so it carries every
# label named in on(namespace, pod, cluster). Aggregating by (namespace, pod) alone
# drops `cluster`, and the join then matches nothing for series that carry that label.
- expr: |
928+
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
929+
group_left() max by (namespace, pod, cluster) (
930+
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
931+
)
932+
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
583933
- expr: |
584934
sum by (namespace, cluster) (
585935
sum by (namespace, pod, cluster) (
@@ -591,6 +941,12 @@ spec:
591941
)
592942
)
593943
record: namespace_memory:kube_pod_container_resource_limits:sum
944+
# Container CPU limits, kept only for pods whose phase is Pending or Running.
# The right-hand side is aggregated by (namespace, pod, cluster) so it carries every
# label named in on(namespace, pod, cluster). Aggregating by (namespace, pod) alone
# drops `cluster`, and the join then matches nothing for series that carry that label.
- expr: |
945+
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
946+
group_left() max by (namespace, pod, cluster) (
947+
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
948+
)
949+
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
594950
- expr: |
595951
sum by (namespace, cluster) (
596952
sum by (namespace, pod, cluster) (

0 commit comments

Comments
 (0)