20
20
expr : |
21
21
increase(kube_pod_container_status_restarts_total{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"}[10m]) > 0
22
22
and
23
- sum without (phase) (kube_pod_status_phase{phase!="Running", namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} == 1)
23
+ kube_pod_container_status_waiting{ namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} == 1
24
24
for : 15m
25
25
labels :
26
26
severity : warning
@@ -525,6 +525,338 @@ spec:
525
525
for : 15m
526
526
labels :
527
527
severity : critical
528
+ - name : kube-apiserver-burnrate.rules
529
+ rules :
530
+ - expr : |
531
+ (
532
+ (
533
+ # too slow
534
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
535
+ -
536
+ (
537
+ (
538
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1d]))
539
+ or
540
+ vector(0)
541
+ )
542
+ +
543
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1d]))
544
+ +
545
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1d]))
546
+ )
547
+ )
548
+ +
549
+ # errors
550
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
551
+ )
552
+ /
553
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
554
+ labels:
555
+ verb: read
556
+ record: apiserver_request:burnrate1d
557
+ - expr : |
558
+ (
559
+ (
560
+ # too slow
561
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
562
+ -
563
+ (
564
+ (
565
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1h]))
566
+ or
567
+ vector(0)
568
+ )
569
+ +
570
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1h]))
571
+ +
572
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1h]))
573
+ )
574
+ )
575
+ +
576
+ # errors
577
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
578
+ )
579
+ /
580
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
581
+ labels:
582
+ verb: read
583
+ record: apiserver_request:burnrate1h
584
+ - expr : |
585
+ (
586
+ (
587
+ # too slow
588
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
589
+ -
590
+ (
591
+ (
592
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[2h]))
593
+ or
594
+ vector(0)
595
+ )
596
+ +
597
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[2h]))
598
+ +
599
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[2h]))
600
+ )
601
+ )
602
+ +
603
+ # errors
604
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
605
+ )
606
+ /
607
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
608
+ labels:
609
+ verb: read
610
+ record: apiserver_request:burnrate2h
611
+ - expr : |
612
+ (
613
+ (
614
+ # too slow
615
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
616
+ -
617
+ (
618
+ (
619
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[30m]))
620
+ or
621
+ vector(0)
622
+ )
623
+ +
624
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[30m]))
625
+ +
626
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[30m]))
627
+ )
628
+ )
629
+ +
630
+ # errors
631
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
632
+ )
633
+ /
634
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
635
+ labels:
636
+ verb: read
637
+ record: apiserver_request:burnrate30m
638
+ - expr : |
639
+ (
640
+ (
641
+ # too slow
642
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
643
+ -
644
+ (
645
+ (
646
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[3d]))
647
+ or
648
+ vector(0)
649
+ )
650
+ +
651
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[3d]))
652
+ +
653
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[3d]))
654
+ )
655
+ )
656
+ +
657
+ # errors
658
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
659
+ )
660
+ /
661
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
662
+ labels:
663
+ verb: read
664
+ record: apiserver_request:burnrate3d
665
+ - expr : |
666
+ (
667
+ (
668
+ # too slow
669
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
670
+ -
671
+ (
672
+ (
673
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[5m]))
674
+ or
675
+ vector(0)
676
+ )
677
+ +
678
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[5m]))
679
+ +
680
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[5m]))
681
+ )
682
+ )
683
+ +
684
+ # errors
685
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
686
+ )
687
+ /
688
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
689
+ labels:
690
+ verb: read
691
+ record: apiserver_request:burnrate5m
692
+ - expr : |
693
+ (
694
+ (
695
+ # too slow
696
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
697
+ -
698
+ (
699
+ (
700
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[6h]))
701
+ or
702
+ vector(0)
703
+ )
704
+ +
705
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[6h]))
706
+ +
707
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[6h]))
708
+ )
709
+ )
710
+ +
711
+ # errors
712
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
713
+ )
714
+ /
715
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
716
+ labels:
717
+ verb: read
718
+ record: apiserver_request:burnrate6h
719
+ - expr : |
720
+ (
721
+ (
722
+ # too slow
723
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
724
+ -
725
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
726
+ )
727
+ +
728
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
729
+ )
730
+ /
731
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
732
+ labels:
733
+ verb: write
734
+ record: apiserver_request:burnrate1d
735
+ - expr : |
736
+ (
737
+ (
738
+ # too slow
739
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
740
+ -
741
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
742
+ )
743
+ +
744
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
745
+ )
746
+ /
747
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
748
+ labels:
749
+ verb: write
750
+ record: apiserver_request:burnrate1h
751
+ - expr : |
752
+ (
753
+ (
754
+ # too slow
755
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
756
+ -
757
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
758
+ )
759
+ +
760
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
761
+ )
762
+ /
763
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
764
+ labels:
765
+ verb: write
766
+ record: apiserver_request:burnrate2h
767
+ - expr : |
768
+ (
769
+ (
770
+ # too slow
771
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
772
+ -
773
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
774
+ )
775
+ +
776
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
777
+ )
778
+ /
779
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
780
+ labels:
781
+ verb: write
782
+ record: apiserver_request:burnrate30m
783
+ - expr : |
784
+ (
785
+ (
786
+ # too slow
787
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
788
+ -
789
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
790
+ )
791
+ +
792
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
793
+ )
794
+ /
795
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
796
+ labels:
797
+ verb: write
798
+ record: apiserver_request:burnrate3d
799
+ - expr : |
800
+ (
801
+ (
802
+ # too slow
803
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
804
+ -
805
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
806
+ )
807
+ +
808
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
809
+ )
810
+ /
811
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
812
+ labels:
813
+ verb: write
814
+ record: apiserver_request:burnrate5m
815
+ - expr : |
816
+ (
817
+ (
818
+ # too slow
819
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
820
+ -
821
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
822
+ )
823
+ +
824
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
825
+ )
826
+ /
827
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
828
+ labels:
829
+ verb: write
830
+ record: apiserver_request:burnrate6h
831
+ - name : kube-apiserver-histogram.rules
832
+ rules :
833
+ - expr : |
834
+ histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
835
+ labels:
836
+ quantile: "0.99"
837
+ verb: read
838
+ record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
839
+ - expr : |
840
+ histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
841
+ labels:
842
+ quantile: "0.99"
843
+ verb: write
844
+ record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
845
+ - expr : |
846
+ histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
847
+ labels:
848
+ quantile: "0.99"
849
+ record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
850
+ - expr : |
851
+ histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
852
+ labels:
853
+ quantile: "0.9"
854
+ record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
855
+ - expr : |
856
+ histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
857
+ labels:
858
+ quantile: "0.5"
859
+ record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
528
860
- name : k8s.rules
529
861
rules :
530
862
- expr : |
@@ -558,6 +890,12 @@ spec:
558
890
max by(namespace, pod, node) (kube_pod_info{node!=""})
559
891
)
560
892
record: node_namespace_pod_container:container_memory_swap
893
+ - expr : |
894
+ kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
895
+ group_left() max by (namespace, pod) (
896
+ (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
897
+ )
898
+ record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
561
899
- expr : |
562
900
sum by (namespace, cluster) (
563
901
sum by (namespace, pod, cluster) (
@@ -569,6 +907,12 @@ spec:
569
907
)
570
908
)
571
909
record: namespace_memory:kube_pod_container_resource_requests:sum
910
+ - expr : |
911
+ kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
912
+ group_left() max by (namespace, pod) (
913
+ (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
914
+ )
915
+ record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
572
916
- expr : |
573
917
sum by (namespace, cluster) (
574
918
sum by (namespace, pod, cluster) (
@@ -580,6 +924,12 @@ spec:
580
924
)
581
925
)
582
926
record: namespace_cpu:kube_pod_container_resource_requests:sum
927
+ - expr : |
928
+ kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
929
+ group_left() max by (namespace, pod) (
930
+ (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
931
+ )
932
+ record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
583
933
- expr : |
584
934
sum by (namespace, cluster) (
585
935
sum by (namespace, pod, cluster) (
@@ -591,6 +941,12 @@ spec:
591
941
)
592
942
)
593
943
record: namespace_memory:kube_pod_container_resource_limits:sum
944
+ - expr : |
945
+ kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
946
+ group_left() max by (namespace, pod) (
947
+ (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
948
+ )
949
+ record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
594
950
- expr : |
595
951
sum by (namespace, cluster) (
596
952
sum by (namespace, pod, cluster) (
0 commit comments