Skip to content

Commit e7ddb91

Browse files
committed
LOG-258 - Fluentd alerting rules
Resurrecting the code for the fluentd alerting rules from the original pr/139. Place files/* in /usr/share/logging/<component>/, respectively.
1 parent 1ae1300 commit e7ddb91

16 files changed

+143
-14
lines changed

Dockerfile

+2-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ RUN INSTALL_PKGS=" \
1414
chmod og+w /tmp/_working_dir
1515
COPY --from=builder /go/src/github.com/openshift/cluster-logging-operator/_output/bin/cluster-logging-operator /usr/bin/
1616
COPY scripts/* /usr/bin/scripts/
17-
COPY files/ /usr/bin/files/
17+
RUN mkdir -p /usr/share/logging/
18+
COPY files/ /usr/share/logging/
1819
ADD controller-manifests /manifests
1920
# this is required because the operator invokes a script as `bash scripts/cert_generation.sh`
2021
WORKDIR /usr/bin

Dockerfile.rhel7

+2-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ RUN INSTALL_PKGS=" \
1414
chmod og+w /tmp/_working_dir
1515
COPY --from=builder /go/src/github.com/openshift/cluster-logging-operator/_output/bin/cluster-logging-operator /usr/bin/
1616
COPY scripts/* /usr/bin/scripts/
17-
COPY files/ /usr/bin/files/
17+
RUN mkdir -p /usr/share/logging/
18+
COPY files/ /usr/share/logging/
1819
ADD controller-manifests /manifests
1920
# this is required because the operator invokes a script as `bash scripts/cert_generation.sh`
2021
WORKDIR /usr/bin
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Prometheus alerting rules for the fluentd log collector.
# The operator decodes this file into the spec of the "fluentd" PrometheusRule.
"groups":
- "name": "logging_fluentd.alerts"
  "rules":
  # Fires for each fluentd target that has failed its scrape for 10 minutes.
  # `up == 0` keeps the target's labels, so {{ $labels.instance }} in the
  # message is populated. The absent() branch covers the case where the job
  # has no targets at all; that result carries no instance label.
  - "alert": "FluentdNodeDown"
    "annotations":
      "message": "Prometheus could not scrape fluentd {{ $labels.instance }} for more than 10m."
      "summary": "Fluentd cannot be scraped"
    "expr": |
      up{job="fluentd"} == 0 or absent(up{job="fluentd"})
    "for": "10m"
    "labels":
      "service": "fluentd"
      "severity": "critical"
  # Buffer queue grew by more than 32 within one minute, sustained for 1m.
  - "alert": "FluentdQueueLengthBurst"
    "annotations":
      "message": "In the last minute, fluentd {{ $labels.instance }} buffer queue length increased more than 32. Current value is {{ $value }}."
      "summary": "Fluentd is overwhelmed"
    "expr": |
      delta(fluentd_output_status_buffer_queue_length[1m]) > 32
    "for": "1m"
    "labels":
      "service": "fluentd"
      "severity": "warning"
  # Buffer queue kept growing (1m delta above 1) at every evaluation for 12h.
  - "alert": "FluentdQueueLengthIncreasing"
    "annotations":
      "message": "In the last 12h, fluentd {{ $labels.instance }} buffer queue length constantly increased more than 1. Current value is {{ $value }}."
      "summary": "Fluentd file buffer usage issue"
    "expr": |
      delta(fluentd_output_status_buffer_queue_length[1m]) > 1
    "for": "12h"
    "labels":
      "service": "fluentd"
      "severity": "critical"
  # Per-instance output error rate above 10/s, sustained for 1m.
  - "alert": "FluentdErrorsHigh"
    "annotations":
      "message": "In the last minute, {{ $value }} errors reported by fluentd {{ $labels.instance }}."
      "summary": "Fluentd reports high number of errors"
    "expr": |
      sum by(instance, job) (rate(fluentd_output_status_num_errors[1m])) > 10
    "for": "1m"
    "labels":
      "service": "fluentd"
      "severity": "critical"
File renamed without changes.

hack/deploy.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ else
1111
fix_images() { cat ; }
1212
else
1313
fix_images() {
14-
sed -e "s,docker.io/openshift/origin-logging,$registry_host:5000/openshift/logging,"
14+
sed -e "s,docker.io/openshift/origin-logging,$registry_host:5000/openshift/logging," \
1515
-e "s,quay.io/openshift/origin-logging,$registry_host:5000/openshift/logging,"
1616
}
1717
fi

manifests/03-role.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ rules:
8282
- monitoring.coreos.com
8383
resources:
8484
- servicemonitors
85+
- prometheusrules
8586
verbs:
8687
- "*"
8788

pkg/k8shandler/collection.go

+4
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ func (cluster *ClusterLogging) CreateOrUpdateCollection() (err error) {
5959
return
6060
}
6161

62+
if err = createOrUpdateFluentdPrometheusRule(cluster); err != nil {
63+
return
64+
}
65+
6266
if err = createOrUpdateFluentdConfigMap(cluster); err != nil {
6367
return
6468
}

pkg/k8shandler/curation.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,9 @@ func (cluster *ClusterLogging) createOrUpdateCuratorConfigMap() error {
114114
"curator",
115115
cluster.Namespace,
116116
map[string]string{
117-
"actions.yaml": string(utils.GetFileContents("files/curator-actions.yaml")),
118-
"curator5.yaml": string(utils.GetFileContents("files/curator5-config.yaml")),
119-
"config.yaml": string(utils.GetFileContents("files/curator-config.yaml")),
117+
"actions.yaml": string(utils.GetFileContents("/usr/share/logging/curator/curator-actions.yaml")),
118+
"curator5.yaml": string(utils.GetFileContents("/usr/share/logging/curator/curator5-config.yaml")),
119+
"config.yaml": string(utils.GetFileContents("/usr/share/logging/curator/curator-config.yaml")),
120120
},
121121
)
122122

pkg/k8shandler/fluentd.go

+29-3
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ const (
2222
metricsPortName = "metrics"
2323
prometheusCAFile = "/etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt"
2424
metricsVolumeName = "fluentd-metrics"
25+
alertsFile = "/usr/share/logging/fluentd/fluentd_prometheus_alerts.yaml"
2526
)
2627

2728
func removeFluentd(cluster *ClusterLogging) (err error) {
@@ -35,6 +36,11 @@ func removeFluentd(cluster *ClusterLogging) (err error) {
3536
return
3637
}
3738

39+
if err = utils.RemovePrometheusRule(cluster.Namespace, "fluentd"); err != nil {
40+
return
41+
}
42+
43+
3844
if err = utils.RemoveConfigMap(cluster.Namespace, "fluentd"); err != nil {
3945
return
4046
}
@@ -118,15 +124,35 @@ func createOrUpdateFluentdServiceMonitor(cluster *ClusterLogging) error {
118124
return nil
119125
}
120126

127+
func createOrUpdateFluentdPrometheusRule(cluster *ClusterLogging) error {
128+
promRule := utils.NewPrometheusRule("fluentd", cluster.Namespace)
129+
130+
promRuleSpec, err := utils.NewPrometheusRuleSpecFrom(alertsFile)
131+
if err != nil {
132+
return fmt.Errorf("Failure creating the fluentd PrometheusRule: %v", err)
133+
}
134+
135+
promRule.Spec = *promRuleSpec
136+
137+
utils.AddOwnerRefToObject(promRule, utils.AsOwner(cluster))
138+
139+
err = sdk.Create(promRule)
140+
if err != nil && !errors.IsAlreadyExists(err) {
141+
return fmt.Errorf("Failure creating the fluentd PrometheusRule: %v", err)
142+
}
143+
144+
return nil
145+
}
146+
121147
func createOrUpdateFluentdConfigMap(cluster *ClusterLogging) error {
122148

123149
fluentdConfigMap := utils.NewConfigMap(
124150
"fluentd",
125151
cluster.Namespace,
126152
map[string]string{
127-
"fluent.conf": string(utils.GetFileContents("files/fluent.conf")),
128-
"throttle-config.yaml": string(utils.GetFileContents("files/fluentd-throttle-config.yaml")),
129-
"secure-forward.conf": string(utils.GetFileContents("files/secure-forward.conf")),
153+
"fluent.conf": string(utils.GetFileContents("/usr/share/logging/fluentd/fluent.conf")),
154+
"throttle-config.yaml": string(utils.GetFileContents("/usr/share/logging/fluentd/fluentd-throttle-config.yaml")),
155+
"secure-forward.conf": string(utils.GetFileContents("/usr/share/logging/fluentd/secure-forward.conf")),
130156
},
131157
)
132158

pkg/k8shandler/rsyslog.go

+5-5
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ func createOrUpdateRsyslogConfigMap(logging *ClusterLogging) error {
5151
"rsyslog-bin",
5252
logging.Namespace,
5353
map[string]string{
54-
"rsyslog.sh": string(utils.GetFileContents("files/rsyslog/rsyslog.sh")),
54+
"rsyslog.sh": string(utils.GetFileContents("/usr/share/logging/rsyslog/rsyslog.sh")),
5555
},
5656
)
5757
rsyslogConfigMaps["rsyslog-bin"] = rsyslogBinConfigMap
@@ -60,15 +60,15 @@ func createOrUpdateRsyslogConfigMap(logging *ClusterLogging) error {
6060
"rsyslog-main",
6161
logging.Namespace,
6262
map[string]string{
63-
"rsyslog.conf": string(utils.GetFileContents("files/rsyslog/rsyslog.conf")),
63+
"rsyslog.conf": string(utils.GetFileContents("/usr/share/logging/rsyslog/rsyslog.conf")),
6464
},
6565
)
6666
rsyslogConfigMaps["rsyslog-main"] = rsyslogMainConfigMap
6767

6868
rsyslogConfigMapFiles := make(map[string]string)
69-
readerDir, err := ioutil.ReadDir("files/rsyslog")
69+
readerDir, err := ioutil.ReadDir("/usr/share/logging/rsyslog")
7070
if err != nil {
71-
return fmt.Errorf("Failure %v to read files from directory 'files/rsyslog' for Rsyslog configmap", err)
71+
return fmt.Errorf("Failure %v to read files from directory '/usr/share/logging/rsyslog' for Rsyslog configmap", err)
7272
}
7373
for _, fileInfo := range readerDir {
7474
// exclude files provided by other configmaps
@@ -79,7 +79,7 @@ func createOrUpdateRsyslogConfigMap(logging *ClusterLogging) error {
7979
continue
8080
}
8181
// include all other files
82-
fullname := "files/rsyslog/" + fileInfo.Name()
82+
fullname := "/usr/share/logging/rsyslog/" + fileInfo.Name()
8383
rsyslogConfigMapFiles[fileInfo.Name()] = string(utils.GetFileContents(fullname))
8484
}
8585
rsyslogConfigMap := utils.NewConfigMap(

pkg/utils/prometheus_rule.go

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package utils
2+
3+
import (
4+
"bytes"
5+
"fmt"
6+
"io/ioutil"
7+
8+
monitoringv1 "github.com/coreos/prometheus-operator/pkg/apis/monitoring/v1"
9+
sdk "github.com/operator-framework/operator-sdk/pkg/sdk"
10+
"k8s.io/apimachinery/pkg/api/errors"
11+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
12+
k8sYAML "k8s.io/apimachinery/pkg/util/yaml"
13+
)
14+
15+
func NewPrometheusRule(ruleName, namespace string) *monitoringv1.PrometheusRule {
16+
return &monitoringv1.PrometheusRule{
17+
TypeMeta: metav1.TypeMeta{
18+
Kind: monitoringv1.PrometheusRuleKind,
19+
APIVersion: monitoringv1.SchemeGroupVersion.String(),
20+
},
21+
ObjectMeta: metav1.ObjectMeta{
22+
Name: ruleName,
23+
Namespace: namespace,
24+
},
25+
}
26+
}
27+
28+
func RemovePrometheusRule(namespace string, ruleName string) error {
29+
30+
promRule := NewPrometheusRule(ruleName, namespace)
31+
32+
err := sdk.Delete(promRule)
33+
if err != nil && !errors.IsNotFound(err) {
34+
return fmt.Errorf("Failure deleting %v prometheus rule: %v", promRule, err)
35+
}
36+
37+
return nil
38+
}
39+
40+
func NewPrometheusRuleSpecFrom(filePath string) (*monitoringv1.PrometheusRuleSpec, error) {
41+
if err := CheckFileExists(filePath); err != nil {
42+
return nil, err
43+
}
44+
fileContent, err := ioutil.ReadFile(filePath)
45+
if err != nil {
46+
return nil, fmt.Errorf("'%s' not readable", filePath)
47+
}
48+
ruleSpec := monitoringv1.PrometheusRuleSpec{}
49+
if err := k8sYAML.NewYAMLOrJSONDecoder(bytes.NewBufferString(string(fileContent)), 1000).Decode(&ruleSpec); err != nil {
50+
return nil, err
51+
}
52+
return &ruleSpec, nil
53+
}

0 commit comments

Comments
 (0)