feat: alertmanager conditional log gathering #545

Merged
2 changes: 1 addition & 1 deletion .golangci.yml
@@ -89,7 +89,6 @@ linters:
- misspell
- nakedret
- noctx
- nolintlint
- rowserrcheck
- staticcheck
- structcheck
@@ -117,6 +116,7 @@ linters:
# - revive
# - wsl
# - gomnd
# - nolintlint

issues:
# Excluding configuration per-path, per-linter, per-text and per-source
45 changes: 31 additions & 14 deletions README.md
@@ -1,6 +1,8 @@
# Insights Operator

This cluster operator gathers anonymized system configuration and reports it to Red Hat Insights. It is a part of the standard OpenShift distribution. The data collected allows for debugging in the event of cluster failures or unanticipated errors.
This cluster operator gathers anonymized system configuration and reports it to Red Hat Insights. It is a part of the
standard OpenShift distribution. The data collected allows for debugging in the event of cluster failures or
unanticipated errors.

# Table of Contents

@@ -58,7 +60,8 @@ Unit tests can be started by the following command:
make test
```

It is also possible to specify CLI options for Go test. For example, if you need to disable test results caching, use the following command:
It is also possible to specify CLI options for Go test. For example, if you need to disable test results caching,
use the following command:

```shell script
VERBOSE=-count=1 make test
@@ -69,7 +72,9 @@ VERBOSE=-count=1 make test
# Documentation


The document [docs/gathered-data](docs/gathered-data.md) contains the list of collected data and the API that is used to collect it. This documentation is generated by the command bellow, by collecting the comment tags located above each Gather method.
The document [docs/gathered-data](docs/gathered-data.md) contains the list of collected data and the API that is used
to collect it. This documentation is generated by the command below, by collecting the comment tags located above
each Gather method.
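
As an illustration of those comment tags, the `AlertmanagerLogs` entry added to `docs/gathered-data.md` later in this PR
would come from a doc comment roughly like the sketch below. Only the comment text mirrors the generated entry; the
method signature is an assumption, since the real implementation is not shown in this diff.

```go
package conditional

// GatherAlertmanagerLogs collects alertmanager logs for pods firing one of the
// configured alerts.
//
// * Location in archive: conditional/namespaces/<namespace>/pods/<pod>/containers/<container>/logs/last-{i}-lines.log
// * Id in config: alertmanager_logs
// * Since versions:
//   * 4.10+
//
// The receiver and (empty) parameter list below are illustrative only; the real
// method and its parameters are defined elsewhere in this PR.
func (g *Gatherer) GatherAlertmanagerLogs() {}
```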

To start generating the document run:

@@ -81,23 +86,29 @@ make docs

## Generate the certificate and key

Certificate and key are required to access Prometheus metrics (instead 404 Forbidden is returned). It is possible to generate these two files from Kubernetes config file. Certificate is stored in `users/admin/client-cerfificate-data` and key in `users/admin/client-key-data`. Please note that these values are encoded by using Base64 encoding, so it is needed to decode them, for example by `base64 -d`.
A certificate and key are required to access Prometheus metrics (otherwise 404 Forbidden is returned). It is possible
to generate these two files from the Kubernetes config file. The certificate is stored in `users/admin/client-certificate-data`
and the key in `users/admin/client-key-data`. Please note that these values are Base64 encoded,
so they need to be decoded, for example with `base64 -d`.

There's a tool named `gen_cert_key.py` that can be used to automatically generate both files. It is stored in `tools` subdirectory.
There's a tool named `gen_cert_key.py` that can be used to automatically generate both files. It is stored in the `tools`
subdirectory.

```shell script
gen_cert_key.py kubeconfig.yaml
```
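
If Python is not available, the same extraction can be done with a short Go program. The following is a minimal sketch
only, not part of the repository: it assumes the standard kubeconfig layout (`users[*].user.client-certificate-data` /
`client-key-data`) and uses `gopkg.in/yaml.v3` for parsing.

```go
// Illustrative sketch: decode the admin client certificate and key from a kubeconfig.
package main

import (
	"encoding/base64"
	"fmt"
	"log"
	"os"

	"gopkg.in/yaml.v3"
)

type kubeconfig struct {
	Users []struct {
		Name string `yaml:"name"`
		User struct {
			ClientCertificateData string `yaml:"client-certificate-data"`
			ClientKeyData         string `yaml:"client-key-data"`
		} `yaml:"user"`
	} `yaml:"users"`
}

func main() {
	raw, err := os.ReadFile(os.Args[1]) // e.g. kubeconfig.yaml
	if err != nil {
		log.Fatal(err)
	}
	var cfg kubeconfig
	if err := yaml.Unmarshal(raw, &cfg); err != nil {
		log.Fatal(err)
	}
	for _, u := range cfg.Users {
		if u.User.ClientCertificateData == "" {
			continue
		}
		// kubeconfig stores both values Base64 encoded.
		cert, err := base64.StdEncoding.DecodeString(u.User.ClientCertificateData)
		if err != nil {
			log.Fatal(err)
		}
		key, err := base64.StdEncoding.DecodeString(u.User.ClientKeyData)
		if err != nil {
			log.Fatal(err)
		}
		if err := os.WriteFile("k8s.crt", cert, 0o600); err != nil {
			log.Fatal(err)
		}
		if err := os.WriteFile("k8s.key", key, 0o600); err != nil {
			log.Fatal(err)
		}
		fmt.Printf("wrote k8s.crt and k8s.key for user %q\n", u.Name)
		return
	}
	log.Fatal("no user with client-certificate-data found")
}
```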

## Prometheus metrics provided by Insights Operator

It is possible to read Prometheus metrics provided by Insights Operator. Example of metrics exposed by Insights Operator can be found at [metrics.txt](docs/metrics.txt)
It is possible to read Prometheus metrics provided by Insights Operator. An example of the metrics exposed by
Insights Operator can be found at [metrics.txt](docs/metrics.txt).

Depending on how or where the IO is running you may have different ways to retrieve the metrics. Here is a list of some options, so you can find the one that fits you:
Depending on how or where the IO is running, you may have different ways to retrieve the metrics.
Here is a list of some options, so you can find the one that fits your setup:

### Running IO locally

If the IO runs locally, the following command migth be used:
If the IO runs locally, the following command might be used:

```shell script
curl --cert k8s.crt --key k8s.key -k https://localhost:8443/metrics
@@ -174,7 +185,8 @@ go tool pprof http://localhost:6060/debug/pprof/profile?seconds=30
go tool pprof http://localhost:6060/debug/pprof/heap
```

These commands will create a compressed file that can be visualized using a variety of tools, one of them is the `pprof` tool.
These commands will create a compressed file that can be visualized using a variety of tools, one of which is
the `pprof` tool.

### Analyzing profiling data

@@ -201,15 +213,17 @@ It uses both the local git and GitHub's API to update the file so:

It can be used 2 ways:

1. Providing no command line arguments the script will update the current `CHANGELOG.md` with the latest changes according to the local git state.
1. Providing no command line arguments, the script will update the current `CHANGELOG.md` with the latest changes
   according to the local git state.

> 🚨 IMPORTANT: It will only work with changelogs created with this script

```shell script
go run cmd/changelog/main.go
```

2. Providing 2 command line arguments, `AFTER` and `UNTIL` dates the script will generate a new `CHANGELOG.md` within the provided time frame.
2. Providing 2 command line arguments, the `AFTER` and `UNTIL` dates, the script will generate a new `CHANGELOG.md` within
   the provided time frame.

```shell script
go run cmd/changelog/main.go 2021-01-10 2021-01-20
@@ -221,15 +235,18 @@ go run cmd/changelog/main.go 2021-01-10 2021-01-20
* ClusterOperator objects
* All non-secret global config (hostnames and URLs anonymized)

The list of all collected data with description, location in produced archive and link to Api and some examples is at [docs/gathered-data.md](docs/gathered-data.md)
The list of all collected data, with descriptions, locations in the produced archive, links to the API, and some examples, is
at [docs/gathered-data.md](docs/gathered-data.md).

The resulting data is packed in `.tar.gz` archive with folder structure indicated in the document. Example of such archive is at [docs/insights-archive-sample](docs/insights-archive-sample).
The resulting data is packed in a `.tar.gz` archive with the folder structure indicated in the document. An example of such an
archive is at [docs/insights-archive-sample](docs/insights-archive-sample).

## Insights Operator Archive

### Sample IO archive

There is a sample IO archive maintained in this repo to use as a quick reference. (can be found at [docs/insights-archive-sample](https://github.com/openshift/insights-operator/tree/master/docs/insights-archive-sample))
There is a sample IO archive maintained in this repo to use as a quick reference (it can be found
at [docs/insights-archive-sample](https://github.com/openshift/insights-operator/tree/master/docs/insights-archive-sample)).

To keep it up-to-date it is **required** to update this manually when developing a new data enhancement.

10 changes: 10 additions & 0 deletions docs/gathered-data.md
@@ -23,6 +23,16 @@ Params is of type AlertIsFiringConditionParams:
* 4.10+


## AlertmanagerLogs

collects alertmanager logs for pods firing one of the configured alerts.

* Location in archive: conditional/namespaces/<namespace>/pods/<pod>/containers/<container>/logs/last-{i}-lines.log
* Id in config: alertmanager_logs
* Since versions:
* 4.10+

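For orientation, here is a tiny illustrative sketch of how such an archive path could be assembled from the firing
alert's labels. The concrete namespace, pod, and container values are assumptions, and it treats `{i}` in the template
as the number of tailed lines, which matches the `TailLines: 50` default rule added by this PR.

```go
// Illustrative sketch only: build the archive path for a gathered log file.
package main

import "fmt"

func main() {
	namespace := "openshift-monitoring" // assumed value of the alert's "namespace" label
	pod := "alertmanager-main-0"        // assumed value of the alert's "pod" label
	container := "alertmanager"         // assumed container name
	tailLines := 50                     // matches TailLines in the default gathering rule

	path := fmt.Sprintf("conditional/namespaces/%s/pods/%s/containers/%s/logs/last-%d-lines.log",
		namespace, pod, container, tailLines)
	fmt.Println(path)
}
```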

## CRD

collects the specified Custom Resource Definitions.
@@ -0,0 +1,29 @@
2021-11-15T09:21:29.542014685Z level=info ts=2021-11-15T09:21:29.540Z caller=main.go:216 msg="Starting Alertmanager" version="(version=0.21.0, branch=rhaos-4.7-rhel-8, revision=7d7727749b9e72d483091a58e1a13cb7d4f4fa62)"
2021-11-15T09:21:29.542014685Z level=info ts=2021-11-15T09:21:29.540Z caller=main.go:217 build_context="(go=go1.15.7, user=root@9e3ad46b3963, date=20210609-08:49:37)"
2021-11-15T09:21:29.688729257Z level=warn ts=2021-11-15T09:21:29.685Z caller=cluster.go:228 component=cluster msg="failed to join cluster" err="3 errors occurred:\n\t* Failed to resolve alertmanager-main-0.alertmanager-operated:9094: lookup alertmanager-main-0.alertmanager-operated on 172.30.0.10:53: no such host\n\t* Failed to resolve alertmanager-main-1.alertmanager-operated:9094: lookup alertmanager-main-1.alertmanager-operated on 172.30.0.10:53: no such host\n\t* Failed to resolve alertmanager-main-2.alertmanager-operated:9094: lookup alertmanager-main-2.alertmanager-operated on 172.30.0.10:53: no such host\n\n"
2021-11-15T09:21:29.688729257Z level=info ts=2021-11-15T09:21:29.685Z caller=cluster.go:230 component=cluster msg="will retry joining cluster every 10s"
2021-11-15T09:21:29.688729257Z level=warn ts=2021-11-15T09:21:29.685Z caller=main.go:307 msg="unable to join gossip mesh" err="3 errors occurred:\n\t* Failed to resolve alertmanager-main-0.alertmanager-operated:9094: lookup alertmanager-main-0.alertmanager-operated on 172.30.0.10:53: no such host\n\t* Failed to resolve alertmanager-main-1.alertmanager-operated:9094: lookup alertmanager-main-1.alertmanager-operated on 172.30.0.10:53: no such host\n\t* Failed to resolve alertmanager-main-2.alertmanager-operated:9094: lookup alertmanager-main-2.alertmanager-operated on 172.30.0.10:53: no such host\n\n"
2021-11-15T09:21:29.688729257Z level=info ts=2021-11-15T09:21:29.687Z caller=cluster.go:623 component=cluster msg="Waiting for gossip to settle..." interval=2s
2021-11-15T09:21:29.840749198Z level=info ts=2021-11-15T09:21:29.840Z caller=coordinator.go:119 component=configuration msg="Loading configuration file" file=/etc/alertmanager/config/alertmanager.yaml
2021-11-15T09:21:29.841639033Z level=info ts=2021-11-15T09:21:29.840Z caller=coordinator.go:131 component=configuration msg="Completed loading of configuration file" file=/etc/alertmanager/config/alertmanager.yaml
2021-11-15T09:21:29.844588876Z level=info ts=2021-11-15T09:21:29.844Z caller=main.go:485 msg=Listening address=127.0.0.1:9093
2021-11-15T09:21:31.688010032Z level=info ts=2021-11-15T09:21:31.687Z caller=cluster.go:648 component=cluster msg="gossip not settled" polls=0 before=0 now=1 elapsed=2.000160539s
2021-11-15T09:21:36.069883883Z level=info ts=2021-11-15T09:21:36.065Z caller=coordinator.go:119 component=configuration msg="Loading configuration file" file=/etc/alertmanager/config/alertmanager.yaml
2021-11-15T09:21:36.069883883Z level=info ts=2021-11-15T09:21:36.065Z caller=coordinator.go:131 component=configuration msg="Completed loading of configuration file" file=/etc/alertmanager/config/alertmanager.yaml
2021-11-15T09:21:39.697216107Z level=info ts=2021-11-15T09:21:39.697Z caller=cluster.go:640 component=cluster msg="gossip settled; proceeding" elapsed=10.009414061s
2021-11-15T09:21:44.715090814Z level=warn ts=2021-11-15T09:21:44.715Z caller=cluster.go:438 component=cluster msg=refresh result=failure addr=alertmanager-main-2.alertmanager-operated:9094 err="1 error occurred:\n\t* Failed to resolve alertmanager-main-2.alertmanager-operated:9094: lookup alertmanager-main-2.alertmanager-operated on 172.30.0.10:53: no such host\n\n"
2021-11-15T09:21:59.695862111Z level=warn ts=2021-11-15T09:21:59.695Z caller=cluster.go:438 component=cluster msg=refresh result=failure addr=alertmanager-main-2.alertmanager-operated:9094 err="1 error occurred:\n\t* Failed to resolve alertmanager-main-2.alertmanager-operated:9094: lookup alertmanager-main-2.alertmanager-operated on 172.30.0.10:53: no such host\n\n"
2021-11-16T08:34:38.427677160Z level=info ts=2021-11-16T08:34:38.423Z caller=coordinator.go:119 component=configuration msg="Loading configuration file" file=/etc/alertmanager/config/alertmanager.yaml
2021-11-16T08:34:38.427677160Z level=info ts=2021-11-16T08:34:38.423Z caller=coordinator.go:131 component=configuration msg="Completed loading of configuration file" file=/etc/alertmanager/config/alertmanager.yaml
2021-11-16T08:36:49.115902451Z level=warn ts=2021-11-16T08:36:49.115Z caller=notify.go:674 component=dispatcher receiver=Default integration=webhook[0] msg="Notify attempt failed, will retry later" attempts=1 err="Post \"https://this-endpoint.does/not-exist\": dial tcp 200.160.2.95:443: connect: connection timed out"
2021-11-16T08:36:49.116291870Z level=warn ts=2021-11-16T08:36:49.115Z caller=notify.go:674 component=dispatcher receiver=Default integration=webhook[0] msg="Notify attempt failed, will retry later" attempts=1 err="Post \"https://this-endpoint.does/not-exist\": dial tcp 200.160.2.95:443: connect: connection timed out"
2021-11-16T08:39:38.427419866Z level=error ts=2021-11-16T08:39:38.427Z caller=dispatch.go:309 component=dispatcher msg="Notify for alerts failed" num_alerts=1 err="Default/webhook[0]: notify retry canceled after 4 attempts: Post \"https://this-endpoint.does/not-exist\": context deadline exceeded"
2021-11-16T08:39:38.428637761Z level=error ts=2021-11-16T08:39:38.428Z caller=dispatch.go:309 component=dispatcher msg="Notify for alerts failed" num_alerts=1 err="Default/webhook[0]: notify retry canceled after 4 attempts: Post \"https://this-endpoint.does/not-exist\": context deadline exceeded"
2021-11-16T08:41:48.124399641Z level=warn ts=2021-11-16T08:41:48.123Z caller=notify.go:674 component=dispatcher receiver=Default integration=webhook[0] msg="Notify attempt failed, will retry later" attempts=1 err="Post \"https://this-endpoint.does/not-exist\": dial tcp 200.160.2.95:443: connect: connection timed out"
2021-11-16T08:44:38.429245827Z level=error ts=2021-11-16T08:44:38.428Z caller=dispatch.go:309 component=dispatcher msg="Notify for alerts failed" num_alerts=1 err="Default/webhook[0]: notify retry canceled after 4 attempts: Post \"https://this-endpoint.does/not-exist\": context deadline exceeded"
2021-11-16T08:44:44.252018413Z level=warn ts=2021-11-16T08:44:44.251Z caller=notify.go:674 component=dispatcher receiver=Default integration=webhook[0] msg="Notify attempt failed, will retry later" attempts=1 err="Post \"https://this-endpoint.does/not-exist\": dial tcp 200.160.2.95:443: connect: connection timed out"
2021-11-16T08:44:44.252018413Z level=warn ts=2021-11-16T08:44:44.251Z caller=notify.go:674 component=dispatcher receiver=Critical integration=webhook[0] msg="Notify attempt failed, will retry later" attempts=1 err="Post \"https://this-endpoint.does/not-exist\": dial tcp 200.160.2.95:443: connect: connection timed out"
2021-11-16T08:46:47.131868802Z level=warn ts=2021-11-16T08:46:47.131Z caller=notify.go:674 component=dispatcher receiver=Default integration=webhook[0] msg="Notify attempt failed, will retry later" attempts=1 err="Post \"https://this-endpoint.does/not-exist\": dial tcp 200.160.2.95:443: connect: connection timed out"
2021-11-16T08:47:34.490605785Z level=warn ts=2021-11-16T08:47:34.490Z caller=notify.go:674 component=dispatcher receiver=Default integration=webhook[0] msg="Notify attempt failed, will retry later" attempts=3 err="Post \"https://this-endpoint.does/not-exist\": dial tcp 200.160.2.95:443: i/o timeout"
2021-11-16T08:47:34.490869829Z level=error ts=2021-11-16T08:47:34.490Z caller=dispatch.go:309 component=dispatcher msg="Notify for alerts failed" num_alerts=1 err="Default/webhook[0]: notify retry canceled after 5 attempts: Post \"https://this-endpoint.does/not-exist\": context deadline exceeded"
2021-11-16T08:47:34.495474536Z level=error ts=2021-11-16T08:47:34.495Z caller=dispatch.go:309 component=dispatcher msg="Notify for alerts failed" num_alerts=1 err="Critical/webhook[0]: notify retry canceled after 4 attempts: Post \"https://this-endpoint.does/not-exist\": context deadline exceeded"
27 changes: 27 additions & 0 deletions pkg/gatherers/conditional/common.go
@@ -0,0 +1,27 @@
package conditional

import (
"fmt"

"k8s.io/klog/v2"
)

func getAlertPodName(labels AlertLabels) (string, error) {
name, ok := labels["pod"]
if !ok {
newErr := fmt.Errorf("alert is missing 'pod' label")
klog.Warningln(newErr.Error())
return "", newErr
}
return name, nil
}

func getAlertPodNamespace(labels AlertLabels) (string, error) {
namespace, ok := labels["namespace"]
if !ok {
newErr := fmt.Errorf("alert is missing 'namespace' label")
klog.Warningln(newErr.Error())
return "", newErr
}
return namespace, nil
}
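
A minimal usage sketch for the two helpers above (not part of this PR): a hypothetical function in the same package
that resolves the gathering target from a firing alert's labels.

```go
// resolveAlertTarget is an illustrative helper showing how the two functions
// above can be combined: given the labels of a firing alert, it returns the
// namespace and pod whose container logs should be gathered.
func resolveAlertTarget(labels AlertLabels) (namespace, pod string, err error) {
	if namespace, err = getAlertPodNamespace(labels); err != nil {
		return "", "", err
	}
	if pod, err = getAlertPodName(labels); err != nil {
		return "", "", err
	}
	return namespace, pod, nil
}
```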
73 changes: 73 additions & 0 deletions pkg/gatherers/conditional/common_test.go
@@ -0,0 +1,73 @@
package conditional

import "testing"

// nolint:dupl
func Test_getAlertPodName(t *testing.T) {
tests := []struct {
name string
labels AlertLabels
want string
wantErr bool
}{
{
name: "Pod name exists",
labels: AlertLabels{"pod": "test-name"},
want: "test-name",
wantErr: false,
},
{
name: "Pod name does not exists",
labels: AlertLabels{},
want: "",
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := getAlertPodName(tt.labels)
if (err != nil) != tt.wantErr {
t.Errorf("getAlertPodName() error = %v, wantErr %v", err, tt.wantErr)
return
}
if got != tt.want {
t.Errorf("getAlertPodName() got = %v, want %v", got, tt.want)
}
})
}
}

// nolint:dupl
func Test_getAlertPodNamespace(t *testing.T) {
tests := []struct {
name string
labels AlertLabels
want string
wantErr bool
}{
{
name: "Pod namemespace exists",
labels: AlertLabels{"namespace": "test-namespace"},
want: "test-namespace",
wantErr: false,
},
{
name: "Pod namespace does not exists",
labels: AlertLabels{},
want: "",
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := getAlertPodNamespace(tt.labels)
if (err != nil) != tt.wantErr {
t.Errorf("getAlertPodNamespace() error = %v, wantErr %v", err, tt.wantErr)
return
}
if got != tt.want {
t.Errorf("getAlertPodNamespace() got = %v, want %v", got, tt.want)
}
})
}
}
21 changes: 21 additions & 0 deletions pkg/gatherers/conditional/conditional_gatherer.go
@@ -29,6 +29,7 @@ var gatheringFunctionBuilders = map[GatheringFunctionName]GathererFunctionBuilde
GatherImageStreamsOfNamespace: (*Gatherer).BuildGatherImageStreamsOfNamespace,
GatherAPIRequestCounts: (*Gatherer).BuildGatherAPIRequestCounts,
GatherLogsOfUnhealthyPods: (*Gatherer).BuildGatherLogsOfUnhealthyPods,
GatherAlertmanagerLogs: (*Gatherer).BuildGatherAlertmanagerLogs,
}

// gatheringRules contains all the rules used to run conditional gatherings.
@@ -60,6 +61,7 @@ var gatheringFunctionBuilders = map[GatheringFunctionName]GathererFunctionBuilde
// per container only if cluster version is 4.8 (not implemented, just an example of possible use) and alert
// ClusterVersionOperatorIsDown is firing
var defaultGatheringRules = []GatheringRule{
// GatherImageStreamsOfNamespace
{
Conditions: []ConditionWithParams{
{
@@ -79,6 +81,7 @@ var defaultGatheringRules = []GatheringRule{
},
},
},
// GatherAPIRequestCounts
Contributor Author: BTW, I added these comments to help identify the conditions for each gatherer. It was a little bit hard for me to read otherwise.

Contributor: Makes sense, and I agree that these lists tend to be difficult to read, especially as they grow longer.

{
Conditions: []ConditionWithParams{
{
@@ -94,6 +97,7 @@ var defaultGatheringRules = []GatheringRule{
},
},
},
// GatherLogsOfUnhealthyPods
{
Conditions: []ConditionWithParams{
{
@@ -128,6 +132,23 @@ var defaultGatheringRules = []GatheringRule{
},
},
},
// GatherAlertmanagerLogs
{
Conditions: []ConditionWithParams{
{
Type: AlertIsFiring,
Alert: &AlertConditionParams{
Name: "AlertmanagerFailedToSendAlerts",
},
},
},
GatheringFunctions: GatheringFunctions{
GatherAlertmanagerLogs: GatherAlertmanagerLogsParams{
AlertName: "AlertmanagerFailedToSendAlerts",
TailLines: 50,
},
},
},
}

const canConditionalGathererFail = false
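The builder referenced as `BuildGatherAlertmanagerLogs` is not part of this diff. As a rough, hedged sketch of the
mechanics behind it, tailing the last N lines of a container's logs with client-go typically looks like the snippet
below; the namespace, pod, container, and kubeconfig path are assumptions, not the operator's real wiring. The sample
log file added above carries a per-line timestamp prefix, which is what `Timestamps: true` produces.

```go
// Illustrative sketch only: fetch the last N lines of a container's logs with
// client-go, roughly what an "alertmanager_logs" conditional gatherer has to do
// once the firing alert's namespace and pod are known.
package main

import (
	"context"
	"fmt"
	"log"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Assumed inputs; in the operator they come from the alert's labels and
	// from GatherAlertmanagerLogsParams (TailLines: 50 in the default rule).
	namespace := "openshift-monitoring"
	pod := "alertmanager-main-0"
	container := "alertmanager"
	tailLines := int64(50)

	config, err := clientcmd.BuildConfigFromFlags("", "kubeconfig.yaml")
	if err != nil {
		log.Fatal(err)
	}
	client, err := kubernetes.NewForConfig(config)
	if err != nil {
		log.Fatal(err)
	}

	req := client.CoreV1().Pods(namespace).GetLogs(pod, &corev1.PodLogOptions{
		Container:  container,
		TailLines:  &tailLines,
		Timestamps: true, // prefix each line with its timestamp, as in the sample archive
	})
	raw, err := req.DoRaw(context.Background())
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s", raw)
}
```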
2 changes: 1 addition & 1 deletion pkg/gatherers/conditional/conditional_gatherer_test.go
@@ -166,7 +166,7 @@ func Test_Gatherer_GatherConditionalGathererRules(t *testing.T) {
err = json.Unmarshal(item, &gotGatheringRules)
assert.NoError(t, err)

assert.Len(t, gotGatheringRules, 4)
assert.Len(t, gotGatheringRules, 5)
}

func newFakeClientWithMetrics(metrics string) *fake.RESTClient {