Commit 22ea9c9

Conditional gatherer of logs of unhealthy pods (#509)
* Cond. gather of unhealthy pods' logs
* Add Previous switch/prop to log msg filter struct
* Add FieldSelector to log gathering helper func
* Add missing fields to gatherer params
* Improve cond. gather of unhealthy pod logs
* Add tests for unhealthy pod logs cond. gatherer
* Update cond. gatherer rules JSON in sample archive
* Simplify unhealthy pod logs cond. gatherer
* Update sample cond. gatherer rules JSON
* Refactor pod log test to satisfy linter
* Make linter even happier (pod log cond. gather)
* Fix linter long lines in pod log cond. gather
* Add alert name pattern to pod log cond gather conf
* Couple of tweaks to unhealthy pod log cond. gather
* Update unhealthy pod log cond. gatherer tests
* Update Gathered Data docs
1 parent 6d9652d commit 22ea9c9

9 files changed: +394 −8 lines

docs/gathered-data.md

+9 lines

@@ -356,6 +356,15 @@ Response see:
   * 4.9+
 
 
+## LogsOfUnhealthyPods
+
+collects either current or previous logs for pods firing one of the configured alerts.
+
+* Location in archive: conditional/namespaces/<namespace>/pods/<pod>/containers/<container>/<logs|logs-previous>/last-<tail length>-lines.log
+* Since versions:
+  * 4.10+
+
+
 ## MachineAutoscalers
 
 collects MachineAutoscalers definition
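For illustration, a single firing instance of a crash-looping alert handled by a rule that requests previous logs with a tail length of 20 (like the KubePodCrashLooping rule added in this commit) would produce a path of roughly this shape; the namespace, pod, and container names here are made up:

conditional/namespaces/openshift-monitoring/pods/prometheus-k8s-0/containers/prometheus/logs-previous/last-20-lines.log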

docs/insights-archive-sample/insights-operator/conditional-gatherer-rules.json

+50 −2 lines

@@ -9,14 +9,62 @@
       }
     ],
     "gathering_functions": {
-      "image_stream_definitions_of_namespace": {
+      "image_streams_of_namespace": {
        "namespace": "openshift-cluster-samples-operator"
      },
      "logs_of_namespace": {
-        "label_selector": "",
        "namespace": "openshift-cluster-samples-operator",
        "tail_lines": 100
      }
    }
+  },
+  {
+    "conditions": [
+      {
+        "type": "alert_is_firing",
+        "params": {
+          "name": "APIRemovedInNextEUSReleaseInUse"
+        }
+      }
+    ],
+    "gathering_functions": {
+      "api_request_counts_of_resource_from_alert": {
+        "alert_name": "APIRemovedInNextEUSReleaseInUse"
+      }
+    }
+  },
+  {
+    "conditions": [
+      {
+        "type": "alert_is_firing",
+        "params": {
+          "name": "KubePodCrashLooping"
+        }
+      }
+    ],
+    "gathering_functions": {
+      "logs_of_unhealthy_pods": {
+        "alert_name": "KubePodCrashLooping",
+        "tail_lines": 20,
+        "previous": true
+      }
+    }
+  },
+  {
+    "conditions": [
+      {
+        "type": "alert_is_firing",
+        "params": {
+          "name": "KubePodNotReady"
+        }
+      }
+    ],
+    "gathering_functions": {
+      "logs_of_unhealthy_pods": {
+        "alert_name": "KubePodNotReady",
+        "tail_lines": 100,
+        "previous": false
+      }
+    }
   }
 ]
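The new logs_of_unhealthy_pods entries carry three parameters: the alert that triggers the gathering, how many log lines to keep, and whether to fetch the previous (pre-restart) container logs. Their JSON keys line up with the Go literal used in the default rules further down; the params type itself is not part of this view, so the field types and tags below are an assumed sketch rather than the actual definition:

type GatherLogsOfUnhealthyPodsParams struct {
    AlertName string `json:"alert_name"` // alert whose labels identify the pod
    TailLines int64  `json:"tail_lines"` // number of log lines to collect
    Previous  bool   `json:"previous"`   // previous (pre-restart) logs instead of current ones
}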

pkg/gatherers/common/gather_logs.go

+4 lines

@@ -21,6 +21,7 @@ import (
 type LogContainersFilter struct {
     Namespace                string
     LabelSelector            string
+    FieldSelector            string
     ContainerNameRegexFilter string
 }
 
@@ -31,6 +32,7 @@ type LogMessagesFilter struct {
     SinceSeconds int64
     LimitBytes   int64
     TailLines    int64
+    Previous     bool
 }
 
 // CollectLogsFromContainers collects logs from containers
@@ -65,6 +67,7 @@ func CollectLogsFromContainers( //nolint:gocyclo
 
     pods, err := coreClient.Pods(containersFilter.Namespace).List(ctx, metav1.ListOptions{
         LabelSelector: containersFilter.LabelSelector,
+        FieldSelector: containersFilter.FieldSelector,
     })
     if err != nil {
         return nil, err
@@ -114,6 +117,7 @@ func CollectLogsFromContainers( //nolint:gocyclo
             SinceSeconds: sinceSeconds,
             LimitBytes:   limitBytes,
             TailLines:    tailLines,
+            Previous:     messagesFilter.Previous,
             Timestamps:   true,
         })
 
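Both additions are thin pass-throughs to client-go: FieldSelector narrows the pod List call (the new gatherer uses it to select a single pod by name), and Previous is copied into PodLogOptions so the API returns the logs of the previously terminated container instead of the running one. A minimal standalone sketch of the same two options, with made-up namespace, pod, and container names:

import (
    "context"
    "fmt"

    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

// previousLogsOfPod lists one pod by field selector and returns the last 20 lines
// of the previous run of its first container. Names are illustrative only.
func previousLogsOfPod(ctx context.Context, client kubernetes.Interface) ([]byte, error) {
    tail := int64(20)
    pods, err := client.CoreV1().Pods("openshift-monitoring").List(ctx, metav1.ListOptions{
        FieldSelector: "metadata.name=prometheus-k8s-0", // select a single pod by name
    })
    if err != nil {
        return nil, err
    }
    if len(pods.Items) == 0 {
        return nil, fmt.Errorf("pod not found")
    }
    pod := pods.Items[0]
    return client.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &corev1.PodLogOptions{
        Container: pod.Spec.Containers[0].Name,
        Previous:  true,  // logs of the previously terminated container
        TailLines: &tail, // keep only the last 20 lines
    }).DoRaw(ctx)
}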

pkg/gatherers/conditional/conditional_gatherer.go

+35 −4 lines

@@ -28,6 +28,7 @@ var gatheringFunctionBuilders = map[GatheringFunctionName]GathererFunctionBuilde
     GatherLogsOfNamespace:         (*Gatherer).BuildGatherLogsOfNamespace,
     GatherImageStreamsOfNamespace: (*Gatherer).BuildGatherImageStreamsOfNamespace,
     GatherAPIRequestCounts:        (*Gatherer).BuildGatherAPIRequestCounts,
+    GatherLogsOfUnhealthyPods:     (*Gatherer).BuildGatherLogsOfUnhealthyPods,
 }
 
 // gatheringRules contains all the rules used to run conditional gatherings.
@@ -93,6 +94,40 @@ var defaultGatheringRules = []GatheringRule{
             },
         },
     },
+    {
+        Conditions: []ConditionWithParams{
+            {
+                Type: AlertIsFiring,
+                Params: AlertIsFiringConditionParams{
+                    Name: "KubePodCrashLooping",
+                },
+            },
+        },
+        GatheringFunctions: GatheringFunctions{
+            GatherLogsOfUnhealthyPods: GatherLogsOfUnhealthyPodsParams{
+                AlertName: "KubePodCrashLooping",
+                TailLines: 20,
+                Previous:  true,
+            },
+        },
+    },
+    {
+        Conditions: []ConditionWithParams{
+            {
+                Type: AlertIsFiring,
+                Params: AlertIsFiringConditionParams{
+                    Name: "KubePodNotReady",
+                },
+            },
+        },
+        GatheringFunctions: GatheringFunctions{
+            GatherLogsOfUnhealthyPods: GatherLogsOfUnhealthyPodsParams{
+                AlertName: "KubePodNotReady",
+                TailLines: 100,
+                Previous:  false,
+            },
+        },
+    },
 }
 
 const canConditionalGathererFail = false
@@ -230,24 +265,20 @@ func (g *Gatherer) updateAlertsCacheFromClient(ctx context.Context, metricsClien
     if err != nil {
         return err
     }
-
     var parser expfmt.TextParser
     metricFamilies, err := parser.TextToMetricFamilies(bytes.NewReader(data))
     if err != nil {
         return err
     }
-
     if len(metricFamilies) > 1 {
         // just log cuz everything would still work
         klog.Warning(logPrefix + "unexpected output from prometheus metrics parser")
     }
-
     metricFamily, found := metricFamilies["ALERTS"]
     if !found {
         klog.Info(logPrefix + "no alerts are firing")
         return nil
     }
-
     for _, metric := range metricFamily.GetMetric() {
         if metric == nil {
             klog.Info(logPrefix + "metric is nil")
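The new gatherer relies on the per-instance labels of the ALERTS metric parsed in updateAlertsCacheFromClient above: namespace, pod and, when the alert provides it, container. A small self-contained sketch of extracting those labels with the same expfmt parser (the sample metric line is made up):

package main

import (
    "bytes"
    "fmt"

    "github.com/prometheus/common/expfmt"
)

func main() {
    // One made-up firing alert in Prometheus text exposition format.
    payload := []byte(`ALERTS{alertname="KubePodCrashLooping",alertstate="firing",namespace="ns-a",pod="pod-a",container="app"} 1` + "\n")

    var parser expfmt.TextParser
    families, err := parser.TextToMetricFamilies(bytes.NewReader(payload))
    if err != nil {
        panic(err)
    }
    for _, metric := range families["ALERTS"].GetMetric() {
        labels := map[string]string{}
        for _, pair := range metric.GetLabel() {
            labels[pair.GetName()] = pair.GetValue()
        }
        // These are the labels the logs_of_unhealthy_pods gatherer reads per alert instance.
        fmt.Println(labels["alertname"], labels["namespace"], labels["pod"], labels["container"])
    }
}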

pkg/gatherers/conditional/conditional_gatherer_test.go

+1 −1 lines

@@ -166,7 +166,7 @@ func Test_Gatherer_GatherConditionalGathererRules(t *testing.T) {
     err = json.Unmarshal(item, &gotGatheringRules)
     assert.NoError(t, err)
 
-    assert.Len(t, gotGatheringRules, 2)
+    assert.Len(t, gotGatheringRules, 4)
 }
 
 func newFakeClientWithMetrics(metrics string) *fake.RESTClient {
New file (+110 lines)

@@ -0,0 +1,110 @@
+package conditional
+
+import (
+    "context"
+    "fmt"
+
+    "k8s.io/client-go/kubernetes"
+    v1 "k8s.io/client-go/kubernetes/typed/core/v1"
+    "k8s.io/klog/v2"
+
+    "github.com/openshift/insights-operator/pkg/gatherers"
+    "github.com/openshift/insights-operator/pkg/gatherers/common"
+    "github.com/openshift/insights-operator/pkg/record"
+)
+
+// BuildGatherLogsOfUnhealthyPods collects either current or previous logs for pods firing one of the configured alerts.
+//
+// * Location in archive: conditional/namespaces/<namespace>/pods/<pod>/containers/<container>/<logs|logs-previous>/last-<tail length>-lines.log
+// * Since versions:
+//   * 4.10+
+func (g *Gatherer) BuildGatherLogsOfUnhealthyPods(paramsInterface interface{}) (gatherers.GatheringClosure, error) {
+    params, ok := paramsInterface.(GatherLogsOfUnhealthyPodsParams)
+    if !ok {
+        return gatherers.GatheringClosure{}, fmt.Errorf(
+            "unexpected type in paramsInterface, expected %T, got %T",
+            GatherLogsOfUnhealthyPodsParams{}, paramsInterface,
+        )
+    }
+
+    return gatherers.GatheringClosure{
+        Run: func(ctx context.Context) ([]record.Record, []error) {
+            kubeClient, err := kubernetes.NewForConfig(g.gatherProtoKubeConfig)
+            if err != nil {
+                return nil, []error{err}
+            }
+            return g.gatherLogsOfUnhealthyPods(ctx, kubeClient.CoreV1(), params)
+        },
+        CanFail: canConditionalGathererFail,
+    }, nil
+}
+
+func (g *Gatherer) gatherLogsOfUnhealthyPods(
+    ctx context.Context, coreClient v1.CoreV1Interface, params GatherLogsOfUnhealthyPodsParams,
+) ([]record.Record, []error) {
+    errs := []error{}
+    records := []record.Record{}
+
+    alertInstances, ok := g.firingAlerts[params.AlertName]
+    if !ok {
+        return nil, []error{fmt.Errorf("conditional gatherer triggered, but specified alert %q is not firing", params.AlertName)}
+    }
+    for _, alertLabels := range alertInstances {
+        alertNamespace, ok := alertLabels["namespace"]
+        if !ok {
+            newErr := fmt.Errorf("alert is missing 'namespace' label")
+            klog.Warningln(newErr.Error())
+            errs = append(errs, newErr)
+            continue
+        }
+        alertPod, ok := alertLabels["pod"]
+        if !ok {
+            newErr := fmt.Errorf("alert is missing 'pod' label")
+            klog.Warningln(newErr.Error())
+            errs = append(errs, newErr)
+            continue
+        }
+        // The container label may not be present for all alerts (e.g., KubePodNotReady).
+        containerFilter := ""
+        if alertContainer, ok := alertLabels["container"]; ok && alertContainer != "" {
+            containerFilter = fmt.Sprintf("^%s$", alertContainer)
+        }
+
+        logRecords, err := common.CollectLogsFromContainers(ctx, coreClient,
+            common.LogContainersFilter{
+                Namespace:                alertNamespace,
+                FieldSelector:            fmt.Sprintf("metadata.name=%s", alertPod),
+                ContainerNameRegexFilter: containerFilter,
+            },
+            common.LogMessagesFilter{
+                TailLines: params.TailLines,
+                Previous:  params.Previous,
+            },
+            func(namespace string, podName string, containerName string) string {
+                logDirName := "logs"
+                if params.Previous {
+                    logDirName = "logs-previous"
+                }
+                return fmt.Sprintf(
+                    "%s/namespaces/%s/pods/%s/containers/%s/%s/last-%d-lines.log",
+                    g.GetName(),
+                    namespace,
+                    podName,
+                    containerName,
+                    logDirName,
+                    params.TailLines,
+                )
+            })
+        if err != nil {
+            // This can happen when the pod is destroyed but the alert still exists.
+            newErr := fmt.Errorf("unable to get container logs: %v", err)
+            klog.Warningln(newErr.Error())
+            errs = append(errs, newErr)
+            continue
+        }
+
+        records = append(records, logRecords...)
+    }
+
+    return records, errs
+}
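The commit also adds tests for this gatherer, not all of which are shown in this view. As a rough in-package sketch of how gatherLogsOfUnhealthyPods could be driven with client-go's fake clientset: the firingAlerts layout (alert name to a slice of label maps) and the AlertLabels type name are inferred from the code above, so treat this as illustrative rather than the actual test added here:

import (
    "context"
    "testing"

    "github.com/stretchr/testify/assert"
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    kubefake "k8s.io/client-go/kubernetes/fake"
)

func Test_GatherLogsOfUnhealthyPods_sketch(t *testing.T) {
    // Fake clientset with a single pod that the (hypothetical) alert points at.
    coreClient := kubefake.NewSimpleClientset(&corev1.Pod{
        ObjectMeta: metav1.ObjectMeta{Name: "pod-a", Namespace: "ns-a"},
        Spec:       corev1.PodSpec{Containers: []corev1.Container{{Name: "app"}}},
    }).CoreV1()

    // One firing KubePodCrashLooping instance; AlertLabels is assumed to be a label map.
    g := &Gatherer{
        firingAlerts: map[string][]AlertLabels{
            "KubePodCrashLooping": {{"namespace": "ns-a", "pod": "pod-a", "container": "app"}},
        },
    }

    records, errs := g.gatherLogsOfUnhealthyPods(context.Background(), coreClient, GatherLogsOfUnhealthyPodsParams{
        AlertName: "KubePodCrashLooping",
        TailLines: 20,
        Previous:  true,
    })
    assert.Empty(t, errs)
    assert.Len(t, records, 1)
}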
