Commit b653953

Merge pull request #314 from Sergey1011010/collect-logs-from-openshift-sdn-controller-pod
Bug 1916843: collect logs from openshift-sdn-controller pod
2 parents: 3a8ba58 + d164710

8 files changed, +147 -17 lines

docs/gathered-data.md  (+20)

@@ -317,6 +317,26 @@ Response see https://docs.openshift.com/container-platform/4.6/rest_api/workload
 Location in archive: config/pod/openshift-apiserver-operator/logs/{pod-name}/errors.log


+## OpenshiftSDNControllerLogs
+
+collects logs from sdn-controller pod in openshift-sdn namespace with following substrings:
+- "Node %s is not Ready": A node has been set offline for egress IPs because it is reported not ready at API
+- "Node %s may be offline... retrying": An egress node has failed the egress IP health check once,
+so it has big chances to be marked as offline soon or, at the very least, there has been a connectivity glitch.
+- "Node %s is offline": An egress node has failed enough probes to have been marked offline for egress IPs.
+If it has egress CIDRs assigned, its egress IPs have been moved to other nodes.
+Indicates issues at either the node or the network between the master and the node.
+- "Node %s is back online": This indicates that a node has recovered from the condition described
+at the previous message, by starting succeeding the egress IP health checks.
+Useful just in case that previous “Node %s is offline” messages are lost,
+so that we have a clue that there was failure previously.
+
+The Kubernetes API https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/core/v1/pod_expansion.go#L48
+Response see https://docs.openshift.com/container-platform/4.6/rest_api/workloads_apis/pod-core-v1.html#apiv1namespacesnamespacepodsnamelog
+
+Location in archive: config/pod/openshift-sdn/logs/{pod-name}/errors.log
+
+
 ## OpenshiftSDNLogs

 collects logs from pods in openshift-sdn namespace with following substrings:
New file  (+7)

@@ -0,0 +1,7 @@
+W0120 20:07:54.645321 1 egressip.go:199] Node ci-ln-0d402jb-f76d1-hts4b-worker-c-2nwdl is not Ready
+W0120 20:08:29.646400 1 egressip.go:199] Node ci-ln-0d402jb-f76d1-hts4b-worker-c-2nwdl is not Ready
+I0120 20:08:59.649184 1 egressip.go:208] Node 10.0.32.3 is back online
+I0120 20:13:49.656538 1 egressip.go:219] Node 10.0.32.3 may be offline... retrying
+I0120 20:13:57.710028 1 egressip.go:219] Node 10.0.32.3 may be offline... retrying
+W0120 20:14:12.749977 1 egressip.go:214] Node 10.0.32.3 is offline
+I0120 20:15:29.663808 1 egressip.go:208] Node 10.0.32.3 is back online
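
These sample lines are exactly what the new regex patterns are meant to catch. A quick way to sanity-check the relationship between the documented "Node %s ..." messages and the ".+" wildcards in the patterns is a small standalone program; this is only an illustrative sketch (the pattern strings are copied from GatherOpenshiftSDNControllerLogs further down, the two log lines are hardcoded from the file above), not code from this commit.

package main

import (
    "fmt"
    "regexp"
)

func main() {
    // Patterns used by GatherOpenshiftSDNControllerLogs (see the new gatherer below).
    patterns := []string{
        "Node.+is not Ready",
        "Node.+may be offline\\.\\.\\. retrying",
        "Node.+is offline",
        "Node.+is back online",
    }

    // Two of the sample log lines from the test data above.
    lines := []string{
        "W0120 20:07:54.645321 1 egressip.go:199] Node ci-ln-0d402jb-f76d1-hts4b-worker-c-2nwdl is not Ready",
        "I0120 20:13:49.656538 1 egressip.go:219] Node 10.0.32.3 may be offline... retrying",
    }

    for _, line := range lines {
        for _, pattern := range patterns {
            matched, err := regexp.MatchString(pattern, line)
            if err != nil {
                panic(err)
            }
            if matched {
                fmt.Printf("%q matches %q\n", pattern, line)
            }
        }
    }
}

Running it prints one match per line, which shows that each "%s" placeholder in the documented messages lines up with a ".+" wildcard in the corresponding pattern.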

pkg/gather/clusterconfig/0_gatherer.go  (+1)

@@ -71,6 +71,7 @@ var gatherFunctions = map[string]gatherFunction{
     "netnamespaces":                      GatherNetNamespace,
     "openshift_apiserver_operator_logs":  GatherOpenShiftAPIServerOperatorLogs,
     "openshift_sdn_logs":                 GatherOpenshiftSDNLogs,
+    "openshift_sdn_controller_logs":      GatherOpenshiftSDNControllerLogs,
 }

 // New creates new Gatherer
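
Registering the gatherer is the whole integration step: one new key in the gatherFunctions map. Below is a minimal, self-contained sketch of that registry pattern; the gatherFunction signature is simplified here (the real one takes a *Gatherer and a result channel), so treat these types as stand-ins rather than the operator's own.

package main

import "fmt"

// Simplified stand-in for the operator's gatherFunction type.
type gatherFunction func() string

// A name -> function registry in the same spirit as gatherFunctions above.
var gatherFunctions = map[string]gatherFunction{
    "openshift_sdn_logs":            func() string { return "logs from app=sdn pods" },
    "openshift_sdn_controller_logs": func() string { return "logs from app=sdn-controller pods" },
}

func main() {
    // Enabling a gatherer is a matter of adding one map entry and then
    // requesting it by name, e.g. from configuration.
    for _, name := range []string{"openshift_sdn_controller_logs", "unknown_gatherer"} {
        gather, ok := gatherFunctions[name]
        if !ok {
            fmt.Printf("%s: no such gatherer\n", name)
            continue
        }
        fmt.Printf("%s: %s\n", name, gather())
    }
}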

pkg/gather/clusterconfig/gather_logs.go  (+21 -5)

@@ -4,6 +4,7 @@ import (
     "bufio"
     "context"
     "fmt"
+    "regexp"
     "strings"

     corev1 "k8s.io/api/core/v1"
@@ -21,7 +22,9 @@ import (
 // - limitBytes sets the maximum amount of logs that can be fetched
 // - logFileName sets the name of the file to save logs to.
 // - labelSelector allows you to filter pods by their labels
-// Actual location is `config/pod/{namespace}/logs/{podName}/{fileName}.log`
+// - regexSearch makes messagesToSearch regex patterns, so you can accomplish more complicated search
+//
+// Location of the logs is `config/pod/{namespace}/logs/{podName}/{fileName}.log`
 func gatherLogsFromPodsInNamespace(
     ctx context.Context,
     coreClient v1.CoreV1Interface,
@@ -31,6 +34,7 @@ func gatherLogsFromPodsInNamespace(
     limitBytes int64,
     logFileName string,
     labelSelector string,
+    regexSearch bool,
 ) ([]record.Record, error) {
     pods, err := coreClient.Pods(namespace).List(ctx, metav1.ListOptions{
         LabelSelector: labelSelector,
@@ -49,7 +53,7 @@ func gatherLogsFromPodsInNamespace(
             LimitBytes: &limitBytes,
         })

-        logs, err := filterLogs(ctx, request, messagesToSearch)
+        logs, err := filterLogs(ctx, request, messagesToSearch, regexSearch)
         if err != nil {
             return nil, err
         }
@@ -70,7 +74,9 @@ func gatherLogsFromPodsInNamespace(
     return records, nil
 }

-func filterLogs(ctx context.Context, request *restclient.Request, messagesToSearch []string) (string, error) {
+func filterLogs(
+    ctx context.Context, request *restclient.Request, messagesToSearch []string, regexSearch bool,
+) (string, error) {
     stream, err := request.Stream(ctx)
     if err != nil {
         return "", err
@@ -90,8 +96,18 @@ func filterLogs(ctx context.Context, request *restclient.Request, messagesToSear
     for scanner.Scan() {
         line := scanner.Text()
         for _, messageToSearch := range messagesToSearch {
-            if strings.Contains(strings.ToLower(line), strings.ToLower(messageToSearch)) {
-                result += line + "\n"
+            if regexSearch {
+                matches, err := regexp.MatchString(messageToSearch, line)
+                if err != nil {
+                    return "", err
+                }
+                if matches {
+                    result += line + "\n"
+                }
+            } else {
+                if strings.Contains(strings.ToLower(line), strings.ToLower(messageToSearch)) {
+                    result += line + "\n"
+                }
             }
         }
     }
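
The regex branch above calls regexp.MatchString inside the per-line loop, which recompiles each pattern for every scanned line. A hedged sketch of an alternative that compiles the patterns once before scanning is below; filterWithCompiledPatterns is hypothetical and not part of this commit, it only mirrors the matching behaviour on an in-memory reader instead of a pod log stream.

package main

import (
    "bufio"
    "fmt"
    "regexp"
    "strings"
)

// filterWithCompiledPatterns is a hypothetical variant of the regex branch in
// filterLogs: the patterns are compiled once up front and then matched against
// every scanned line.
func filterWithCompiledPatterns(scanner *bufio.Scanner, patterns []string) (string, error) {
    compiled := make([]*regexp.Regexp, 0, len(patterns))
    for _, p := range patterns {
        re, err := regexp.Compile(p)
        if err != nil {
            return "", err
        }
        compiled = append(compiled, re)
    }

    var result string
    for scanner.Scan() {
        line := scanner.Text()
        for _, re := range compiled {
            if re.MatchString(line) {
                result += line + "\n"
            }
        }
    }
    return result, scanner.Err()
}

func main() {
    logs := "W0120 20:14:12.749977 1 egressip.go:214] Node 10.0.32.3 is offline\n" +
        "I0120 20:15:29.663808 1 egressip.go:208] Node 10.0.32.3 is back online\n"
    out, err := filterWithCompiledPatterns(bufio.NewScanner(strings.NewReader(logs)), []string{"Node.+is offline"})
    if err != nil {
        panic(err)
    }
    fmt.Print(out) // prints only the "is offline" line
}

Precompiling keeps the behaviour of the committed code while doing the compilation once per pattern instead of once per pattern per line; the commit keeps the simpler regexp.MatchString form.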

pkg/gather/clusterconfig/gather_logs_test.go  (+39 -12)

@@ -11,31 +11,29 @@ import (
     kubefake "k8s.io/client-go/kubernetes/fake"
 )

-func TestGatherLogs(t *testing.T) {
-    const testPodNamespace = "pod-namespace"
+func testGatherLogs(t *testing.T, regexSearch bool, stringToSearch string, shouldExist bool) {
+    const testPodName = "test"
     const testLogFileName = "errors"
-    // there's no way to specify logs fake pod will have, so we can only search for a hardcoded string "fake logs"
-    const stringToSearch = "fake logs"

     coreClient := kubefake.NewSimpleClientset().CoreV1()
     ctx := context.Background()

-    _, err := coreClient.Pods(testPodNamespace).Create(
+    _, err := coreClient.Pods(testPodName).Create(
         ctx,
         &corev1.Pod{
             ObjectMeta: metav1.ObjectMeta{
-                Name:      testPodNamespace,
-                Namespace: testPodNamespace,
+                Name:      testPodName,
+                Namespace: testPodName,
             },
             Status: corev1.PodStatus{
                 Phase: corev1.PodRunning,
                 ContainerStatuses: []corev1.ContainerStatus{
-                    {Name: testPodNamespace},
+                    {Name: testPodName},
                 },
             },
             Spec: corev1.PodSpec{
                 Containers: []corev1.Container{
-                    {Name: testPodNamespace},
+                    {Name: testPodName},
                 },
             },
         },
@@ -48,24 +46,53 @@ func TestGatherLogs(t *testing.T) {
     records, err := gatherLogsFromPodsInNamespace(
         ctx,
         coreClient,
-        testPodNamespace,
+        testPodName,
         []string{
             stringToSearch,
         },
         86400,   // last day
         1024*64, // maximum 64 kb of logs
         testLogFileName,
         "",
+        regexSearch,
     )
     if err != nil {
         t.Fatal(err)
     }

+    if !shouldExist {
+        assert.Len(t, records, 0)
+        return
+    }
+
     assert.Len(t, records, 1)
     assert.Equal(
         t,
-        fmt.Sprintf("config/pod/%s/logs/%s/%s.log", testPodNamespace, testPodNamespace, testLogFileName),
+        fmt.Sprintf("config/pod/%s/logs/%s/%s.log", testPodName, testPodName, testLogFileName),
         records[0].Name,
     )
-    assert.Equal(t, Raw{stringToSearch + "\n"}, records[0].Item)
+    if regexSearch {
+        assert.Regexp(t, stringToSearch, records[0].Item)
+    } else {
+        assert.Equal(t, Raw{stringToSearch + "\n"}, records[0].Item)
+    }
+}
+
+func TestGatherLogs(t *testing.T) {
+    t.Run("SubstringSearch_ShouldExist", func(t *testing.T) {
+        testGatherLogs(t, false, "fake logs", true)
+    })
+    t.Run("SubstringSearch_ShouldNotExist", func(t *testing.T) {
+        testGatherLogs(t, false, "The quick brown fox jumps over the lazy dog", false)
+    })
+    t.Run("SubstringSearch_ShouldNotExist", func(t *testing.T) {
+        testGatherLogs(t, false, "f.*l", false)
+    })
+
+    t.Run("RegexSearch_ShouldExist", func(t *testing.T) {
+        testGatherLogs(t, true, "f.*l", true)
+    })
+    t.Run("RegexSearch_ShouldNotExist", func(t *testing.T) {
+        testGatherLogs(t, true, "[0-9]99", false)
+    })
 }
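
The new TestGatherLogs wrapper registers its five cases by hand, which is how two of them end up sharing the name SubstringSearch_ShouldNotExist. A table-driven form is the usual Go idiom for this; the sketch below is hypothetical (TestGatherLogsTableDriven is not in the commit) and only reuses testGatherLogs with the same inputs, with the third case renamed so every subtest name stays unique.

package clusterconfig

import "testing"

// Hypothetical table-driven variant of the TestGatherLogs wrapper above.
func TestGatherLogsTableDriven(t *testing.T) {
    cases := []struct {
        name           string
        regexSearch    bool
        stringToSearch string
        shouldExist    bool
    }{
        {"SubstringSearch_ShouldExist", false, "fake logs", true},
        {"SubstringSearch_ShouldNotExist", false, "The quick brown fox jumps over the lazy dog", false},
        {"SubstringSearch_RegexIsNotInterpreted", false, "f.*l", false},
        {"RegexSearch_ShouldExist", true, "f.*l", true},
        {"RegexSearch_ShouldNotExist", true, "[0-9]99", false},
    }
    for _, tc := range cases {
        t.Run(tc.name, func(t *testing.T) {
            testGatherLogs(t, tc.regexSearch, tc.stringToSearch, tc.shouldExist)
        })
    }
}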

pkg/gather/clusterconfig/openshift_apiserver_operator_logs.go  (+1)

@@ -36,6 +36,7 @@ func GatherOpenShiftAPIServerOperatorLogs(g *Gatherer, c chan<- gatherResult) {
         1024*64, // maximum 64 kb of logs
         "errors",
         "app=openshift-apiserver-operator",
+        false,
     )
     if err != nil {
         c <- gatherResult{nil, []error{err}}
New file  (+57)

@@ -0,0 +1,57 @@
+package clusterconfig
+
+import (
+    "k8s.io/client-go/kubernetes"
+)
+
+// GatherOpenshiftSDNControllerLogs collects logs from sdn-controller pod in openshift-sdn namespace with following substrings:
+// - "Node %s is not Ready": A node has been set offline for egress IPs because it is reported not ready at API
+// - "Node %s may be offline... retrying": An egress node has failed the egress IP health check once,
+// so it has big chances to be marked as offline soon or, at the very least, there has been a connectivity glitch.
+// - "Node %s is offline": An egress node has failed enough probes to have been marked offline for egress IPs.
+// If it has egress CIDRs assigned, its egress IPs have been moved to other nodes.
+// Indicates issues at either the node or the network between the master and the node.
+// - "Node %s is back online": This indicates that a node has recovered from the condition described
+// at the previous message, by starting succeeding the egress IP health checks.
+// Useful just in case that previous “Node %s is offline” messages are lost,
+// so that we have a clue that there was failure previously.
+//
+// The Kubernetes API https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/core/v1/pod_expansion.go#L48
+// Response see https://docs.openshift.com/container-platform/4.6/rest_api/workloads_apis/pod-core-v1.html#apiv1namespacesnamespacepodsnamelog
+//
+// Location in archive: config/pod/openshift-sdn/logs/{pod-name}/errors.log
+func GatherOpenshiftSDNControllerLogs(g *Gatherer, c chan<- gatherResult) {
+    messagesToSearch := []string{
+        "Node.+is not Ready",
+        "Node.+may be offline\\.\\.\\. retrying",
+        "Node.+is offline",
+        "Node.+is back online",
+    }
+
+    gatherKubeClient, err := kubernetes.NewForConfig(g.gatherProtoKubeConfig)
+    if err != nil {
+        c <- gatherResult{nil, []error{err}}
+        return
+    }
+
+    coreClient := gatherKubeClient.CoreV1()
+
+    records, err := gatherLogsFromPodsInNamespace(
+        g.ctx,
+        coreClient,
+        "openshift-sdn",
+        messagesToSearch,
+        86400,   // last day
+        1024*64, // maximum 64 kb of logs
+        "errors",
+        "app=sdn-controller",
+        true,
+    )
+    if err != nil {
+        c <- gatherResult{nil, []error{err}}
+        return
+    }
+
+    c <- gatherResult{records, nil}
+    return
+}
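
Each gather function reports through the channel rather than a return value, sending exactly one gatherResult whether it succeeds or fails. A minimal sketch of that contract is below, assuming a hypothetical helper in the same clusterconfig package so that Gatherer and gatherResult are in scope; runGatherer is not part of this commit.

// runGatherer shows the channel contract the gather functions follow.
func runGatherer(g *Gatherer, gather func(*Gatherer, chan<- gatherResult)) gatherResult {
    c := make(chan gatherResult, 1) // buffered so the gatherer never blocks in this sketch
    gather(g, c)                    // e.g. runGatherer(g, GatherOpenshiftSDNControllerLogs)
    return <-c                      // each gatherer sends exactly one result
}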

pkg/gather/clusterconfig/openshift_sdn_logs.go  (+1)

@@ -40,6 +40,7 @@ func GatherOpenshiftSDNLogs(g *Gatherer, c chan<- gatherResult) {
         1024*64, // maximum 64 kb of logs
         "errors",
         "app=sdn",
+        false,
     )
     if err != nil {
         c <- gatherResult{nil, []error{err}}
