Bug 1916843: collect logs from openshift-sdn-controller pod #314

Merged
20 changes: 20 additions & 0 deletions docs/gathered-data.md
@@ -317,6 +317,26 @@ Response see https://docs.openshift.com/container-platform/4.6/rest_api/workload
Location in archive: config/pod/openshift-apiserver-operator/logs/{pod-name}/errors.log


## OpenshiftSDNControllerLogs

collects logs from the sdn-controller pod in the openshift-sdn namespace with the following substrings:
- "Node %s is not Ready": A node has been set offline for egress IPs because it is reported not ready at the API.
- "Node %s may be offline... retrying": An egress node has failed the egress IP health check once,
so it is likely to be marked offline soon or, at the very least, there has been a connectivity glitch.
- "Node %s is offline": An egress node has failed enough probes to be marked offline for egress IPs.
If it has egress CIDRs assigned, its egress IPs have been moved to other nodes.
This indicates an issue with either the node or the network between the master and the node.
- "Node %s is back online": The node has recovered from the condition described by the previous message
and is passing the egress IP health checks again.
Useful in case the previous "Node %s is offline" messages were lost,
so that we still have a clue that there was a failure.

The Kubernetes API https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/core/v1/pod_expansion.go#L48
Response see https://docs.openshift.com/container-platform/4.6/rest_api/workloads_apis/pod-core-v1.html#apiv1namespacesnamespacepodsnamelog

Location in archive: config/pod/openshift-sdn/logs/{pod-name}/errors.log


## OpenshiftSDNLogs

collects logs from pods in openshift-sdn namespace with following substrings:
@@ -0,0 +1,7 @@
W0120 20:07:54.645321 1 egressip.go:199] Node ci-ln-0d402jb-f76d1-hts4b-worker-c-2nwdl is not Ready
W0120 20:08:29.646400 1 egressip.go:199] Node ci-ln-0d402jb-f76d1-hts4b-worker-c-2nwdl is not Ready
I0120 20:08:59.649184 1 egressip.go:208] Node 10.0.32.3 is back online
I0120 20:13:49.656538 1 egressip.go:219] Node 10.0.32.3 may be offline... retrying
I0120 20:13:57.710028 1 egressip.go:219] Node 10.0.32.3 may be offline... retrying
W0120 20:14:12.749977 1 egressip.go:214] Node 10.0.32.3 is offline
I0120 20:15:29.663808 1 egressip.go:208] Node 10.0.32.3 is back online
1 change: 1 addition & 0 deletions pkg/gather/clusterconfig/0_gatherer.go
@@ -71,6 +71,7 @@ var gatherFunctions = map[string]gatherFunction{
"netnamespaces": GatherNetNamespace,
"openshift_apiserver_operator_logs": GatherOpenShiftAPIServerOperatorLogs,
"openshift_sdn_logs": GatherOpenshiftSDNLogs,
"openshift_sdn_controller_logs": GatherOpenshiftSDNControllerLogs,
}

// New creates new Gatherer
26 changes: 21 additions & 5 deletions pkg/gather/clusterconfig/gather_logs.go
@@ -4,6 +4,7 @@ import (
"bufio"
"context"
"fmt"
"regexp"
"strings"

corev1 "k8s.io/api/core/v1"
@@ -21,7 +22,9 @@ import (
// - limitBytes sets the maximum amount of logs that can be fetched
// - logFileName sets the name of the file to save logs to.
// - labelSelector allows you to filter pods by their labels
// Actual location is `config/pod/{namespace}/logs/{podName}/{fileName}.log`
// - regexSearch treats messagesToSearch as regular expression patterns, enabling more complex searches
//
// Location of the logs is `config/pod/{namespace}/logs/{podName}/{fileName}.log`
func gatherLogsFromPodsInNamespace(
ctx context.Context,
coreClient v1.CoreV1Interface,
Expand All @@ -31,6 +34,7 @@ func gatherLogsFromPodsInNamespace(
limitBytes int64,
logFileName string,
labelSelector string,
regexSearch bool,
) ([]record.Record, error) {
pods, err := coreClient.Pods(namespace).List(ctx, metav1.ListOptions{
LabelSelector: labelSelector,
@@ -49,7 +53,7 @@ func gatherLogsFromPodsInNamespace(
LimitBytes: &limitBytes,
})

logs, err := filterLogs(ctx, request, messagesToSearch)
logs, err := filterLogs(ctx, request, messagesToSearch, regexSearch)
if err != nil {
return nil, err
}
@@ -70,7 +74,9 @@ func gatherLogsFromPodsInNamespace(
return records, nil
}

func filterLogs(ctx context.Context, request *restclient.Request, messagesToSearch []string) (string, error) {
func filterLogs(
ctx context.Context, request *restclient.Request, messagesToSearch []string, regexSearch bool,
) (string, error) {
stream, err := request.Stream(ctx)
if err != nil {
return "", err
@@ -90,8 +96,18 @@ func filterLogs(ctx context.Context, request *restclient.Request, messagesToSear
for scanner.Scan() {
line := scanner.Text()
for _, messageToSearch := range messagesToSearch {
if strings.Contains(strings.ToLower(line), strings.ToLower(messageToSearch)) {
result += line + "\n"
if regexSearch {
matches, err := regexp.MatchString(messageToSearch, line)
if err != nil {
return "", err
}
if matches {
result += line + "\n"
}
} else {
if strings.Contains(strings.ToLower(line), strings.ToLower(messageToSearch)) {
result += line + "\n"
}
}
}
}
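The new regexSearch flag switches the per-line filter from case-insensitive substring matching to regexp.MatchString. Below is a minimal, self-contained sketch of the same two code paths; the filterLine helper and the sample line are illustrative, not part of this change. Since regexp.MatchString recompiles the pattern for every line, precompiling the patterns once (e.g. with regexp.MustCompile) before the scanner loop would be a possible follow-up optimization; functionally the result is the same.

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// filterLine is an illustrative helper (not from this PR). It mirrors the
// branch added to filterLogs: match a line either as a case-insensitive
// literal substring or as a regular expression.
func filterLine(line, pattern string, regexSearch bool) (bool, error) {
	if regexSearch {
		return regexp.MatchString(pattern, line)
	}
	return strings.Contains(strings.ToLower(line), strings.ToLower(pattern)), nil
}

func main() {
	line := "W0120 20:14:12.749977 1 egressip.go:214] Node 10.0.32.3 is offline"

	// Substring mode: the pattern is a literal, matched case-insensitively.
	ok, _ := filterLine(line, "IS OFFLINE", false)
	fmt.Println(ok) // true

	// Regex mode: "%s" placeholders from the log format become ".+" patterns.
	ok, err := filterLine(line, "Node.+is offline", true)
	fmt.Println(ok, err) // true <nil>
}
```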
51 changes: 39 additions & 12 deletions pkg/gather/clusterconfig/gather_logs_test.go
@@ -11,31 +11,29 @@ import (
kubefake "k8s.io/client-go/kubernetes/fake"
)

func TestGatherLogs(t *testing.T) {
const testPodNamespace = "pod-namespace"
func testGatherLogs(t *testing.T, regexSearch bool, stringToSearch string, shouldExist bool) {
const testPodName = "test"
const testLogFileName = "errors"
// there's no way to specify logs fake pod will have, so we can only search for a hardcoded string "fake logs"
const stringToSearch = "fake logs"

coreClient := kubefake.NewSimpleClientset().CoreV1()
ctx := context.Background()

_, err := coreClient.Pods(testPodNamespace).Create(
_, err := coreClient.Pods(testPodName).Create(
ctx,
&corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: testPodNamespace,
Namespace: testPodNamespace,
Name: testPodName,
Namespace: testPodName,
},
Status: corev1.PodStatus{
Phase: corev1.PodRunning,
ContainerStatuses: []corev1.ContainerStatus{
{Name: testPodNamespace},
{Name: testPodName},
},
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{Name: testPodNamespace},
{Name: testPodName},
},
},
},
@@ -48,24 +46,53 @@ func TestGatherLogs(t *testing.T) {
records, err := gatherLogsFromPodsInNamespace(
ctx,
coreClient,
testPodNamespace,
testPodName,
[]string{
stringToSearch,
},
86400, // last day
1024*64, // maximum 64 kb of logs
testLogFileName,
"",
regexSearch,
)
if err != nil {
t.Fatal(err)
}

if !shouldExist {
assert.Len(t, records, 0)
return
}

assert.Len(t, records, 1)
assert.Equal(
t,
fmt.Sprintf("config/pod/%s/logs/%s/%s.log", testPodNamespace, testPodNamespace, testLogFileName),
fmt.Sprintf("config/pod/%s/logs/%s/%s.log", testPodName, testPodName, testLogFileName),
records[0].Name,
)
assert.Equal(t, Raw{stringToSearch + "\n"}, records[0].Item)
if regexSearch {
assert.Regexp(t, stringToSearch, records[0].Item)
} else {
assert.Equal(t, Raw{stringToSearch + "\n"}, records[0].Item)
}
}

func TestGatherLogs(t *testing.T) {
t.Run("SubstringSearch_ShouldExist", func(t *testing.T) {
testGatherLogs(t, false, "fake logs", true)
})
t.Run("SubstringSearch_ShouldNotExist", func(t *testing.T) {
testGatherLogs(t, false, "The quick brown fox jumps over the lazy dog", false)
})
t.Run("SubstringSearch_ShouldNotExist", func(t *testing.T) {
testGatherLogs(t, false, "f.*l", false)
})

t.Run("RegexSearch_ShouldExist", func(t *testing.T) {
testGatherLogs(t, true, "f.*l", true)
})
t.Run("RegexSearch_ShouldNotExist", func(t *testing.T) {
testGatherLogs(t, true, "[0-9]99", false)
})
}
@@ -36,6 +36,7 @@ func GatherOpenShiftAPIServerOperatorLogs(g *Gatherer, c chan<- gatherResult) {
1024*64, // maximum 64 kb of logs
"errors",
"app=openshift-apiserver-operator",
false,
)
if err != nil {
c <- gatherResult{nil, []error{err}}
57 changes: 57 additions & 0 deletions pkg/gather/clusterconfig/openshift_sdn_controller_logs.go
@@ -0,0 +1,57 @@
package clusterconfig

import (
"k8s.io/client-go/kubernetes"
)

// GatherOpenshiftSDNControllerLogs collects logs from the sdn-controller pod in the openshift-sdn namespace with the following substrings:
// - "Node %s is not Ready": A node has been set offline for egress IPs because it is reported not ready at the API.
// - "Node %s may be offline... retrying": An egress node has failed the egress IP health check once,
// so it is likely to be marked offline soon or, at the very least, there has been a connectivity glitch.
// - "Node %s is offline": An egress node has failed enough probes to be marked offline for egress IPs.
// If it has egress CIDRs assigned, its egress IPs have been moved to other nodes.
// This indicates an issue with either the node or the network between the master and the node.
// - "Node %s is back online": The node has recovered from the condition described by the previous message
// and is passing the egress IP health checks again.
// Useful in case the previous "Node %s is offline" messages were lost,
// so that we still have a clue that there was a failure.
//
// The Kubernetes API https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/core/v1/pod_expansion.go#L48
// Response see https://docs.openshift.com/container-platform/4.6/rest_api/workloads_apis/pod-core-v1.html#apiv1namespacesnamespacepodsnamelog
//
// Location in archive: config/pod/openshift-sdn/logs/{pod-name}/errors.log
func GatherOpenshiftSDNControllerLogs(g *Gatherer, c chan<- gatherResult) {
messagesToSearch := []string{
"Node.+is not Ready",
"Node.+may be offline\\.\\.\\. retrying",
"Node.+is offline",
"Node.+is back online",
}

gatherKubeClient, err := kubernetes.NewForConfig(g.gatherProtoKubeConfig)
if err != nil {
c <- gatherResult{nil, []error{err}}
return
}

coreClient := gatherKubeClient.CoreV1()

records, err := gatherLogsFromPodsInNamespace(
g.ctx,
coreClient,
"openshift-sdn",
messagesToSearch,
86400, // last day
1024*64, // maximum 64 kb of logs
"errors",
"app=sdn-controller",
true,
)
if err != nil {
c <- gatherResult{nil, []error{err}}
return
}

c <- gatherResult{records, nil}
return
}
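As a quick sanity check (a standalone sketch, not part of this change), the four patterns can be run against the lines from the sample errors.log added above:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// The same patterns used by GatherOpenshiftSDNControllerLogs.
	patterns := []*regexp.Regexp{
		regexp.MustCompile(`Node.+is not Ready`),
		regexp.MustCompile(`Node.+may be offline\.\.\. retrying`),
		regexp.MustCompile(`Node.+is offline`),
		regexp.MustCompile(`Node.+is back online`),
	}

	// Lines copied from the sample errors.log in this PR.
	lines := []string{
		"W0120 20:07:54.645321 1 egressip.go:199] Node ci-ln-0d402jb-f76d1-hts4b-worker-c-2nwdl is not Ready",
		"I0120 20:13:49.656538 1 egressip.go:219] Node 10.0.32.3 may be offline... retrying",
		"W0120 20:14:12.749977 1 egressip.go:214] Node 10.0.32.3 is offline",
		"I0120 20:15:29.663808 1 egressip.go:208] Node 10.0.32.3 is back online",
	}

	// Each sample line should be matched by exactly one of the patterns.
	for _, line := range lines {
		for _, p := range patterns {
			if p.MatchString(line) {
				fmt.Printf("%q matched: %s\n", p.String(), line)
				break
			}
		}
	}
}
```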
1 change: 1 addition & 0 deletions pkg/gather/clusterconfig/openshift_sdn_logs.go
@@ -40,6 +40,7 @@ func GatherOpenshiftSDNLogs(g *Gatherer, c chan<- gatherResult) {
1024*64, // maximum 64 kb of logs
"errors",
"app=sdn",
false,
)
if err != nil {
c <- gatherResult{nil, []error{err}}