Skip to content

Commit a06094e

Browse files
committed
check log directory for restartCount
1 parent ac8c287 commit a06094e

File tree

2 files changed

+97
-0
lines changed

2 files changed

+97
-0
lines changed

pkg/kubelet/kuberuntime/kuberuntime_container.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,15 @@ import (
2121
"errors"
2222
"fmt"
2323
"io"
24+
"io/ioutil"
2425
"math/rand"
2526
"net/url"
2627
"os"
2728
"path/filepath"
29+
"regexp"
2830
goruntime "runtime"
2931
"sort"
32+
"strconv"
3033
"strings"
3134
"sync"
3235
"time"
@@ -127,6 +130,40 @@ func (s *startSpec) getTargetID(podStatus *kubecontainer.PodStatus) (*kubecontai
127130
return &targetStatus.ID, nil
128131
}
129132

133+
// restartCountLogFileRegex matches container log file names of the form
// "{restartCount}.log", optionally followed by a dot-separated suffix such as
// "0.log.rotated". The first capture group holds the restart count digits.
// The pattern is anchored and the dot is escaped so that unrelated names that
// merely contain "<digits><any char>log" are not mistaken for container logs.
var restartCountLogFileRegex = regexp.MustCompile(`^(\d+)\.log(\..*)?$`)

// calcRestartCountByLogDir inspects the log directory at path and returns the
// restart count the next container instance should use: one more than the
// highest "{restartCount}.log" (or "{restartCount}.log.<suffix>") file found.
//
// It returns (0, nil) when the directory does not exist or holds no matching
// log files. Any other failure to read the directory is returned as an error
// together with a count of 0.
func calcRestartCountByLogDir(path string) (int, error) {
	files, err := ioutil.ReadDir(path)
	if err != nil {
		// A missing log directory simply means the container never ran.
		if os.IsNotExist(err) {
			return 0, nil
		}
		return 0, err
	}

	restartCount := 0
	for _, file := range files {
		if file.IsDir() {
			continue
		}
		matches := restartCountLogFileRegex.FindStringSubmatch(file.Name())
		if matches == nil {
			continue
		}
		count, err := strconv.Atoi(matches[1])
		if err != nil {
			// The digits do not fit in an int, so this file was not written
			// for a real restart count; skip it rather than abandoning the
			// whole scan.
			continue
		}
		if next := count + 1; next > restartCount {
			restartCount = next
		}
	}
	return restartCount, nil
}
166+
130167
// startContainer starts a container and, on error, returns a message indicating why it failed.
131168
// It starts the container through the following steps:
132169
// * pull the image
@@ -150,6 +187,22 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb
150187
containerStatus := podStatus.FindContainerStatusByName(container.Name)
151188
if containerStatus != nil {
152189
restartCount = containerStatus.RestartCount + 1
190+
} else {
191+
// The container runtime keeps state on container statuses and
192+
// what the container restart count is. When nodes are rebooted
193+
// some container runtimes clear their state which causes the
194+
// restartCount to be reset to 0. This causes the logfile to
195+
// start at 0.log, which either overwrites or appends to the
196+
// already existing log.
197+
//
198+
// We are checking to see if the log directory exists, and find
199+
// the latest restartCount by checking the log name -
200+
// {restartCount}.log - and adding 1 to it.
201+
logDir := BuildContainerLogsDirectory(pod.Namespace, pod.Name, pod.UID, container.Name)
202+
restartCount, err = calcRestartCountByLogDir(logDir)
203+
if err != nil {
204+
klog.InfoS("Log directory exists but could not calculate restartCount", "logDir", logDir, "err", err)
205+
}
153206
}
154207

155208
target, err := spec.getTargetID(podStatus)

pkg/kubelet/kuberuntime/kuberuntime_container_test.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ package kuberuntime
1818

1919
import (
2020
"fmt"
21+
"io/ioutil"
22+
"os"
2123
"path/filepath"
2224
"regexp"
2325
"strings"
@@ -422,3 +424,45 @@ func TestStartSpec(t *testing.T) {
422424
})
423425
}
424426
}
427+
428+
func TestRestartCountByLogDir(t *testing.T) {
429+
for _, tc := range []struct {
430+
filenames []string
431+
restartCount int
432+
}{
433+
{
434+
filenames: []string{"0.log.rotated-log"},
435+
restartCount: 1,
436+
},
437+
{
438+
filenames: []string{"0.log"},
439+
restartCount: 1,
440+
},
441+
{
442+
filenames: []string{"0.log", "1.log", "2.log"},
443+
restartCount: 3,
444+
},
445+
{
446+
filenames: []string{"0.log.rotated", "1.log", "2.log"},
447+
restartCount: 3,
448+
},
449+
{
450+
filenames: []string{"5.log.rotated", "6.log.rotated"},
451+
restartCount: 7,
452+
},
453+
{
454+
filenames: []string{"5.log.rotated", "6.log", "7.log"},
455+
restartCount: 8,
456+
},
457+
} {
458+
tempDirPath, err := ioutil.TempDir("", "test-restart-count-")
459+
assert.NoError(t, err, "create tempdir error")
460+
defer os.RemoveAll(tempDirPath)
461+
for _, filename := range tc.filenames {
462+
err = ioutil.WriteFile(filepath.Join(tempDirPath, filename), []byte("a log line"), 0600)
463+
assert.NoError(t, err, "could not write log file")
464+
}
465+
count, _ := calcRestartCountByLogDir(tempDirPath)
466+
assert.Equal(t, count, tc.restartCount, "count %v should equal restartCount %v", count, tc.restartCount)
467+
}
468+
}

0 commit comments

Comments
 (0)