Skip to content

Commit 10dbfef

Browse files
committed
add custom problem detector plugin
1 parent ffc7909 commit 10dbfef

31 files changed

+1058
-76
lines changed

README.md

+14-5
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ enabled by default in the GCE cluster.
1414
# Background
1515
There are tons of node problems that could possibly affect the pods running on the
1616
node such as:
17-
* Hardware issues: Bad cpu, memory or disk;
17+
* Infrastructure daemon issues: ntp service down;
18+
* Hardware issues: Bad cpu, memory or disk;
1819
* Kernel issues: Kernel deadlock, corrupted file system;
1920
* Container runtime issues: Unresponsive runtime daemon;
2021
* ...
@@ -53,23 +54,30 @@ List of supported problem daemons:
5354
|----------------|:---------------:|:------------|
5455
| [KernelMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json) | KernelDeadlock | A system log monitor monitors kernel log and reports problem according to predefined rules. |
5556
| [AbrtAdaptor](https://github.com/kubernetes/node-problem-detector/blob/master/config/abrt-adaptor.json) | None | Monitor ABRT log messages and report them further. ABRT (Automatic Bug Report Tool) is health monitoring daemon able to catch kernel problems as well as application crashes of various kinds occurred on the host. For more information visit the [link](https://github.com/abrt). |
57+
| [CustomPluginMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/custom-plugin-monitor.json) | On-demand (according to the user's configuration) | A custom plugin monitor for node-problem-detector to invoke and check various node problems with user-defined check scripts. See proposal [here](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#). |
5658

5759
# Usage
5860
## Flags
5961
* `--version`: Print current version of node-problem-detector.
62+
* `--address`: The address to bind the node problem detector server.
63+
* `--port`: The port to bind the node problem detector server. Use 0 to disable.
6064
* `--system-log-monitors`: List of paths to system log monitor configuration files, comma separated, e.g.
6165
[config/kernel-monitor.json](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json).
6266
Node problem detector will start a separate log monitor for each configuration. You can
6367
use different log monitors to monitor different system log.
68+
* `--custom-plugin-monitors`: List of paths to custom plugin monitor config files, comma separated, e.g.
69+
[config/custom-plugin-monitor.json](https://github.com/kubernetes/node-problem-detector/blob/master/config/custom-plugin-monitor.json).
70+
Node problem detector will start a separate custom plugin monitor for each configuration. You can
71+
use different custom plugin monitors to monitor different node problems.
6472
* `--apiserver-override`: A URI parameter used to customize how node-problem-detector
6573
connects to the apiserver. The format is the same as the
6674
[`source`](https://github.com/kubernetes/heapster/blob/master/docs/source-configuration.md#kubernetes)
6775
flag of [Heapster](https://github.com/kubernetes/heapster).
6876
For example, to run without auth, use the following config:
69-
```
70-
http://APISERVER_IP:APISERVER_PORT?inClusterConfig=false
71-
```
72-
Refer [heapster docs](https://github.com/kubernetes/heapster/blob/master/docs/source-configuration.md#kubernetes) for a complete list of available options.
77+
```
78+
http://APISERVER_IP:APISERVER_PORT?inClusterConfig=false
79+
```
80+
Refer [heapster docs](https://github.com/kubernetes/heapster/blob/master/docs/source-configuration.md#kubernetes) for a complete list of available options.
7381
* `--hostname-override`: A customized node name used for node-problem-detector to update conditions and emit events. node-problem-detector gets the node name first from `hostname-override`, then from the `NODE_NAME` environment variable, and finally falls back to `os.Hostname`.
7482

7583
## Build Image
@@ -138,4 +146,5 @@ For more scenarios, see [here](https://github.com/kubernetes/heapster/blob/maste
138146
# Links
139147
* [Design Doc](https://docs.google.com/document/d/1cs1kqLziG-Ww145yN6vvlKguPbQQ0psrSBnEqpy0pzE/edit?usp=sharing)
140148
* [Slides](https://docs.google.com/presentation/d/1bkJibjwWXy8YnB5fna6p-Ltiy-N5p01zUsA22wCNkXA/edit?usp=sharing)
149+
* [Plugin Interface Proposal](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#)
141150
* [Addon Manifest](https://github.com/kubernetes/kubernetes/tree/master/cluster/addons/node-problem-detector)

cmd/node_problem_detector.go

+14-3
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,11 @@ import (
2727
"github.com/spf13/pflag"
2828

2929
"k8s.io/node-problem-detector/cmd/options"
30+
"k8s.io/node-problem-detector/pkg/custompluginmonitor"
3031
"k8s.io/node-problem-detector/pkg/problemclient"
3132
"k8s.io/node-problem-detector/pkg/problemdetector"
3233
"k8s.io/node-problem-detector/pkg/systemlogmonitor"
34+
"k8s.io/node-problem-detector/pkg/types"
3335
"k8s.io/node-problem-detector/pkg/version"
3436
)
3537

@@ -67,15 +69,24 @@ func main() {
6769
os.Exit(0)
6870
}
6971

70-
monitors := make(map[string]systemlogmonitor.LogMonitor)
72+
monitors := make(map[string]types.Monitor)
7173
for _, config := range npdo.SystemLogMonitorConfigPaths {
7274
if _, ok := monitors[config]; ok {
73-
// Skip the config if it's duplictaed.
74-
glog.Warningf("Duplicated log monitor configuration %q", config)
75+
// Skip the config if it's duplicated.
76+
glog.Warningf("Duplicated monitor configuration %q", config)
7577
continue
7678
}
7779
monitors[config] = systemlogmonitor.NewLogMonitorOrDie(config)
7880
}
81+
82+
for _, config := range npdo.CustomPluginMonitorConfigPaths {
83+
if _, ok := monitors[config]; ok {
84+
// Skip the config if it's duplicated.
85+
glog.Warningf("Duplicated monitor configuration %q", config)
86+
continue
87+
}
88+
monitors[config] = custompluginmonitor.NewCustomPluginMonitorOrDie(config)
89+
}
7990
c := problemclient.NewClientOrDie(npdo)
8091
p := problemdetector.NewProblemDetector(monitors, c)
8192

cmd/options/options.go

+5
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ type NodeProblemDetectorOptions struct {
3333
// SystemLogMonitorConfigPaths specifies the list of paths to system log monitor configuration
3434
// files.
3535
SystemLogMonitorConfigPaths []string
36+
// CustomPluginMonitorConfigPaths specifies the list of paths to custom plugin monitor configuration
37+
// files.
38+
CustomPluginMonitorConfigPaths []string
3639
// ApiServerOverride is the custom URI used to connect to Kubernetes ApiServer.
3740
ApiServerOverride string
3841
// PrintVersion is the flag determining whether version information is printed.
@@ -58,6 +61,8 @@ func NewNodeProblemDetectorOptions() *NodeProblemDetectorOptions {
5861
func (npdo *NodeProblemDetectorOptions) AddFlags(fs *pflag.FlagSet) {
5962
fs.StringSliceVar(&npdo.SystemLogMonitorConfigPaths, "system-log-monitors",
6063
[]string{}, "List of paths to system log monitor config files, comma separated.")
64+
fs.StringSliceVar(&npdo.CustomPluginMonitorConfigPaths, "custom-plugin-monitors",
65+
[]string{}, "List of paths to custom plugin monitor config files, comma separated.")
6166
fs.StringVar(&npdo.ApiServerOverride, "apiserver-override",
6267
"", "Custom URI used to connect to Kubernetes ApiServer")
6368
fs.BoolVar(&npdo.PrintVersion, "version", false, "Print version information and quit")

config/custom-plugin-monitor.json

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"plugin": "custom",
3+
"pluginConfig": {
4+
"invoke_interval": "30s",
5+
"timeout": "5s",
6+
"max_output_length": 80,
7+
"concurrency": 3
8+
},
9+
"source": "ntp-custom-plugin-monitor",
10+
"conditions": [
11+
{
12+
"type": "NTPProblem",
13+
"reason": "NTPIsUp",
14+
"message": "ntp service is up"
15+
}
16+
],
17+
"rules": [
18+
{
19+
"type": "temporary",
20+
"reason": "NTPIsDown",
21+
"path": "./config/plugin/check_ntp.sh",
22+
"timeout": "3s"
23+
},
24+
{
25+
"type": "permanent",
26+
"condition": "NTPProblem",
27+
"reason": "NTPIsDown",
28+
"path": "./config/plugin/check_ntp.sh",
29+
"timeout": "3s"
30+
}
31+
]
32+
}

config/plugin/check_ntp.sh

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash

# Example node problem plugin: reports whether the NTP service is running.
# It relies on systemd (systemctl) to query ntp.service. This is only a
# sample; write your own plugin for other setups as needed.

# Exit codes reported back to the caller.
OK=0
NONOK=1
UNKNOWN=2

# Without systemctl the state of the service cannot be determined.
if ! which systemctl >/dev/null; then
    echo "Systemd is not supported"
    exit $UNKNOWN
fi

# The service is treated as healthy only when the "Active:" line of its
# status output mentions "running".
if ! systemctl status ntp.service | grep 'Active:' | grep -q running; then
    echo "NTP service is not running"
    exit $NONOK
fi

echo "NTP service is running"
exit $OK

pkg/custompluginmonitor/README.md

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Custom Plugin Monitor
2+
3+
Custom plugin monitor is a plugin mechanism for node-problem-detector. It will
4+
extend node-problem-detector to execute any monitor scripts written in any language.
5+
The monitor scripts must conform to the plugin protocol in exit code and standard
6+
output. For more info about the plugin protocol, please refer to the
7+
[node-problem-detector plugin interface proposal](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
/*
2+
Copyright 2017 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package custompluginmonitor
18+
19+
import (
20+
"encoding/json"
21+
"io/ioutil"
22+
"time"
23+
24+
"github.com/golang/glog"
25+
26+
"k8s.io/node-problem-detector/pkg/custompluginmonitor/plugin"
27+
cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
28+
"k8s.io/node-problem-detector/pkg/types"
29+
"k8s.io/node-problem-detector/pkg/util/tomb"
30+
)
31+
32+
type customPluginMonitor struct {
33+
config cpmtypes.CustomPluginConfig
34+
conditions []types.Condition
35+
plugin *plugin.Plugin
36+
resultChan <-chan cpmtypes.Result
37+
statusChan chan *types.Status
38+
tomb *tomb.Tomb
39+
}
40+
41+
// NewCustomPluginMonitorOrDie create a new customPluginMonitor, panic if error occurs.
42+
func NewCustomPluginMonitorOrDie(configPath string) types.Monitor {
43+
c := &customPluginMonitor{
44+
tomb: tomb.NewTomb(),
45+
}
46+
f, err := ioutil.ReadFile(configPath)
47+
if err != nil {
48+
glog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
49+
}
50+
err = json.Unmarshal(f, &c.config)
51+
if err != nil {
52+
glog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
53+
}
54+
// Apply configurations
55+
err = (&c.config).ApplyConfiguration()
56+
if err != nil {
57+
glog.Fatalf("Failed to apply configuration for %q: %v", configPath, err)
58+
}
59+
60+
// Validate configurations
61+
err = c.config.Validate()
62+
if err != nil {
63+
glog.Fatalf("Failed to validate custom plugin config %+v: %v", c.config, err)
64+
}
65+
66+
glog.Infof("Finish parsing custom plugin monitor config file: %+v", c.config)
67+
68+
c.plugin = plugin.NewPlugin(c.config)
69+
// A 1000 size channel should be big enough.
70+
c.statusChan = make(chan *types.Status, 1000)
71+
return c
72+
}
73+
74+
func (c *customPluginMonitor) Start() (<-chan *types.Status, error) {
75+
glog.Info("Start custom plugin monitor")
76+
go c.plugin.Run()
77+
go c.monitorLoop()
78+
return c.statusChan, nil
79+
}
80+
81+
func (c *customPluginMonitor) Stop() {
82+
glog.Info("Stop custom plugin monitor")
83+
c.tomb.Stop()
84+
}
85+
86+
// monitorLoop is the main loop of log monitor.
87+
func (c *customPluginMonitor) monitorLoop() {
88+
c.initializeStatus()
89+
90+
resultChan := c.plugin.GetResultChan()
91+
92+
for {
93+
select {
94+
case result := <-resultChan:
95+
glog.V(3).Infof("Receive new plugin result: %+v", result)
96+
status := c.generateStatus(result)
97+
glog.Infof("New status generated: %+v", status)
98+
c.statusChan <- status
99+
case <-c.tomb.Stopping():
100+
c.plugin.Stop()
101+
glog.Infof("Custom plugin monitor stopped")
102+
c.tomb.Done()
103+
break
104+
}
105+
}
106+
}
107+
108+
// generateStatus generates status from the plugin check result.
109+
func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Status {
110+
timestamp := time.Now()
111+
var events []types.Event
112+
if result.Rule.Type == types.Temp {
113+
// For temporary error only generate event when exit status is above warning
114+
if result.ExitStatus >= cpmtypes.NonOK {
115+
events = append(events, types.Event{
116+
Severity: types.Warn,
117+
Timestamp: timestamp,
118+
Reason: result.Rule.Reason,
119+
Message: result.Message,
120+
})
121+
}
122+
} else {
123+
// For permanent error changes the condition
124+
for i := range c.conditions {
125+
condition := &c.conditions[i]
126+
if condition.Type == result.Rule.Condition {
127+
status := result.ExitStatus >= cpmtypes.NonOK
128+
if condition.Status != status || condition.Reason != result.Rule.Reason {
129+
condition.Transition = timestamp
130+
condition.Message = result.Message
131+
}
132+
condition.Status = status
133+
condition.Reason = result.Rule.Reason
134+
break
135+
}
136+
}
137+
}
138+
return &types.Status{
139+
Source: c.config.Source,
140+
// TODO(random-liu): Aggregate events and conditions and then do periodically report.
141+
Events: events,
142+
Conditions: c.conditions,
143+
}
144+
}
145+
146+
// initializeStatus initializes the internal condition and also reports it to the node problem detector.
147+
func (c *customPluginMonitor) initializeStatus() {
148+
// Initialize the default node conditions
149+
c.conditions = initialConditions(c.config.DefaultConditions)
150+
glog.Infof("Initialize condition generated: %+v", c.conditions)
151+
// Update the initial status
152+
c.statusChan <- &types.Status{
153+
Source: c.config.Source,
154+
Conditions: c.conditions,
155+
}
156+
}
157+
158+
func initialConditions(defaults []types.Condition) []types.Condition {
159+
conditions := make([]types.Condition, len(defaults))
160+
copy(conditions, defaults)
161+
for i := range conditions {
162+
// TODO(random-liu): Validate default conditions
163+
conditions[i].Status = false
164+
conditions[i].Transition = time.Now()
165+
}
166+
return conditions
167+
}

0 commit comments

Comments
 (0)