Skip to content

Commit f10c2ec

Browse files
committed
Change the userspace proxy to wait for the node record
The proxy setup races with the node registration. The iptables proxy setup tries to read the node record to retrieve the address for the node. If the registration has not happened yet, the read fails, but the code proceeds despite the failure. This only shows up in testing when the unidler is used, because the IP address doesn't get used until the iptables rules are set to send the traffic to the userspace proxy; because the IP address is nil, the rules are rejected by iptables. The fix is to add a retry loop (with an exponential backoff, giving up after roughly two minutes) that keeps reading until the node record is present. The bug is masked when a network plugin is used, because the plugin typically needs the node record to be present and it runs first, so by the time the proxy is set up, there is a node record. Fixes bug 1519991 (https://bugzilla.redhat.com/show_bug.cgi?id=1519991)
1 parent 3133750 commit f10c2ec

File tree

1 file changed

+36
-11
lines changed

1 file changed

+36
-11
lines changed

pkg/cmd/server/kubernetes/network/network.go

+36-11
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"github.com/prometheus/client_golang/prometheus"
1111

1212
"k8s.io/api/core/v1"
13+
kapierrors "k8s.io/apimachinery/pkg/api/errors"
1314
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1415
"k8s.io/apimachinery/pkg/types"
1516
utilnet "k8s.io/apimachinery/pkg/util/net"
@@ -92,7 +93,11 @@ func (c *NetworkConfig) RunProxy() {
9293
case componentconfig.ProxyModeIPTables:
9394
glog.V(0).Info("Using iptables Proxier.")
9495
if bindAddr.Equal(net.IPv4zero) {
95-
bindAddr = getNodeIP(c.ExternalKubeClientset.CoreV1(), hostname)
96+
var err error
97+
bindAddr, err = getNodeIP(c.ExternalKubeClientset.CoreV1(), hostname)
98+
if err != nil {
99+
glog.Fatalf("Unable to get a bind address: %v", err)
100+
}
96101
}
97102
if c.ProxyConfig.IPTables.MasqueradeBit == nil {
98103
// IPTablesMasqueradeBit must be specified or defaulted.
@@ -232,17 +237,37 @@ func (c *NetworkConfig) RunProxy() {
232237
}
233238

234239
// getNodeIP is copied from the upstream proxy config to retrieve the IP of a node.
235-
func getNodeIP(client kv1core.CoreV1Interface, hostname string) net.IP {
236-
var nodeIP net.IP
237-
node, err := client.Nodes().Get(hostname, metav1.GetOptions{})
238-
if err != nil {
239-
glog.Warningf("Failed to retrieve node info: %v", err)
240-
return nil
240+
func getNodeIP(client kv1core.CoreV1Interface, hostname string) (net.IP, error) {
241+
var node *v1.Node
242+
var nodeErr error
243+
244+
// We may beat the thread that causes the node object to be created,
245+
// so if we can't get it, then we need to wait.
246+
// This will wait 0, 2, 4, 8, ... 64 seconds, for a total of ~2 mins
247+
nodeWaitBackoff := utilwait.Backoff{
248+
Duration: 2 * time.Second,
249+
Factor: 2,
250+
Steps: 7,
241251
}
242-
nodeIP, err = utilnode.GetNodeHostIP(node)
252+
utilwait.ExponentialBackoff(nodeWaitBackoff, func() (bool, error) {
253+
node, nodeErr = client.Nodes().Get(hostname, metav1.GetOptions{})
254+
if nodeErr == nil {
255+
return true, nil
256+
} else if kapierrors.IsNotFound(nodeErr) {
257+
glog.Warningf("waiting for node %q to be registered with master...", hostname)
258+
return false, nil
259+
} else {
260+
return false, nodeErr
261+
}
262+
})
263+
if nodeErr != nil {
264+
return nil, fmt.Errorf("failed to retrieve node info (after waiting): %v", nodeErr)
265+
}
266+
267+
nodeIP, err := utilnode.GetNodeHostIP(node)
243268
if err != nil {
244-
glog.Warningf("Failed to retrieve node IP: %v", err)
245-
return nil
269+
return nil, fmt.Errorf("failed to retrieve node IP: %v", err)
246270
}
247-
return nodeIP
271+
272+
return nodeIP, nil
248273
}

0 commit comments

Comments
 (0)