Skip to content

Commit 2f933b0

Browse files
Merge pull request #17564 from knobunc/fix/proxy-ip-node-race-idler
Automatic merge from submit-queue (batch tested with PRs 17734, 17550, 17647, 17761, 17564). Change the userspace proxy to wait for the node record. The userspace proxy setup races with the node registration. The userspace proxy setup tries to read the node record to retrieve the address for the node. If the registration has not happened yet, the read fails, but the code proceeds despite the failure. The fix is to add a retry loop (with a backoff) to keep reading until the node record is present. The bug is masked when a network plugin is used, because the plugin typically needs the node record to be present and it runs first, so by the time the proxy is set up, there is a node record. Fixes bug 1519991 (https://bugzilla.redhat.com/show_bug.cgi?id=1519991). @openshift/networking PTAL
2 parents c6d3fa6 + f10c2ec commit 2f933b0

File tree

1 file changed

+36
-11
lines changed

1 file changed

+36
-11
lines changed

pkg/cmd/server/kubernetes/network/network.go

+36-11
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"github.com/prometheus/client_golang/prometheus"
1111

1212
"k8s.io/api/core/v1"
13+
kapierrors "k8s.io/apimachinery/pkg/api/errors"
1314
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1415
"k8s.io/apimachinery/pkg/types"
1516
utilnet "k8s.io/apimachinery/pkg/util/net"
@@ -93,7 +94,11 @@ func (c *NetworkConfig) RunProxy() {
9394
case kubeproxyconfig.ProxyModeIPTables:
9495
glog.V(0).Info("Using iptables Proxier.")
9596
if bindAddr.Equal(net.IPv4zero) {
96-
bindAddr = getNodeIP(c.ExternalKubeClientset.CoreV1(), hostname)
97+
var err error
98+
bindAddr, err = getNodeIP(c.ExternalKubeClientset.CoreV1(), hostname)
99+
if err != nil {
100+
glog.Fatalf("Unable to get a bind address: %v", err)
101+
}
97102
}
98103
if c.ProxyConfig.IPTables.MasqueradeBit == nil {
99104
// IPTablesMasqueradeBit must be specified or defaulted.
@@ -233,17 +238,37 @@ func (c *NetworkConfig) RunProxy() {
233238
}
234239

235240
// getNodeIP is copied from the upstream proxy config to retrieve the IP of a node.
236-
func getNodeIP(client kv1core.CoreV1Interface, hostname string) net.IP {
237-
var nodeIP net.IP
238-
node, err := client.Nodes().Get(hostname, metav1.GetOptions{})
239-
if err != nil {
240-
glog.Warningf("Failed to retrieve node info: %v", err)
241-
return nil
241+
func getNodeIP(client kv1core.CoreV1Interface, hostname string) (net.IP, error) {
242+
var node *v1.Node
243+
var nodeErr error
244+
245+
// We may beat the thread that causes the node object to be created,
246+
// so if we can't get it, then we need to wait.
247+
// This will wait 0, 2, 4, 8, ... 64 seconds, for a total of ~2 mins
248+
nodeWaitBackoff := utilwait.Backoff{
249+
Duration: 2 * time.Second,
250+
Factor: 2,
251+
Steps: 7,
242252
}
243-
nodeIP, err = utilnode.GetNodeHostIP(node)
253+
utilwait.ExponentialBackoff(nodeWaitBackoff, func() (bool, error) {
254+
node, nodeErr = client.Nodes().Get(hostname, metav1.GetOptions{})
255+
if nodeErr == nil {
256+
return true, nil
257+
} else if kapierrors.IsNotFound(nodeErr) {
258+
glog.Warningf("waiting for node %q to be registered with master...", hostname)
259+
return false, nil
260+
} else {
261+
return false, nodeErr
262+
}
263+
})
264+
if nodeErr != nil {
265+
return nil, fmt.Errorf("failed to retrieve node info (after waiting): %v", nodeErr)
266+
}
267+
268+
nodeIP, err := utilnode.GetNodeHostIP(node)
244269
if err != nil {
245-
glog.Warningf("Failed to retrieve node IP: %v", err)
246-
return nil
270+
return nil, fmt.Errorf("failed to retrieve node IP: %v", err)
247271
}
248-
return nodeIP
272+
273+
return nodeIP, nil
249274
}

0 commit comments

Comments
 (0)