Skip to content

Commit 3ce717f

Browse files
author
Craig Furman
authored
validate subcommand: various changes (#1088)
Remove "connections" validation. This checked that various services were TCP-reachable (and resolvable by kube-dns) from various pods. For example, it checked that pgsql was reachable from frontend. This particular validation wouldn't always be useful, e.g. when external postgres clusters were used. It was also failing due to the removal of `nc` from base images. This upstream health-checking _could_ be replaced by doing this check in the readiness probe handler. For example: frontend pods could check the availability of a pgsql connection, and fail their readiness probe if that is unavailable. This is a contentious topic, however, and should be approached with caution: mean-time-to-recovery can actually substantially increase under some configurations due to cascading failing readiness probes. This commit conservatively aims to restore the utility of src-validate by removing the always-failing connections check, and there are no immediate plans to add upstream readiness probing. This commit also changes src-validate to exit with non-zero status when there are no pods and/or services. This white-box check does have some utility, as a sanity check that we've deployed anything at all. The "no PVCs" case is left as a warning, in case the user is using external DBs everywhere, and might legitimately have no PVCs in their namespace. This closes https://linear.app/sourcegraph/issue/REL-42/src-validate-should-not-exit-with-zero-when-no-pods-are-available-at and https://linear.app/sourcegraph/issue/REL-40/fix-src-validate-now-that-nc-is-not-available, assuming we agree with https://linear.app/sourcegraph/issue/REL-40/fix-src-validate-now-that-nc-is-not-available#comment-7b40beee. It will be followed up with an implementation of https://linear.app/sourcegraph/issue/REL-43/src-validate-should-perform-black-box-validations-of-core-flows.
1 parent d16db0a commit 3ce717f

File tree

2 files changed

+8
-137
lines changed

2 files changed

+8
-137
lines changed

CHANGELOG.md

+4
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,10 @@ All notable changes to `src-cli` are documented in this file.
1717

1818
### Fixed
1919

20+
- validate kube: connections check removed.
21+
- validate kube: exits non-zero when there are no pods or services in the target
22+
namespace.
23+
2024
### Removed
2125

2226
## 5.4.0

internal/validate/kube/kube.go

+4-137
Original file line number | Diff line number | Diff line change
@@ -1,25 +1,21 @@
11
package kube
22

33
import (
4-
"bytes"
54
"context"
65
"fmt"
76
"io"
87
"log"
98
"os"
109
"path/filepath"
1110
"reflect"
12-
"regexp"
1311
"strings"
1412

1513
corev1 "k8s.io/api/core/v1"
1614
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1715
"k8s.io/client-go/kubernetes"
18-
"k8s.io/client-go/kubernetes/scheme"
1916
"k8s.io/client-go/rest"
2017
"k8s.io/client-go/tools/clientcmd"
2118

22-
"k8s.io/client-go/tools/remotecommand"
2319
"k8s.io/client-go/util/homedir"
2420

2521
"github.com/aws/aws-sdk-go-v2/service/ec2"
@@ -30,12 +26,6 @@ import (
3026
"github.com/sourcegraph/sourcegraph/lib/errors"
3127
)
3228

33-
var (
34-
sourcegraphFrontend = regexp.MustCompile(`^sourcegraph-frontend-.*`)
35-
sourcegraphRepoUpdater = regexp.MustCompile(`^repo-updater-.*`)
36-
sourcegraphWorker = regexp.MustCompile(`^worker-.*`)
37-
)
38-
3929
type Option = func(config *Config)
4030

4131
type Config struct {
@@ -95,7 +85,6 @@ func Validate(ctx context.Context, clientSet *kubernetes.Clientset, restConfig *
9585
{Pods, "validating pods", "pods validated", "validating pods failed"},
9686
{Services, "validating services", "services validated", "validating services failed"},
9787
{PVCs, "validating pvcs", "pvcs validated", "validating pvcs failed"},
98-
{Connections, "validating connections", "connections validated", "validating connections failed"},
9988
}
10089

10190
if cfg.eks {
@@ -213,9 +202,9 @@ func Pods(ctx context.Context, config *Config) ([]validate.Result, error) {
213202

214203
if len(pods.Items) == 0 {
215204
results = append(results, validate.Result{
216-
Status: validate.Warning,
205+
Status: validate.Failure,
217206
Message: fmt.Sprintf(
218-
"No pods exist on namespace '%s'. check namespace/cluster",
207+
"no pods exist in namespace '%s'. check namespace/cluster",
219208
config.namespace,
220209
),
221210
})
@@ -305,9 +294,9 @@ func Services(ctx context.Context, config *Config) ([]validate.Result, error) {
305294

306295
if len(services.Items) <= 1 {
307296
results = append(results, validate.Result{
308-
Status: validate.Warning,
297+
Status: validate.Failure,
309298
Message: fmt.Sprintf(
310-
"unexpected number of services on namespace '%s'; check namespace/cluster",
299+
"no services in namespace '%s'; check namespace/cluster",
311300
config.namespace,
312301
),
313302
})
@@ -384,128 +373,6 @@ func validatePVC(pvc *corev1.PersistentVolumeClaim) []validate.Result {
384373
return results
385374
}
386375

387-
type connection struct {
388-
src corev1.Pod
389-
dest []dest
390-
}
391-
392-
type dest struct {
393-
addr string
394-
port string
395-
}
396-
397-
// Connections will validate that Sourcegraph services can reach each other over the network.
398-
func Connections(ctx context.Context, config *Config) ([]validate.Result, error) {
399-
var results []validate.Result
400-
var connections []connection
401-
402-
pods, err := config.clientSet.CoreV1().Pods(config.namespace).List(ctx, metav1.ListOptions{})
403-
if err != nil {
404-
return nil, err
405-
}
406-
407-
if len(pods.Items) == 0 {
408-
results = append(results, validate.Result{
409-
Status: validate.Warning,
410-
Message: fmt.Sprintf(
411-
"cannot check connections: zero pods exist in namespace '%s'",
412-
config.namespace,
413-
),
414-
})
415-
416-
return results, nil
417-
}
418-
419-
// iterate through pods looking for specific pod name prefixes, then construct
420-
// a relationship map between pods that should have connectivity with each other
421-
for _, pod := range pods.Items {
422-
switch name := pod.Name; {
423-
case sourcegraphFrontend.MatchString(name): // pod is one of the sourcegraph front-end pods
424-
connections = append(connections, connection{
425-
src: pod,
426-
dest: []dest{
427-
{
428-
addr: "pgsql",
429-
port: "5432",
430-
},
431-
{
432-
addr: "indexed-search",
433-
port: "6070",
434-
},
435-
{
436-
addr: "repo-updater",
437-
port: "3182",
438-
},
439-
{
440-
addr: "syntect-server",
441-
port: "9238",
442-
},
443-
},
444-
})
445-
case sourcegraphWorker.MatchString(name): // pod is a worker pod
446-
connections = append(connections, connection{
447-
src: pod,
448-
dest: []dest{
449-
{
450-
addr: "pgsql",
451-
port: "5432",
452-
},
453-
},
454-
})
455-
case sourcegraphRepoUpdater.MatchString(name):
456-
connections = append(connections, connection{
457-
src: pod,
458-
dest: []dest{
459-
{
460-
addr: "pgsql",
461-
port: "5432",
462-
},
463-
},
464-
})
465-
}
466-
}
467-
468-
// use network relationships constructed above to test network connection for each relationship
469-
for _, c := range connections {
470-
for _, d := range c.dest {
471-
req := config.clientSet.CoreV1().RESTClient().Post().
472-
Resource("pods").
473-
Name(c.src.Name).
474-
Namespace(c.src.Namespace).
475-
SubResource("exec")
476-
477-
req.VersionedParams(&corev1.PodExecOptions{
478-
Command: []string{"/usr/bin/nc", "-z", d.addr, d.port},
479-
Stdin: false,
480-
Stdout: true,
481-
Stderr: true,
482-
TTY: false,
483-
}, scheme.ParameterCodec)
484-
485-
exec, err := remotecommand.NewSPDYExecutor(config.restConfig, "POST", req.URL())
486-
if err != nil {
487-
return nil, err
488-
}
489-
490-
var stdout, stderr bytes.Buffer
491-
492-
err = exec.StreamWithContext(ctx, remotecommand.StreamOptions{
493-
Stdout: &stdout,
494-
Stderr: &stderr,
495-
})
496-
if err != nil {
497-
return nil, errors.Wrapf(err, "connecting to %s", c.src.Name)
498-
}
499-
500-
if stderr.String() != "" {
501-
results = append(results, validate.Result{Status: validate.Failure, Message: stderr.String()})
502-
}
503-
}
504-
}
505-
506-
return results, nil
507-
}
508-
509376
func CurrentContextSetTo(clusterService string) error {
510377
currentContext, err := GetCurrentContext()
511378
if err != nil {

0 commit comments

Comments
 (0)