From e615ba3c6fabba89390cd5eff2828fffbc2e9fec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20L=C3=BCders?= Date: Mon, 17 May 2021 10:40:17 +0200 Subject: [PATCH 1/2] Add GatherMachineHealthCheck This adds the machinehealthcheck gathering with dynamic client. Documentation and testing --- docs/gathered-data.md | 121 ++++++++++++++---- .../machine-api-termination-handler.json | 73 +++++++++++ .../clusterconfig/clusterconfig_gatherer.go | 1 + .../clusterconfig/machine_healthcheck.go | 60 +++++++++ .../clusterconfig/machine_healthcheck_test.go | 52 ++++++++ 5 files changed, 282 insertions(+), 25 deletions(-) create mode 100644 docs/insights-archive-sample/config/machinehealthchecks/openshift-machine-api/machine-api-termination-handler.json create mode 100644 pkg/gatherers/clusterconfig/machine_healthcheck.go create mode 100644 pkg/gatherers/clusterconfig/machine_healthcheck_test.go diff --git a/docs/gathered-data.md b/docs/gathered-data.md index fb6543aea..b0c3af2b1 100644 --- a/docs/gathered-data.md +++ b/docs/gathered-data.md @@ -1,5 +1,9 @@ This document is auto-generated by `make gen-doc` +## 3Records + + + ## CRD collects the specified Custom Resource Definitions. @@ -19,8 +23,10 @@ The CRD sizes above are in the raw (uncompressed) state. collects anonymized CertificateSigningRequests. Collects CSRs which werent Verified, or when Now < ValidBefore or Now > ValidAfter -The Kubernetes api https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/certificates/v1beta1/certificatesigningrequest.go#L78 -Response see https://docs.openshift.com/container-platform/4.3/rest_api/index.html#certificatesigningrequestlist-v1beta1certificates +The Kubernetes api: + https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/certificates/v1beta1/certificatesigningrequest.go#L78 +Response see: + https://docs.openshift.com/container-platform/4.3/rest_api/index.html#certificatesigningrequestlist-v1beta1certificates * Location in archive: config/certificatesigningrequests/ * Id in config: certificate_signing_requests @@ -130,16 +136,24 @@ Response see https://docs.openshift.com/container-platform/4.3/rest_api/index.ht ## ClusterOperatorPodsAndEvents -collects all the ClusterOperators degraded Pods -for degraded cluster operators or that lives at the Cluster Operator's namespace, to collect: +collects information about all pods +and events from namespaces of degraded cluster operators. The collected +information includes: - Pod definitions -- Previous and current Pod Container logs (when available) -- Namespace Events - -* Location of pods in archive: config/pod/ +- Previous and current logs of pod containers (when available) +- Namespace events + +* Location of pod definitions: config/pod/{namespace}/{pod}.json +* Location of pod container current logs: + config/pod/{namespace}/logs/{pod}/{container}_current.log +* Location of pod container previous logs: + config/pod/{namespace}/logs/{pod}/{container}_previous.log * Location of events in archive: events/ * Id in config: operators_pods_and_events +* Spec config for CO resources since versions: + * 4.6.16+ + * 4.7+ ## ClusterOperators @@ -227,8 +241,10 @@ Specifically, the age of pods, the set of running images and the container names collects ContainerRuntimeConfig information -The Kubernetes api https://github.com/openshift/machine-config-operator/blob/master/pkg/apis/machineconfiguration.openshift.io/v1/types.go#L402 -Response see https://docs.okd.io/latest/rest_api/machine_apis/containerruntimeconfig-machineconfiguration-openshift-io-v1.html +The Kubernetes api: + https://github.com/openshift/machine-config-operator/blob/master/pkg/apis/machineconfiguration.openshift.io/v1/types.go#L402 +Response see: + https://docs.okd.io/latest/rest_api/machine_apis/containerruntimeconfig-machineconfiguration-openshift-io-v1.html * Location in archive: config/containerruntimeconfigs/ * Id in config: container_runtime_configs @@ -237,6 +253,10 @@ Response see https://docs.okd.io/latest/rest_api/machine_apis/containerruntimeco * 4.7+ +## Errors + + + ## HostSubnet collects HostSubnet information @@ -273,8 +293,10 @@ The Operators-Framework api https://github.com/operator-framework/api/blob/maste collects MachineConfigPool information -The Kubernetes api https://github.com/openshift/machine-config-operator/blob/master/pkg/apis/machineconfiguration.openshift.io/v1/types.go#L197 -Response see https://docs.okd.io/latest/rest_api/machine_apis/machineconfigpool-machineconfiguration-openshift-io-v1.html +The Kubernetes api: + https://github.com/openshift/machine-config-operator/blob/master/pkg/apis/machineconfiguration.openshift.io/v1/types.go#L197 +Response see: + https://docs.okd.io/latest/rest_api/machine_apis/machineconfigpool-machineconfiguration-openshift-io-v1.html * Location in archive: config/machineconfigpools/ * Id in config: machine_config_pools @@ -283,12 +305,31 @@ Response see https://docs.okd.io/latest/rest_api/machine_apis/machineconfigpool- * 4.6+ +## MachineHealthCheck + +collects MachineHealthCheck information + +The Kubernetes api: + https://github.com/openshift/machine-api-operator/blob/master/pkg/generated/clientset/versioned/typed/machine/v1beta1/machinehealthcheck.go +Response see: + https://docs.openshift.com/container-platform/4.3/rest_api/index.html#machinehealthcheck-v1beta1-machine-openshift-io + +* Location in archive: config/machinehealthchecks +* Id in config: machine_healthchecks +* Since versions: + * 4.4.29+ + * 4.5.15+ + * 4.6+ + + ## MachineSet collects MachineSet information -The Kubernetes api https://github.com/openshift/machine-api-operator/blob/master/pkg/generated/clientset/versioned/typed/machine/v1beta1/machineset.go -Response see https://docs.openshift.com/container-platform/4.3/rest_api/index.html#machineset-v1beta1-machine-openshift-io +The Kubernetes api: + https://github.com/openshift/machine-api-operator/blob/master/pkg/generated/clientset/versioned/typed/machine/v1beta1/machineset.go +Response see: + https://docs.openshift.com/container-platform/4.3/rest_api/index.html#machineset-v1beta1-machine-openshift-io * Location in archive: machinesets/ * Id in config: machine_sets @@ -304,7 +345,7 @@ gathers cluster Federated Monitoring metrics. The GET REST query to URL /federate Gathered metrics: - virt_platform + virt_platform etcd_object_counts cluster_installer vsphere_node_hw_version_total @@ -316,6 +357,10 @@ Gathered metrics: * Id in config: metrics +## Name + + + ## NetNamespace collects NetNamespaces networking information @@ -362,8 +407,10 @@ collects logs from openshift-apiserver-operator with following substrings: - "the server has received too many requests and has asked us" - "because serving request timed out and response had been started" -The Kubernetes API https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/core/v1/pod_expansion.go#L48 -Response see https://docs.openshift.com/container-platform/4.6/rest_api/workloads_apis/pod-core-v1.html#apiv1namespacesnamespacepodsnamelog +The Kubernetes API: + https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/core/v1/pod_expansion.go#L48 +Response see: + https://docs.openshift.com/container-platform/4.6/rest_api/workloads_apis/pod-core-v1.html#apiv1namespacesnamespacepodsnamelog * Location in archive: config/pod/{namespace-name}/logs/{pod-name}/errors.log @@ -373,8 +420,10 @@ Response see https://docs.openshift.com/container-platform/4.6/rest_api/workload collects logs from pods in openshift-authentication namespace with following substring: - "AuthenticationError: invalid resource name" -The Kubernetes API https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/core/v1/pod_expansion.go#L48 -Response see https://docs.openshift.com/container-platform/4.6/rest_api/workloads_apis/pod-core-v1.html#apiv1namespacesnamespacepodsnamelog +The Kubernetes API: + https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/core/v1/pod_expansion.go#L48 +Response see: + https://docs.openshift.com/container-platform/4.6/rest_api/workloads_apis/pod-core-v1.html#apiv1namespacesnamespacepodsnamelog * Location in archive: config/pod/openshift-authentication/logs/{pod-name}/errors.log * Since versions: @@ -395,8 +444,10 @@ collects logs from sdn-controller pod in openshift-sdn namespace with following Useful just in case that previous “Node %s is offline” messages are lost, so that we have a clue that there was failure previously. -The Kubernetes API https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/core/v1/pod_expansion.go#L48 -Response see https://docs.openshift.com/container-platform/4.6/rest_api/workloads_apis/pod-core-v1.html#apiv1namespacesnamespacepodsnamelog +The Kubernetes API: + https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/core/v1/pod_expansion.go#L48 +Response see: + https://docs.openshift.com/container-platform/4.6/rest_api/workloads_apis/pod-core-v1.html#apiv1namespacesnamespacepodsnamelog * Location in archive: config/pod/openshift-sdn/logs/{pod-name}/errors.log * Since versions: @@ -412,8 +463,10 @@ collects logs from pods in openshift-sdn namespace with following substrings: - "Unable to update proxy firewall for policy", - "Failed to update proxy firewall for policy", -The Kubernetes API https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/core/v1/pod_expansion.go#L48 -Response see https://docs.openshift.com/container-platform/4.6/rest_api/workloads_apis/pod-core-v1.html#apiv1namespacesnamespacepodsnamelog +The Kubernetes API: + https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/core/v1/pod_expansion.go#L48 +Response see: + https://docs.openshift.com/container-platform/4.6/rest_api/workloads_apis/pod-core-v1.html#apiv1namespacesnamespacepodsnamelog * Location in archive: config/pod/openshift-sdn/logs/{pod-name}/errors.log * Since versions: @@ -437,6 +490,14 @@ Docs for relevant types: https://pkg.go.dev/github.com/openshift/api/operatorcon * 4.8+ +## Panic + + + +## Period + + + ## PodDisruptionBudgets gathers the cluster's PodDisruptionBudgets. @@ -508,8 +569,10 @@ including one from license management pods with the following substring: **Conditional data**: This data is collected only if the "installers.datahub.sap.com" resource is found in the cluster. -The Kubernetes API https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/core/v1/pod_expansion.go#L48 -Response see https://docs.openshift.com/container-platform/4.6/rest_api/workloads_apis/pod-core-v1.html#apiv1namespacesnamespacepodsnamelog +The Kubernetes API: + https://github.com/kubernetes/client-go/blob/master/kubernetes/typed/core/v1/pod_expansion.go#L48 +Response see: + https://docs.openshift.com/container-platform/4.6/rest_api/workloads_apis/pod-core-v1.html#apiv1namespacesnamespacepodsnamelog * Location in archive: config/pod/{namespace}/logs/{pod-name}/errors.log * Since versions: @@ -535,6 +598,14 @@ Response see https://docs.openshift.com/container-platform/4.3/rest_api/index.ht * 4.7+ +## ShouldBeProcessed + + + +## SomeField + + + ## WorkloadInfo collects summarized info about the workloads on a cluster diff --git a/docs/insights-archive-sample/config/machinehealthchecks/openshift-machine-api/machine-api-termination-handler.json b/docs/insights-archive-sample/config/machinehealthchecks/openshift-machine-api/machine-api-termination-handler.json new file mode 100644 index 000000000..85a4c53fd --- /dev/null +++ b/docs/insights-archive-sample/config/machinehealthchecks/openshift-machine-api/machine-api-termination-handler.json @@ -0,0 +1,73 @@ +{ + "apiVersion": "machine.openshift.io/v1beta1", + "kind": "MachineHealthCheck", + "metadata": { + "annotations": { + "exclude.release.openshift.io/internal-openshift-hosted": "true", + "include.release.openshift.io/self-managed-high-availability": "true" + }, + "creationTimestamp": "2021-05-06T14:22:08Z", + "generation": 1, + "labels": { + "api": "clsuterapi", + "k8s-app": "termination-handler" + }, + "managedFields": [ + { + "apiVersion": "machine.openshift.io/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:metadata": { + "f:annotations": { + ".": {}, + "f:exclude.release.openshift.io/internal-openshift-hosted": {}, + "f:include.release.openshift.io/self-managed-high-availability": {} + }, + "f:labels": { + ".": {}, + "f:api": {}, + "f:k8s-app": {} + } + }, + "f:spec": { + ".": {}, + "f:maxUnhealthy": {}, + "f:nodeStartupTimeout": {}, + "f:selector": { + ".": {}, + "f:matchLabels": { + ".": {}, + "f:machine.openshift.io/interruptible-instance": {} + } + }, + "f:unhealthyConditions": {} + } + }, + "manager": "cluster-version-operator", + "operation": "Update", + "time": "2021-05-06T14:22:08Z" + } + ], + "name": "machine-api-termination-handler", + "namespace": "openshift-machine-api", + "resourceVersion": "7446", + "selfLink": "/apis/machine.openshift.io/v1beta1/namespaces/openshift-machine-api/machinehealthchecks/machine-api-termination-handler", + "uid": "7892d287-be94-4ae7-a65b-29dd78057e32" + }, + "spec": { + "maxUnhealthy": "100%", + "nodeStartupTimeout": "10m", + "selector": { + "matchLabels": { + "machine.openshift.io/interruptible-instance": "" + } + }, + "unhealthyConditions": [ + { + "status": "True", + "timeout": "0s", + "type": "Terminating" + } + ] + } +} \ No newline at end of file diff --git a/pkg/gatherers/clusterconfig/clusterconfig_gatherer.go b/pkg/gatherers/clusterconfig/clusterconfig_gatherer.go index d9bfe373d..15e589114 100644 --- a/pkg/gatherers/clusterconfig/clusterconfig_gatherer.go +++ b/pkg/gatherers/clusterconfig/clusterconfig_gatherer.go @@ -65,6 +65,7 @@ var gatheringFunctions = map[string]gatheringFunction{ "crds": importantFunc((*Gatherer).GatherCRD), "host_subnets": importantFunc((*Gatherer).GatherHostSubnet), "machine_sets": importantFunc((*Gatherer).GatherMachineSet), + "machine_healthchecks": importantFunc((*Gatherer).GatherMachineHealthCheck), "install_plans": importantFunc((*Gatherer).GatherInstallPlans), "service_accounts": importantFunc((*Gatherer).GatherServiceAccounts), "machine_config_pools": importantFunc((*Gatherer).GatherMachineConfigPool), diff --git a/pkg/gatherers/clusterconfig/machine_healthcheck.go b/pkg/gatherers/clusterconfig/machine_healthcheck.go new file mode 100644 index 000000000..506845685 --- /dev/null +++ b/pkg/gatherers/clusterconfig/machine_healthcheck.go @@ -0,0 +1,60 @@ +package clusterconfig + +import ( + "context" + "fmt" + + "github.com/openshift/insights-operator/pkg/record" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic" +) + +// GatherMachineHealthCheck collects MachineHealthCheck information +// +// The Kubernetes api: +// https://github.com/openshift/machine-api-operator/blob/master/pkg/generated/clientset/versioned/typed/machine/v1beta1/machinehealthcheck.go +// Response see: +// https://docs.openshift.com/container-platform/4.3/rest_api/index.html#machinehealthcheck-v1beta1-machine-openshift-io +// +// * Location in archive: config/machinehealthchecks +// * Id in config: machine_healthchecks +// * Since versions: +// * 4.4.29+ +// * 4.5.15+ +// * 4.6+ +func (g *Gatherer) GatherMachineHealthCheck(ctx context.Context) ([]record.Record, []error) { + dynamicClient, err := dynamic.NewForConfig(g.gatherKubeConfig) + if err != nil { + return nil, []error{err} + } + + return gatherMachineHealthCheck(ctx, dynamicClient) +} + +func gatherMachineHealthCheck(ctx context.Context, dynamicClient dynamic.Interface) ([]record.Record, []error) { + gvr := schema.GroupVersionResource{Group: "machine.openshift.io", Version: "v1beta1", Resource: "machinehealthchecks"} + machineHealthcheck, err := dynamicClient.Resource(gvr).List(ctx, metav1.ListOptions{}) + if errors.IsNotFound(err) { + return nil, nil + } + if err != nil { + return nil, []error{err} + } + + var records []record.Record + for _, i := range machineHealthcheck.Items { + recordName := fmt.Sprintf("config/machinehealthchecks/%s", i.GetName()) + if i.GetNamespace() != "" { + recordName = fmt.Sprintf("config/machinehealthchecks/%s/%s", i.GetNamespace(), i.GetName()) + } + records = append(records, record.Record{ + Name: recordName, + Item: record.JSONMarshaller{Object: i.Object}, + }) + + } + + return records, nil +} diff --git a/pkg/gatherers/clusterconfig/machine_healthcheck_test.go b/pkg/gatherers/clusterconfig/machine_healthcheck_test.go new file mode 100644 index 000000000..097bdbf29 --- /dev/null +++ b/pkg/gatherers/clusterconfig/machine_healthcheck_test.go @@ -0,0 +1,52 @@ +//nolint: dupl +package clusterconfig + +import ( + "context" + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/runtime/serializer/yaml" + dynamicfake "k8s.io/client-go/dynamic/fake" +) + +func Test_MachineHealthCheck_Gather(t *testing.T) { + var machineHealthCheckYAML = ` +apiVersion: machine.openshift.io/v1beta1 +kind: MachineHealthCheck +metadata: + name: test-machinehealthcheck +` + gvr := schema.GroupVersionResource{Group: "machine.openshift.io", Version: "v1beta1", Resource: "machinehealthchecks"} + client := dynamicfake.NewSimpleDynamicClientWithCustomListKinds(runtime.NewScheme(), map[schema.GroupVersionResource]string{ + gvr: "MachineHeachCheckList", + }) + decUnstructured := yaml.NewDecodingSerializer(unstructured.UnstructuredJSONScheme) + + testMachineHealthCheck := &unstructured.Unstructured{} + + _, _, err := decUnstructured.Decode([]byte(machineHealthCheckYAML), nil, testMachineHealthCheck) + if err != nil { + t.Fatal("unable to decode machinehealthcheck ", err) + } + _, err = client.Resource(gvr).Create(context.Background(), testMachineHealthCheck, metav1.CreateOptions{}) + if err != nil { + t.Fatal("unable to create fake machinehealthcheck ", err) + } + + ctx := context.Background() + records, errs := gatherMachineHealthCheck(ctx, client) + if len(errs) > 0 { + t.Errorf("unexpected errors: %#v", errs) + return + } + if len(records) != 1 { + t.Fatalf("unexpected number or records %d", len(records)) + } + if records[0].Name != "config/machinehealthchecks/test-machinehealthcheck" { + t.Fatalf("unexpected machinehealthcheck name %s", records[0].Name) + } +} From bdfe38d1008a1fc3cf4b1e111d2e67c0d60e9cb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20L=C3=BCders?= Date: Tue, 18 May 2021 09:39:10 +0200 Subject: [PATCH 2/2] Manual fixes to documentation --- docs/gathered-data.md | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/docs/gathered-data.md b/docs/gathered-data.md index b0c3af2b1..c72d92142 100644 --- a/docs/gathered-data.md +++ b/docs/gathered-data.md @@ -1,8 +1,5 @@ This document is auto-generated by `make gen-doc` -## 3Records - - ## CRD @@ -253,10 +250,6 @@ Response see: * 4.7+ -## Errors - - - ## HostSubnet collects HostSubnet information @@ -357,10 +350,6 @@ Gathered metrics: * Id in config: metrics -## Name - - - ## NetNamespace collects NetNamespaces networking information @@ -490,14 +479,6 @@ Docs for relevant types: https://pkg.go.dev/github.com/openshift/api/operatorcon * 4.8+ -## Panic - - - -## Period - - - ## PodDisruptionBudgets gathers the cluster's PodDisruptionBudgets. @@ -598,14 +579,6 @@ Response see https://docs.openshift.com/container-platform/4.3/rest_api/index.ht * 4.7+ -## ShouldBeProcessed - - - -## SomeField - - - ## WorkloadInfo collects summarized info about the workloads on a cluster