Commit 25b06fa

Add a script for updating files in the sample archive (#481)

* Add script for updating files in sample archive
* Add info about sample archive update script to README

1 parent 51e4523 · commit 25b06fa

File tree: 2 files changed (+162, -7 lines)

README.md (+53, -7)

````diff
@@ -4,24 +4,32 @@ This cluster operator gathers anonymized system configuration and reports it to
 
 # Table of Contents
 
+- [Insights Operator](#insights-operator)
+  - [Table of Contents](#table-of-contents)
 - [Building](#building)
 - [Testing](#testing)
 - [Documentation](#documentation)
 - [Getting metrics from Prometheus](#getting-metrics-from-prometheus)
   - [Generate the certificate and key](#generate-the-certificate-and-key)
   - [Prometheus metrics provided by Insights Operator](#prometheus-metrics-provided-by-insights-operator)
+  - [Running IO locally](#running-io-locally)
+  - [Running IO on K8s](#running-io-on-k8s)
   - [Getting the data directly from Prometheus](#getting-the-data-directly-from-prometheus)
   - [Debugging Prometheus metrics without valid CA](#debugging-prometheus-metrics-without-valid-ca)
 - [Debugging](#debugging)
   - [Using the profiler](#using-the-profiler)
+    - [Starting IO with the profiler](#starting-io-with-the-profiler)
+    - [Collect profiling data](#collect-profiling-data)
+    - [Analyzing profiling data](#analyzing-profiling-data)
 - [Changelog](#changelog)
   - [Updating the changelog](#updating-the-changelog)
 - [Reported data](#reported-data)
   - [Insights Operator Archive](#insights-operator-archive)
   - [Sample IO archive](#sample-io-archive)
     - [Generating a sample archive](#generating-a-sample-archive)
     - [Formatting archive json files](#formatting-archive-json-files)
     - [Obfuscating an archive](#obfuscating-an-archive)
+    - [Updating the sample archive](#updating-the-sample-archive)
 - [Contributing](#contributing)
 - [Support](#support)
 - [License](#license)
````
````diff
@@ -251,6 +259,44 @@ go run ./cmd/obfuscate-archive/main.go YOUR_ARCHIVE.tar.gz
 where `YOUR_ARCHIVE.tar.gz` is the path to the archive.
 The obfuscated version will be created in the same directory and called `YOUR_ARCHIVE-obfuscated.tar.gz`
 
+### Updating the sample archive
+
+The `docs/insights-archive-sample/` directory contains an example of an Insights
+Operator archive, extracted and with pretty-formatted JSON files.
+In case of any changes that affect multiple files in the archive, it is a good
+idea to regenerate the sample archive to make sure it remains up to date.
+
+There are two ways of updating the sample archive directory automatically.
+Both of them require running the Insights Operator, letting it generate an archive,
+and extracting the archive into an otherwise empty directory.
+
+The script will automatically replace existing files in the sample archive with
+their respective counterparts from the supplied extracted IO archive.
+In the case of files with (partially) randomized names, such as pods or nodes,
+the entire directory is deleted and replaced with a matching directory from
+the new archive if possible.
+Changes made by the script can be checked and reverted using Git.
+The updated JSON files will be automatically pretty-formatted using `jq`,
+which is the only dependency required for running the script.
+
+All existing files in the sample archive can be updated using the following command:
+
+```sh
+./scripts/update_sample_archive.sh <Path of directory with the NEW extracted IO archive>
+```
+
+If you only want to update files containing a certain string pattern,
+you can supply a regular expression as a second optional argument.
+For example, the following command was used to replace JSON files containing
+the `managedFields` field when it was removed from the IO archive to save space:
+
+```sh
+./scripts/update_sample_archive.sh <Path of directory with the NEW extracted IO archive> '"managedFields":'
+```
+
+The path of the sample archive directory is constant relative to
+the path of the script and therefore does not have to be specified explicitly.
+
 # Contributing
 
 See [CONTRIBUTING](CONTRIBUTING.md) for workflow & convention details.
````
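To get a feel for what the optional content-filter argument selects, the step can be imitated with plain `grep`. This is only an illustrative sketch: the temporary directory and file names below are made up and are not part of the repository.

```shell
# Sketch of how a content filter picks archive files (GNU grep assumed).
# The directory layout and file names here are hypothetical.
tmpdir=$(mktemp -d)
mkdir -p "$tmpdir/config/pod"
printf '{"metadata":{"managedFields":[]}}\n' > "$tmpdir/config/pod/example.json"
printf '{"metadata":{"name":"clean"}}\n' > "$tmpdir/config/version.json"

# Only files whose content matches the filter are listed, and therefore updated.
grep -rl '"managedFields":' "$tmpdir"

rm -rf "$tmpdir"
```

The update script applies the same idea with `grep -rn --include \*.json` over the sample archive directory, so the filter is matched against file contents, not file names.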

scripts/update_sample_archive.sh (new file, +109)

```sh
#!/bin/sh

# Please keep in mind that when the comments mention a "source archive",
# they are referring to a directory containing an _extracted_ IO archive.

if [ -z "$1" ]; then
    >&2 echo "Usage: update_sample_archive.sh <Extracted Archive Source Directory> [JSON Content Filter]"
    exit
fi

# This allows the JSON-finding function to read the filter from
# a global variable instead of having to pass it as an argument.
CONTENT_FILTER="$2"

# Get absolute path of the source IO archive.
SOURCE_PREFIX=$(realpath "$1")/

# Get absolute path of the IO sample archive directory.
SAMPLE_PREFIX=$(realpath "$(dirname "$0")/../docs/insights-archive-sample")/

# Escape dots and brackets (the most likely special characters found in paths)
# with backslashes to prevent breaking the regular expressions.
regexEscape() {
    echo "$1" | sed 's/[][)(}{\.]/\\\0/g'
}

# Escaped version of the directory paths ready to be used in regular expressions.
SOURCE_PREFIX_ESCAPED=$(regexEscape "$SOURCE_PREFIX")
SAMPLE_PREFIX_ESCAPED=$(regexEscape "$SAMPLE_PREFIX")

jq_update_file() {
    source_file="$SOURCE_PREFIX$1"
    if [ ! -f "$source_file" ]; then
        >&2 echo "[WARN] Unable to update file '$1' (file not found in the source archive)"
        return 1
    fi

    sample_file="$SAMPLE_PREFIX$1"
    mkdir -p "${sample_file%/*}"
    jq < "$source_file" > "$sample_file" || exit 1
    echo "[OK] $source_file --> $sample_file"
}

jq_update_dir() {
    source_dir="$SOURCE_PREFIX$1"
    if [ ! -d "$source_dir" ]; then
        >&2 echo "[WARN] Unable to update directory '$1' (directory not found in the source archive)"
        return 1
    fi

    sample_dir="$SAMPLE_PREFIX$1"
    # Delete the old JSON files.
    [ -d "$sample_dir" ] && find "$sample_dir" -name '*.json' -type f -delete
    # Copy and format JSON files from the source archive to the sample archive directory.
    find "$SOURCE_PREFIX$1" -name '*.json' | grep -oP "^${SOURCE_PREFIX_ESCAPED}\K.+" | sort | uniq | while read -r fname; do
        jq_update_file "$fname"
    done
}

# Expression used when looking for unique directories containing found files.
FIND_DIR_EXPR='/(?=[^/:]+'
# Expression used when looking for all found files.
FIND_FILE_EXPR='(?='

# If a content filter was provided, then all JSON files that match the filter in the existing sample archive directory are returned.
# Otherwise, a complete list of JSON files in the existing sample archive directory structure is returned.
# The first argument is used to switch between returning a list of files and a list of unique directories containing said files.
find_jsons() {
    if [ -z "$CONTENT_FILTER" ]; then
        # find "$SOURCE_PREFIX" -iname "*.json" | grep -oP "^${SOURCE_PREFIX_ESCAPED}\K[^:]+${1})" | sort | uniq
        find "$SAMPLE_PREFIX" -iname "*.json" | grep -oP "^${SAMPLE_PREFIX_ESCAPED}\K[^:]+${1})" | sort | uniq
    else
        grep -rn "$SAMPLE_PREFIX" --include \*.json -e "$CONTENT_FILTER" | grep -oP "^${SAMPLE_PREFIX_ESCAPED}\K[^:]+?${1}:)" | sort | uniq
    fi
}

# Return value indicating if the specified directory is known to contain files with randomized names.
# This function only checks the path prefix, which means that subdirectory/file paths can be checked as well.
contains_randomized_names() {
    case "$1" in
        config/certificatesigningrequests/*|\
        config/hostsubnet/*|\
        config/machineconfigs/*|\
        config/node/*|\
        config/persistentvolumes/*|\
        config/pod/*|\
        machinesets/*)
            true
            ;;

        *)
            false
            ;;
    esac
}

# If one of the resources in a directory contains a filter hit, the whole directory must be updated
# because some resource names are randomized and repeated sample archive updates would result in
# size inflation of the sample archive (i.e., more and more pod resource JSONs with each archive update).
# There is a list of directories which contain files with randomized names.
# Remaining directories are handled on a file-by-file basis.
find_jsons "$FIND_DIR_EXPR" | while read -r dir_name; do
    contains_randomized_names "$dir_name" && jq_update_dir "$dir_name"
done

# This handles the remaining files after the entire directories of resources have already been updated.
find_jsons "$FIND_FILE_EXPR" | while read -r file_name; do
    contains_randomized_names "$file_name" || jq_update_file "$file_name"
done
```
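The script's core idiom — escape a directory prefix for use in a regex, then drop it with `\K` in `grep -P` to get archive-relative paths — can be tried in isolation. This sketch assumes GNU grep and GNU sed; the archive path is made up, and it uses the portable `&` (whole match) in `sed` where the script uses the GNU-specific `\0`.

```shell
# Hypothetical absolute path of an extracted archive.
SOURCE_PREFIX="/tmp/extracted-archive/"

# Same escaping as the script's regexEscape, with `&` instead of `\0`.
SOURCE_PREFIX_ESCAPED=$(echo "$SOURCE_PREFIX" | sed 's/[][)(}{\.]/\\&/g')

# `\K` discards everything matched so far, leaving the archive-relative path.
echo "${SOURCE_PREFIX}config/pod/example.json" |
    grep -oP "^${SOURCE_PREFIX_ESCAPED}\K.+"
# → config/pod/example.json
```

This is why `FIND_DIR_EXPR` and `FIND_FILE_EXPR` are only regex fragments: they are appended after the `\K` so that either the containing directory or the full file path survives the match.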
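The prefix check for randomized-name directories can also be exercised on its own. The sketch below is a trimmed copy of the script's `contains_randomized_names`, kept to two of the real prefixes for brevity; because `case` glob patterns match across `/`, full file paths work as inputs too.

```shell
# Trimmed copy of the script's prefix check (two real prefixes only).
contains_randomized_names() {
    case "$1" in
        config/pod/*|\
        machinesets/*)
            true
            ;;
        *)
            false
            ;;
    esac
}

# A pod resource path is flagged as randomized; a fixed-name file is not.
contains_randomized_names "config/pod/openshift-monitoring/some-pod.json" && echo "randomized"
contains_randomized_names "config/version.json" || echo "stable name"
```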