Skip to content

[ci] Move monitoring check from github action to code #3766

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 1 addition & 27 deletions .github/actions/run-monitored-tmpnet-cmd/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ runs:
run: ${{ inputs.run_env }} nix develop --impure --command bash -x ${{ inputs.run }}
env:
TMPNET_START_COLLECTORS: ${{ inputs.prometheus_username != '' }}
TMPNET_CHECK_MONITORING: ${{ inputs.prometheus_username != '' }}
LOKI_USERNAME: ${{ inputs.loki_username }}
LOKI_PASSWORD: ${{ inputs.loki_password }}
PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }}
Expand All @@ -92,30 +93,3 @@ runs:
~/.tmpnet/prometheus/prometheus.log
~/.tmpnet/promtail/promtail.log
if-no-files-found: error
# TODO(marun) Maybe optionally run these checks in an AfterSuite step?
- name: Check that logs were collected
if: (inputs.prometheus_username != '')
shell: bash
run: go run github.com/ava-labs/avalanchego/tests/fixture/tmpnet/cmd check-logs
env:
LOKI_USERNAME: ${{ inputs.loki_username }}
LOKI_PASSWORD: ${{ inputs.loki_password }}
GH_REPO: ${{ inputs.repository_owner }}/${{ inputs.repository_name }}
GH_WORKFLOW: ${{ inputs.workflow }}
GH_RUN_ID: ${{ inputs.run_id }}
GH_RUN_NUMBER: ${{ inputs.run_number }}
GH_RUN_ATTEMPT: ${{ inputs.run_attempt }}
GH_JOB_ID: ${{ inputs.job }}
- name: Check that metrics were collected
if: (inputs.prometheus_username != '')
shell: bash
run: go run github.com/ava-labs/avalanchego/tests/fixture/tmpnet/cmd check-metrics
env:
PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }}
PROMETHEUS_PASSWORD: ${{ inputs.prometheus_password }}
GH_REPO: ${{ inputs.repository_owner }}/${{ inputs.repository_name }}
GH_WORKFLOW: ${{ inputs.workflow }}
GH_RUN_ID: ${{ inputs.run_id }}
GH_RUN_NUMBER: ${{ inputs.run_number }}
GH_RUN_ATTEMPT: ${{ inputs.run_attempt }}
GH_JOB_ID: ${{ inputs.job }}
24 changes: 20 additions & 4 deletions tests/fixture/e2e/env.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package e2e

import (
"context"
"encoding/json"
"errors"
"math/rand"
Expand Down Expand Up @@ -79,12 +80,27 @@ func (te *TestEnvironment) Marshal() []byte {
func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork *tmpnet.Network) *TestEnvironment {
require := require.New(tc)

// Start collectors for any command but stop
if flagVars.StartCollectors() && !flagVars.StopNetwork() {
require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log()))
var network *tmpnet.Network

// Consider monitoring flags for any command but stop
if !flagVars.StopNetwork() {
if flagVars.StartCollectors() {
require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log()))
}
if flagVars.CheckMonitoring() {
// Register cleanup before network start to ensure it runs after the network is stopped (LIFO)
tc.DeferCleanup(func() {
if network == nil {
tc.Log().Warn("unable to check that logs and metrics were collected from an uninitialized network")
return
}
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
defer cancel()
require.NoError(tmpnet.CheckMonitoring(ctx, tc.Log(), network.UUID))
})
}
}

var network *tmpnet.Network
// Need to load the network if it is being stopped or reused
if flagVars.StopNetwork() || flagVars.ReuseNetwork() {
networkDir := flagVars.NetworkDir()
Expand Down
20 changes: 17 additions & 3 deletions tests/fixture/e2e/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ type FlagVars struct {
networkDir string
reuseNetwork bool
startCollectors bool
checkMonitoring bool
startNetwork bool
stopNetwork bool
restartNetwork bool
Expand Down Expand Up @@ -77,6 +78,10 @@ func (v *FlagVars) StartCollectors() bool {
return v.startCollectors
}

// CheckMonitoring reports whether the --check-monitoring flag is set, i.e.
// whether the caller should check that logs and metrics have been collected
// from nodes of the temporary network. The flag's default is taken from the
// TMPNET_CHECK_MONITORING env var.
func (v *FlagVars) CheckMonitoring() bool {
return v.checkMonitoring
}

func (v *FlagVars) NetworkShutdownDelay() time.Duration {
if v.startCollectors {
// Only return a non-zero value if we want to ensure the collectors have
Expand Down Expand Up @@ -140,7 +145,10 @@ func RegisterFlags() *FlagVars {
false,
"[optional] restart an existing network previously started with --reuse-network. Useful for ensuring a network is running with the current state of binaries on disk. Ignored if a network is not already running or --stop-network is provided.",
)
SetStartCollectorsFlag(&vars.startCollectors)
SetMonitoringFlags(
&vars.startCollectors,
&vars.checkMonitoring,
)
flag.BoolVar(
&vars.startNetwork,
"start-network",
Expand Down Expand Up @@ -170,11 +178,17 @@ func RegisterFlags() *FlagVars {
}

// Enable reuse by the upgrade job
func SetStartCollectorsFlag(p *bool) {
func SetMonitoringFlags(startCollectors *bool, checkMonitoring *bool) {
flag.BoolVar(
p,
startCollectors,
"start-collectors",
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_START_COLLECTORS", "false")),
"[optional] whether to start collectors of logs and metrics from nodes of the temporary network.",
)
flag.BoolVar(
checkMonitoring,
"check-monitoring",
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_CHECK_MONITORING", "false")),
"[optional] whether to check that logs and metrics have been collected from nodes of the temporary network.",
)
}
28 changes: 23 additions & 5 deletions tests/fixture/tmpnet/check_monitoring.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,16 @@ import (

type getCountFunc func() (int, error)

// CheckMonitoring verifies that both logs and metrics were collected for the
// given network. The log check runs first, then the metrics check; both are
// always executed and their errors are combined, so the result is nil only
// when both checks succeed. If no network UUID is provided, an attempt will
// be made to derive selectors from env vars (GH_*) identifying a github
// actions run.
func CheckMonitoring(ctx context.Context, log logging.Logger, networkUUID string) error {
	logsErr := CheckLogsExist(ctx, log, networkUUID)
	metricsErr := CheckMetricsExist(ctx, log, networkUUID)
	return errors.Join(logsErr, metricsErr)
}

// waitForCount waits until the provided function returns greater than zero.
func waitForCount(ctx context.Context, log logging.Logger, name string, getCount getCountFunc) error {
err := pollUntilContextCancel(
Expand Down Expand Up @@ -55,8 +65,9 @@ func waitForCount(ctx context.Context, log logging.Logger, name string, getCount
return nil
}

// CheckLogsExist checks if logs exist for the given network. Github labels are also
// included if provided as env vars (GH_*).
// CheckLogsExist checks if logs exist for the given network. If no network UUID is
// provided, an attempt will be made to derive selectors from env vars (GH_*) identifying
// a github actions run.
func CheckLogsExist(ctx context.Context, log logging.Logger, networkUUID string) error {
username, password, err := getCollectorCredentials(promtailCmd)
if err != nil {
Expand Down Expand Up @@ -163,7 +174,7 @@ func queryLoki(
}

// CheckMetricsExist checks if metrics exist for the given network. Github labels are also
// included if provided as env vars (GH_*).
// used as filters if provided as env vars (GH_*) and no network UUID is given.
func CheckMetricsExist(ctx context.Context, log logging.Logger, networkUUID string) error {
username, password, err := getCollectorCredentials(prometheusCmd)
if err != nil {
Expand Down Expand Up @@ -253,10 +264,13 @@ func (b *basicAuthRoundTripper) RoundTrip(req *http.Request) (*http.Response, er

// getSelectors returns the comma-separated list of selectors.
func getSelectors(networkUUID string) (string, error) {
selectors := []string{}
// If network UUID is provided, use it as the only selector
if len(networkUUID) > 0 {
selectors = append(selectors, fmt.Sprintf(`network_uuid="%s"`, networkUUID))
return fmt.Sprintf(`network_uuid="%s"`, networkUUID), nil
}

// Fall back to using Github labels as selectors
selectors := []string{}
githubLabels := githubLabelsFromEnv()
for label := range githubLabels {
value, err := githubLabels.GetStringVal(label)
Expand All @@ -268,5 +282,9 @@ func getSelectors(networkUUID string) (string, error) {
}
selectors = append(selectors, fmt.Sprintf(`%s="%s"`, label, value))
}
if len(selectors) == 0 {
return "", errors.New("no GH_* env vars set to use for selectors")
}

return strings.Join(selectors, ","), nil
}
1 change: 1 addition & 0 deletions tests/fixture/tmpnet/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ type Network struct {

func NewDefaultNetwork(owner string) *Network {
return &Network{
UUID: uuid.NewString(),
Owner: owner,
Nodes: NewNodesOrPanic(DefaultNodeCount),
}
Expand Down
16 changes: 15 additions & 1 deletion tests/upgrade/upgrade_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package upgrade

import (
"context"
"flag"
"fmt"
"testing"
Expand All @@ -24,6 +25,7 @@ var (
avalancheGoExecPath string
avalancheGoExecPathToUpgradeTo string
startCollectors bool
checkMonitoring bool
)

func init() {
Expand All @@ -39,7 +41,10 @@ func init() {
"",
"avalanchego executable path to upgrade to",
)
e2e.SetStartCollectorsFlag(&startCollectors)
e2e.SetMonitoringFlags(
&startCollectors,
&checkMonitoring,
)
}

var _ = ginkgo.Describe("[Upgrade]", func() {
Expand All @@ -59,6 +64,15 @@ var _ = ginkgo.Describe("[Upgrade]", func() {
require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log()))
shutdownDelay = tmpnet.NetworkShutdownDelay // Ensure a final metrics scrape
}
if checkMonitoring {
// Since cleanups are run in LIFO order, adding this cleanup before
// StartNetwork is called ensures network shutdown will be called first.
tc.DeferCleanup(func() {
ctx, cancel := context.WithTimeout(context.Background(), e2e.DefaultTimeout)
defer cancel()
require.NoError(tmpnet.CheckMonitoring(ctx, tc.Log(), network.UUID))
})
}

e2e.StartNetwork(
tc,
Expand Down
Loading