Skip to content

Commit 2cc6e08

Browse files
committed
fixup: Strictly separate start and stop
1 parent 9f3afe7 commit 2cc6e08

File tree

9 files changed

+237
-196
lines changed

9 files changed

+237
-196
lines changed

.github/actions/run-monitored-tmpnet-cmd/action.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,10 @@ runs:
6767
# --impure ensures the env vars are accessible to the command
6868
run: ${{ inputs.run_env }} ${{ github.action_path }}/nix-develop.sh --impure --command bash -x ${{ inputs.run }}
6969
env:
70-
TMPNET_START_METRIC_COLLECTOR: ${{ inputs.prometheus_username != '' }}
71-
TMPNET_START_LOG_COLLECTOR: ${{ inputs.loki_username != '' }}
72-
TMPNET_CHECK_MONITORING: ${{ inputs.prometheus_username != '' }}
70+
TMPNET_START_METRICS_COLLECTOR: ${{ inputs.prometheus_username != '' }}
71+
TMPNET_START_LOGS_COLLECTOR: ${{ inputs.loki_username != '' }}
72+
TMPNET_CHECK_METRICS_COLLECTED: ${{ inputs.prometheus_username != '' }}
73+
TMPNET_CHECK_LOGS_COLLECTED: ${{ inputs.loki_username != '' }}
7374
LOKI_USERNAME: ${{ inputs.loki_username }}
7475
LOKI_PASSWORD: ${{ inputs.loki_password }}
7576
PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }}

tests/e2e/README.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -113,9 +113,10 @@ E2E_SKIP_BOOTSTRAP_CHECKS=1 ./bin/ginkgo -v ./tests/e2e ...
113113
It is possible to enable collection of logs and metrics from the
114114
temporary networks used for e2e testing by:
115115

116-
- Supplying `--start-collectors` as an argument to the test suite
116+
- Supplying `--start-metrics-collector` and `--start-logs-collector`
117+
as arguments to the test suite
117118
- Starting collectors in advance of a test run with `tmpnetctl
118-
start-collectors`
119+
start-metrics-collector` and ` tmpnetctl start-logs-collector`
119120

120121
Both methods require:
121122

@@ -126,11 +127,11 @@ Both methods require:
126127
- `LOKI_PASSWORD`
127128
- The availability in the path of binaries for promtail and prometheus
128129
- Starting a development shell with `nix develop` is one way to
129-
ensure this and requires the [installation of
130-
nix](https://github.com/DeterminateSystems/nix-installer?tab=readme-ov-file#install-nix).
130+
ensure this and requires the installation of nix
131+
(e.g. `./scripts/run_task.sh install-nix`).
131132

132133
Once started, the collectors will continue to run in the background
133-
until stopped by `tmpnetctl stop-collectors`.
134+
until stopped by `tmpnetctl stop-metrics-collector` and `tmpnetctl stop-logs-collector`.
134135

135136
The results of collection will be viewable at
136137
https://grafana-poc.avax-dev.network.

tests/fixture/e2e/env.go

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -90,22 +90,36 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork
9090

9191
// Consider monitoring flags for any command but stop
9292
if networkCmd != StopNetworkCmd {
93-
if flagVars.StartMetricCollector() {
93+
if flagVars.StartMetricsCollector() {
9494
require.NoError(tmpnet.StartPrometheus(tc.DefaultContext(), tc.Log()))
9595
}
96-
if flagVars.StartLogCollector() {
96+
if flagVars.StartLogsCollector() {
9797
require.NoError(tmpnet.StartPromtail(tc.DefaultContext(), tc.Log()))
9898
}
99-
if flagVars.CheckMonitoring() {
100-
// Register cleanup before network start to ensure it runs after the network is stopped (LIFO)
99+
100+
// Register cleanups before network start to ensure they run after the network is stopped (LIFO)
101+
102+
if flagVars.CheckMetricsCollected() {
103+
tc.DeferCleanup(func() {
104+
if network == nil {
105+
tc.Log().Warn("unable to check that metrics were collected from an uninitialized network")
106+
return
107+
}
108+
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
109+
defer cancel()
110+
require.NoError(tmpnet.CheckMetricsExist(ctx, tc.Log(), network.UUID))
111+
})
112+
}
113+
114+
if flagVars.CheckLogsCollected() {
101115
tc.DeferCleanup(func() {
102116
if network == nil {
103-
tc.Log().Warn("unable to check that logs and metrics were collected from an uninitialized network")
117+
tc.Log().Warn("unable to check that logs were collected from an uninitialized network")
104118
return
105119
}
106120
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
107121
defer cancel()
108-
require.NoError(tmpnet.CheckMonitoring(ctx, tc.Log(), network.UUID))
122+
require.NoError(tmpnet.CheckLogsExist(ctx, tc.Log(), network.UUID))
109123
})
110124
}
111125
}
@@ -181,7 +195,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork
181195
}
182196

183197
// Once one or more nodes are running it should be safe to wait for promtail to report readiness
184-
if flagVars.StartLogCollector() {
198+
if flagVars.StartLogsCollector() {
185199
require.NoError(tmpnet.WaitForPromtailReadiness(tc.DefaultContext(), tc.Log()))
186200
}
187201

tests/fixture/e2e/flags.go

Lines changed: 41 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,11 @@ type FlagVars struct {
3131
startNetworkVars *flags.StartNetworkVars
3232

3333
// The collectors configured by these flags run as local processes
34-
startCollectors bool
35-
startMetricCollector bool
36-
startLogCollector bool
34+
startMetricsCollector bool
35+
startLogsCollector bool
3736

38-
checkMonitoring bool
37+
checkMetricsCollected bool
38+
checkLogsCollected bool
3939

4040
networkDir string
4141
reuseNetwork bool
@@ -87,16 +87,20 @@ func (v *FlagVars) NodeRuntimeConfig() (*tmpnet.NodeRuntimeConfig, error) {
8787
return v.startNetworkVars.GetNodeRuntimeConfig()
8888
}
8989

90-
func (v *FlagVars) StartMetricCollector() bool {
91-
return v.startCollectors || v.startMetricCollector
90+
func (v *FlagVars) StartMetricsCollector() bool {
91+
return v.startMetricsCollector
9292
}
9393

94-
func (v *FlagVars) StartLogCollector() bool {
95-
return v.startCollectors || v.startLogCollector
94+
func (v *FlagVars) StartLogsCollector() bool {
95+
return v.startLogsCollector
9696
}
9797

98-
func (v *FlagVars) CheckMonitoring() bool {
99-
return v.checkMonitoring
98+
func (v *FlagVars) CheckMetricsCollected() bool {
99+
return v.checkMetricsCollected
100+
}
101+
102+
func (v *FlagVars) CheckLogsCollected() bool {
103+
return v.checkLogsCollected
100104
}
101105

102106
func (v *FlagVars) NetworkDir() string {
@@ -110,7 +114,7 @@ func (v *FlagVars) NetworkDir() string {
110114
}
111115

112116
func (v *FlagVars) NetworkShutdownDelay() time.Duration {
113-
if v.StartLogCollector() || v.StartMetricCollector() {
117+
if v.StartMetricsCollector() {
114118
// Only return a non-zero value if we want to ensure the collectors have
115119
// a chance to collect the metrics at the end of the test.
116120
return tmpnet.NetworkShutdownDelay
@@ -138,11 +142,11 @@ func RegisterFlagsWithDefaultOwner(defaultOwner string) *FlagVars {
138142

139143
vars.startNetworkVars = flags.NewStartNetworkFlagVars(defaultOwner)
140144

141-
SetAllMonitoringFlags(
142-
&vars.startCollectors,
143-
&vars.startMetricCollector,
144-
&vars.startLogCollector,
145-
&vars.checkMonitoring,
145+
SetMonitoringFlags(
146+
&vars.startMetricsCollector,
147+
&vars.startLogsCollector,
148+
&vars.checkMetricsCollected,
149+
&vars.checkLogsCollected,
146150
)
147151

148152
flag.StringVar(
@@ -183,44 +187,29 @@ func RegisterFlagsWithDefaultOwner(defaultOwner string) *FlagVars {
183187
return &vars
184188
}
185189

186-
// Enable reuse by the upgrade job which doesn't need fine-grained control over collector start since it never runs in kube.
187-
func SetSimpleMonitoringFlags(startCollectors *bool, checkMonitoring *bool) {
188-
SetAllMonitoringFlags(startCollectors, nil, nil, checkMonitoring)
189-
}
190-
191-
func SetAllMonitoringFlags(startCollectors *bool, startMetricCollector *bool, startLogCollector *bool, checkMonitoring *bool) {
192-
// Ensure the upgrade job is configured to start both collectors if they are individually configured
193-
defaultStartMetricCollector := cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_START_METRIC_COLLECTOR", "false"))
194-
defaultStartLogCollector := cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_START_LOG_COLLECTOR", "false"))
195-
defaultStartCollectors := cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_START_COLLECTORS", "false")) || (defaultStartMetricCollector && defaultStartLogCollector)
196-
190+
func SetMonitoringFlags(startMetricsCollector, startLogsCollector, checkMetricsCollected, checkLogsCollected *bool) {
197191
flag.BoolVar(
198-
startCollectors,
199-
"start-collectors",
200-
defaultStartCollectors,
201-
"[optional] whether to start local collectors of logs and metrics from nodes of the temporary network.",
192+
startMetricsCollector,
193+
"start-metrics-collector",
194+
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_START_METRICS_COLLECTOR", "false")),
195+
"[optional] whether to start a local collector of metrics from nodes of the temporary network.",
196+
)
197+
flag.BoolVar(
198+
startLogsCollector,
199+
"start-logs-collector",
200+
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_START_LOGS_COLLECTOR", "false")),
201+
"[optional] whether to start a local collector of logs from nodes of the temporary network.",
202+
)
203+
flag.BoolVar(
204+
checkMetricsCollected,
205+
"check-metrics-collected",
206+
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_CHECK_METRICS_COLLECTED", "false")),
207+
"[optional] whether to check that metrics have been collected from nodes of the temporary network.",
202208
)
203-
// These 2 flags are not used by the upgrade job
204-
if startMetricCollector != nil {
205-
flag.BoolVar(
206-
startMetricCollector,
207-
"start-metric-collector",
208-
defaultStartMetricCollector,
209-
"[optional] whether to start a local collector of metrics from nodes of the temporary network. Can also be enabled by --start-collectors.",
210-
)
211-
}
212-
if startLogCollector != nil {
213-
flag.BoolVar(
214-
startMetricCollector,
215-
"start-log-collector",
216-
defaultStartLogCollector,
217-
"[optional] whether to start a local collector of metrics from nodes of the temporary network. Can also be enabled by --start-collectors.",
218-
)
219-
}
220209
flag.BoolVar(
221-
checkMonitoring,
222-
"check-monitoring",
223-
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_CHECK_MONITORING", "false")),
224-
"[optional] whether to check that logs and metrics have been collected from nodes of the temporary network.",
210+
checkLogsCollected,
211+
"check-logs-collected",
212+
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_CHECK_LOGS_COLLECTED", "false")),
213+
"[optional] whether to check that logs have been collected from nodes of the temporary network.",
225214
)
226215
}

tests/fixture/tmpnet/README.md

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,9 @@ orchestrate the same temporary networks without the use of an rpc daemon.
2828
- [Process details](#process-details)
2929
- [Monitoring](#monitoring)
3030
- [Example usage](#example-usage)
31-
- [Starting collectors](#starting-collectors)
32-
- [Stopping collectors](#stopping-collectors)
33-
- [Metrics collection](#metrics-collection)
34-
- [Log collection](#log-collection)
31+
- [Running collectors](#running-collectors)
32+
- [Metric collection configuration](#metric-collection-configuration)
33+
- [Log collection configuration](#log-collection-configuration)
3534
- [Labels](#labels)
3635
- [CI Collection](#ci-collection)
3736
- [Viewing](#viewing)
@@ -332,49 +331,50 @@ PROMETHEUS_USERNAME=<username> \
332331
PROMETHEUS_PASSWORD=<password> \
333332
LOKI_USERNAME=<username> \
334333
LOKI_PASSWORD=<password> \
335-
./bin/tmpnetctl start-collectors
334+
./bin/tmpnetctl start-metrics-collector
335+
./bin/tmpnetctl start-logs-collector
336336

337337
# Network start emits link to grafana displaying collected logs and metrics
338338
./bin/tmpnetctl start-network
339339

340340
# When done with the network, stop the collectors
341-
./bin/tmpnetctl stop-collectors
341+
./bin/tmpnetctl stop-metrics-collector
342+
./bin/tmpnetctl stop-logs-collector
342343
```
343344

344-
### Starting collectors
345+
### Running collectors
345346
[Top](#table-of-contents)
346347

347-
Collectors for logs and metrics can be started by `tmpnetctl
348-
start-collectors`:
349-
350-
- Requires that the following env vars be set
351-
- `PROMETHEUS_USERNAME`
352-
- `PROMETHEUS_PASSWORD`
353-
- `LOKI_USERNAME`
354-
- `LOKI_PASSWORD`
355-
- Requires that binaries for promtail and prometheus be available in the path
356-
- Starting a development shell with `nix develop` is one way to
357-
ensure this and requires the [installation of
358-
nix](https://github.com/DeterminateSystems/nix-installer?tab=readme-ov-file#install-nix).
359-
- Starts prometheus in agent mode configured to scrape metrics from
360-
configured nodes and forward them to
361-
https://prometheus-poc.avax-dev.network.
362-
- Starts promtail configured to collect logs from configured nodes
363-
and forward them to https://loki-poc.avax-dev.network.
364-
365-
### Stopping collectors
366-
367-
Collectors for logs and metrics can be stopped by `tmpnetctl
368-
stop-collectors`:
369-
370-
### Metrics collection
348+
- `tmpnetctl start-metrics-collector` starts `prometheus` in agent mode
349+
configured to scrape metrics from configured nodes and forward
350+
them to https://prometheus-poc.avax-dev.network.
351+
- Requires:
352+
- Credentials supplied as env vars:
353+
- `PROMETHEUS_USERNAME`
354+
- `PROMETHEUS_PASSWORD`
355+
- A `prometheus` binary available in the path
356+
- Once started, `prometheus` can be stopped by `tmpnetctl stop-metrics-collector`
357+
- `tmpnetctl start-logs-collector` starts `promtail` configured to collect logs
358+
from configured nodes and forward them to
359+
https://loki-poc.avax-dev.network.
360+
- Requires:
361+
- Credentials supplied as env vars:
362+
- `LOKI_USERNAME`
363+
- `LOKI_PASSWORD`
364+
- A `promtail` binary available in the path
365+
- Once started, `promtail` can be stopped by `tmpnetctl stop-logs-collector`
366+
- Starting a development shell with `nix develop` is one way to
367+
ensure availability of the necessary binaries and requires the
368+
installation of nix (e.g. `./scripts/run_task.sh install-nix`).
369+
370+
### Metric collection configuration
371371
[Top](#table-of-contents)
372372

373373
When a node is started, configuration enabling collection of metrics
374374
from the node is written to
375375
`~/.tmpnet/prometheus/file_sd_configs/[network uuid]-[node id].json`.
376376

377-
### Log collection
377+
### Log collection configuration
378378
[Top](#table-of-contents)
379379

380380
Nodes log are stored at `~/.tmpnet/networks/[network id]/[node

tests/fixture/tmpnet/check_monitoring.go

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,6 @@ import (
2828

2929
type getCountFunc func() (int, error)
3030

31-
// CheckMonitoring checks if logs and metrics exist for the given network. If no network
32-
// UUID is provided, an attempt will be made to derive selectors from env vars (GH_*)
33-
// identifying a github actions run.
34-
func CheckMonitoring(ctx context.Context, log logging.Logger, networkUUID string) error {
35-
return errors.Join(
36-
CheckLogsExist(ctx, log, networkUUID),
37-
CheckMetricsExist(ctx, log, networkUUID),
38-
)
39-
}
40-
4131
// waitForCount waits until the provided function returns greater than zero.
4232
func waitForCount(ctx context.Context, log logging.Logger, name string, getCount getCountFunc) error {
4333
err := pollUntilContextCancel(

0 commit comments

Comments
 (0)