Skip to content

Commit 6e04bc0

Browse files
authored
[tmpnet] Separate start of prometheus and promtail collectors (#3947)
1 parent 55d8f74 commit 6e04bc0

File tree

9 files changed

+254
-159
lines changed

9 files changed

+254
-159
lines changed

.github/actions/run-monitored-tmpnet-cmd/action.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,10 @@ runs:
6767
# --impure ensures the env vars are accessible to the command
6868
run: ${{ inputs.run_env }} ${{ github.action_path }}/nix-develop.sh --impure --command bash -x ${{ inputs.run }}
6969
env:
70-
TMPNET_START_COLLECTORS: ${{ inputs.prometheus_username != '' }}
71-
TMPNET_CHECK_MONITORING: ${{ inputs.prometheus_username != '' }}
70+
TMPNET_START_METRICS_COLLECTOR: ${{ inputs.prometheus_username != '' }}
71+
TMPNET_START_LOGS_COLLECTOR: ${{ inputs.loki_username != '' }}
72+
TMPNET_CHECK_METRICS_COLLECTED: ${{ inputs.prometheus_username != '' }}
73+
TMPNET_CHECK_LOGS_COLLECTED: ${{ inputs.loki_username != '' }}
7274
LOKI_USERNAME: ${{ inputs.loki_username }}
7375
LOKI_PASSWORD: ${{ inputs.loki_password }}
7476
PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }}

tests/e2e/README.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -113,9 +113,10 @@ E2E_SKIP_BOOTSTRAP_CHECKS=1 ./bin/ginkgo -v ./tests/e2e ...
113113
It is possible to enable collection of logs and metrics from the
114114
temporary networks used for e2e testing by:
115115

116-
- Supplying `--start-collectors` as an argument to the test suite
116+
- Supplying `--start-metrics-collector` and `--start-logs-collector`
117+
as arguments to the test suite
117118
- Starting collectors in advance of a test run with `tmpnetctl
118-
start-collectors`
119+
start-metrics-collector` and ` tmpnetctl start-logs-collector`
119120

120121
Both methods require:
121122

@@ -126,11 +127,11 @@ Both methods require:
126127
- `LOKI_PASSWORD`
127128
- The availability in the path of binaries for promtail and prometheus
128129
- Starting a development shell with `nix develop` is one way to
129-
ensure this and requires the [installation of
130-
nix](https://github.com/DeterminateSystems/nix-installer?tab=readme-ov-file#install-nix).
130+
ensure this and requires the installation of nix
131+
(e.g. `./scripts/run_task.sh install-nix`).
131132

132133
Once started, the collectors will continue to run in the background
133-
until stopped by `tmpnetctl stop-collectors`.
134+
until stopped by `tmpnetctl stop-metrics-collector` and `tmpnetctl stop-logs-collector`.
134135

135136
The results of collection will be viewable at
136137
https://grafana-poc.avax-dev.network.

tests/fixture/e2e/env.go

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -90,19 +90,36 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork
9090

9191
// Consider monitoring flags for any command but stop
9292
if networkCmd != StopNetworkCmd {
93-
if flagVars.StartCollectors() {
94-
require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log()))
93+
if flagVars.StartMetricsCollector() {
94+
require.NoError(tmpnet.StartPrometheus(tc.DefaultContext(), tc.Log()))
9595
}
96-
if flagVars.CheckMonitoring() {
97-
// Register cleanup before network start to ensure it runs after the network is stopped (LIFO)
96+
if flagVars.StartLogsCollector() {
97+
require.NoError(tmpnet.StartPromtail(tc.DefaultContext(), tc.Log()))
98+
}
99+
100+
// Register cleanups before network start to ensure they run after the network is stopped (LIFO)
101+
102+
if flagVars.CheckMetricsCollected() {
103+
tc.DeferCleanup(func() {
104+
if network == nil {
105+
tc.Log().Warn("unable to check that metrics were collected from an uninitialized network")
106+
return
107+
}
108+
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
109+
defer cancel()
110+
require.NoError(tmpnet.CheckMetricsExist(ctx, tc.Log(), network.UUID))
111+
})
112+
}
113+
114+
if flagVars.CheckLogsCollected() {
98115
tc.DeferCleanup(func() {
99116
if network == nil {
100-
tc.Log().Warn("unable to check that logs and metrics were collected from an uninitialized network")
117+
tc.Log().Warn("unable to check that logs were collected from an uninitialized network")
101118
return
102119
}
103120
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
104121
defer cancel()
105-
require.NoError(tmpnet.CheckMonitoring(ctx, tc.Log(), network.UUID))
122+
require.NoError(tmpnet.CheckLogsExist(ctx, tc.Log(), network.UUID))
106123
})
107124
}
108125
}
@@ -178,7 +195,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork
178195
}
179196

180197
// Once one or more nodes are running it should be safe to wait for promtail to report readiness
181-
if flagVars.StartCollectors() {
198+
if flagVars.StartLogsCollector() {
182199
require.NoError(tmpnet.WaitForPromtailReadiness(tc.DefaultContext(), tc.Log()))
183200
}
184201

tests/fixture/e2e/flags.go

Lines changed: 44 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,12 @@ type FlagVars struct {
3030
startNetwork bool
3131
startNetworkVars *flags.StartNetworkVars
3232

33-
startCollectors bool
34-
checkMonitoring bool
33+
// The collectors configured by these flags run as local processes
34+
startMetricsCollector bool
35+
startLogsCollector bool
36+
37+
checkMetricsCollected bool
38+
checkLogsCollected bool
3539

3640
networkDir string
3741
reuseNetwork bool
@@ -83,12 +87,20 @@ func (v *FlagVars) NodeRuntimeConfig() (*tmpnet.NodeRuntimeConfig, error) {
8387
return v.startNetworkVars.GetNodeRuntimeConfig()
8488
}
8589

86-
func (v *FlagVars) StartCollectors() bool {
87-
return v.startCollectors
90+
func (v *FlagVars) StartMetricsCollector() bool {
91+
return v.startMetricsCollector
92+
}
93+
94+
func (v *FlagVars) StartLogsCollector() bool {
95+
return v.startLogsCollector
96+
}
97+
98+
func (v *FlagVars) CheckMetricsCollected() bool {
99+
return v.checkMetricsCollected
88100
}
89101

90-
func (v *FlagVars) CheckMonitoring() bool {
91-
return v.checkMonitoring
102+
func (v *FlagVars) CheckLogsCollected() bool {
103+
return v.checkLogsCollected
92104
}
93105

94106
func (v *FlagVars) NetworkDir() string {
@@ -102,7 +114,7 @@ func (v *FlagVars) NetworkDir() string {
102114
}
103115

104116
func (v *FlagVars) NetworkShutdownDelay() time.Duration {
105-
if v.startCollectors {
117+
if v.StartMetricsCollector() {
106118
// Only return a non-zero value if we want to ensure the collectors have
107119
// a chance to collect the metrics at the end of the test.
108120
return tmpnet.NetworkShutdownDelay
@@ -131,8 +143,10 @@ func RegisterFlagsWithDefaultOwner(defaultOwner string) *FlagVars {
131143
vars.startNetworkVars = flags.NewStartNetworkFlagVars(defaultOwner)
132144

133145
SetMonitoringFlags(
134-
&vars.startCollectors,
135-
&vars.checkMonitoring,
146+
&vars.startMetricsCollector,
147+
&vars.startLogsCollector,
148+
&vars.checkMetricsCollected,
149+
&vars.checkLogsCollected,
136150
)
137151

138152
flag.StringVar(
@@ -173,18 +187,29 @@ func RegisterFlagsWithDefaultOwner(defaultOwner string) *FlagVars {
173187
return &vars
174188
}
175189

176-
// Enable reuse by the upgrade job
177-
func SetMonitoringFlags(startCollectors *bool, checkMonitoring *bool) {
190+
func SetMonitoringFlags(startMetricsCollector, startLogsCollector, checkMetricsCollected, checkLogsCollected *bool) {
191+
flag.BoolVar(
192+
startMetricsCollector,
193+
"start-metrics-collector",
194+
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_START_METRICS_COLLECTOR", "false")),
195+
"[optional] whether to start a local collector of metrics from nodes of the temporary network.",
196+
)
197+
flag.BoolVar(
198+
startLogsCollector,
199+
"start-logs-collector",
200+
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_START_LOGS_COLLECTOR", "false")),
201+
"[optional] whether to start a local collector of logs from nodes of the temporary network.",
202+
)
178203
flag.BoolVar(
179-
startCollectors,
180-
"start-collectors",
181-
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_START_COLLECTORS", "false")),
182-
"[optional] whether to start collectors of logs and metrics from nodes of the temporary network.",
204+
checkMetricsCollected,
205+
"check-metrics-collected",
206+
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_CHECK_METRICS_COLLECTED", "false")),
207+
"[optional] whether to check that metrics have been collected from nodes of the temporary network.",
183208
)
184209
flag.BoolVar(
185-
checkMonitoring,
186-
"check-monitoring",
187-
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_CHECK_MONITORING", "false")),
188-
"[optional] whether to check that logs and metrics have been collected from nodes of the temporary network.",
210+
checkLogsCollected,
211+
"check-logs-collected",
212+
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_CHECK_LOGS_COLLECTED", "false")),
213+
"[optional] whether to check that logs have been collected from nodes of the temporary network.",
189214
)
190215
}

tests/fixture/tmpnet/README.md

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,9 @@ orchestrate the same temporary networks without the use of an rpc daemon.
2828
- [Process details](#process-details)
2929
- [Monitoring](#monitoring)
3030
- [Example usage](#example-usage)
31-
- [Starting collectors](#starting-collectors)
32-
- [Stopping collectors](#stopping-collectors)
33-
- [Metrics collection](#metrics-collection)
34-
- [Log collection](#log-collection)
31+
- [Running collectors](#running-collectors)
32+
- [Metric collection configuration](#metric-collection-configuration)
33+
- [Log collection configuration](#log-collection-configuration)
3534
- [Labels](#labels)
3635
- [CI Collection](#ci-collection)
3736
- [Viewing](#viewing)
@@ -332,49 +331,50 @@ PROMETHEUS_USERNAME=<username> \
332331
PROMETHEUS_PASSWORD=<password> \
333332
LOKI_USERNAME=<username> \
334333
LOKI_PASSWORD=<password> \
335-
./bin/tmpnetctl start-collectors
334+
./bin/tmpnetctl start-metrics-collector
335+
./bin/tmpnetctl start-logs-collector
336336

337337
# Network start emits link to grafana displaying collected logs and metrics
338338
./bin/tmpnetctl start-network
339339

340340
# When done with the network, stop the collectors
341-
./bin/tmpnetctl stop-collectors
341+
./bin/tmpnetctl stop-metrics-collector
342+
./bin/tmpnetctl stop-logs-collector
342343
```
343344

344-
### Starting collectors
345+
### Running collectors
345346
[Top](#table-of-contents)
346347

347-
Collectors for logs and metrics can be started by `tmpnetctl
348-
start-collectors`:
349-
350-
- Requires that the following env vars be set
351-
- `PROMETHEUS_USERNAME`
352-
- `PROMETHEUS_PASSWORD`
353-
- `LOKI_USERNAME`
354-
- `LOKI_PASSWORD`
355-
- Requires that binaries for promtail and prometheus be available in the path
356-
- Starting a development shell with `nix develop` is one way to
357-
ensure this and requires the [installation of
358-
nix](https://github.com/DeterminateSystems/nix-installer?tab=readme-ov-file#install-nix).
359-
- Starts prometheus in agent mode configured to scrape metrics from
360-
configured nodes and forward them to
361-
https://prometheus-poc.avax-dev.network.
362-
- Starts promtail configured to collect logs from configured nodes
363-
and forward them to https://loki-poc.avax-dev.network.
364-
365-
### Stopping collectors
366-
367-
Collectors for logs and metrics can be stopped by `tmpnetctl
368-
stop-collectors`:
369-
370-
### Metrics collection
348+
- `tmpnetctl start-metrics-collector` starts `prometheus` in agent mode
349+
configured to scrape metrics from configured nodes and forward
350+
them to https://prometheus-poc.avax-dev.network.
351+
- Requires:
352+
- Credentials supplied as env vars:
353+
- `PROMETHEUS_USERNAME`
354+
- `PROMETHEUS_PASSWORD`
355+
- A `prometheus` binary available in the path
356+
- Once started, `prometheus` can be stopped by `tmpnetctl stop-metrics-collector`
357+
- `tmpnetctl start-logs-collector` starts `promtail` configured to collect logs
358+
from configured nodes and forward them to
359+
https://loki-poc.avax-dev.network.
360+
- Requires:
361+
- Credentials supplied as env vars:
362+
- `LOKI_USERNAME`
363+
- `LOKI_PASSWORD`
364+
- A `promtail` binary available in the path
365+
- Once started, `promtail` can be stopped by `tmpnetctl stop-logs-collector`
366+
- Starting a development shell with `nix develop` is one way to
367+
ensure availability of the necessary binaries and requires the
368+
installation of nix (e.g. `./scripts/run_task.sh install-nix`).
369+
370+
### Metric collection configuration
371371
[Top](#table-of-contents)
372372

373373
When a node is started, configuration enabling collection of metrics
374374
from the node is written to
375375
`~/.tmpnet/prometheus/file_sd_configs/[network uuid]-[node id].json`.
376376

377-
### Log collection
377+
### Log collection configuration
378378
[Top](#table-of-contents)
379379

380380
Nodes log are stored at `~/.tmpnet/networks/[network id]/[node

tests/fixture/tmpnet/check_monitoring.go

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,6 @@ import (
2828

2929
type getCountFunc func() (int, error)
3030

31-
// CheckMonitoring checks if logs and metrics exist for the given network. If no network
32-
// UUID is provided, an attempt will be made to derive selectors from env vars (GH_*)
33-
// identifying a github actions run.
34-
func CheckMonitoring(ctx context.Context, log logging.Logger, networkUUID string) error {
35-
return errors.Join(
36-
CheckLogsExist(ctx, log, networkUUID),
37-
CheckMetricsExist(ctx, log, networkUUID),
38-
)
39-
}
40-
4131
// waitForCount waits until the provided function returns greater than zero.
4232
func waitForCount(ctx context.Context, log logging.Logger, name string, getCount getCountFunc) error {
4333
err := pollUntilContextCancel(

0 commit comments

Comments
 (0)