Commit 1ab34a5
MON-4057: Expose scrapeInterval setting for UWM Prometheus
Allows configuring the scrape interval between 5s and 5m via the `prometheus` field of the `user-workload-monitoring-config` ConfigMap in the `openshift-user-workload-monitoring` namespace.

Signed-off-by: Jayapriya Pai <[email protected]>
1 parent 9018920 commit 1ab34a5
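
For illustration, opting into the new setting would look something like this in the UWM ConfigMap (a minimal sketch; `15s` is an arbitrary in-range value, and `config.yaml` is the key the operator reads UWM configuration from):

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: user-workload-monitoring-config
  namespace: openshift-user-workload-monitoring
data:
  config.yaml: |
    prometheus:
      scrapeInterval: 15s
```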

10 files changed: +133 −2 lines

CHANGELOG.md (+4)

@@ -1,5 +1,9 @@
 # Note: This CHANGELOG is only for the monitoring team to track all monitoring related changes. Please see OpenShift release notes for official changes.
 
+## 4.18
+
+- [#2503](https://github.com/openshift/cluster-monitoring-operator/issues/2503) Expose `scrapeInterval` setting for UWM Prometheus.
+
 ## 4.17
 
 - [#2409](https://github.com/openshift/cluster-monitoring-operator/issues/2409) Remove prometheus-adapter code from CMO

Documentation/api.md (+1)

@@ -484,6 +484,7 @@ The `PrometheusRestrictedConfig` resource defines the settings for the Prometheu
 
 | Property | Type | Description |
 | -------- | ---- | ----------- |
+| scrapeInterval | string | Configures the default interval between consecutive scrapes in case the ServiceMonitor or PodMonitor resource does not specify any value. The default value is `30s`. |
 | additionalAlertmanagerConfigs | [][AdditionalAlertmanagerConfig](#additionalalertmanagerconfig) | Configures additional Alertmanager instances that receive alerts from the Prometheus component. By default, no additional Alertmanager instances are configured. |
 | enforcedLabelLimit | *uint64 | Specifies a per-scrape limit on the number of labels accepted for a sample. If the number of labels exceeds this limit after metric relabeling, the entire scrape is treated as failed. The default value is `0`, which means that no limit is set. |
 | enforcedLabelNameLengthLimit | *uint64 | Specifies a per-scrape limit on the length of a label name for a sample. If the length of a label name exceeds this limit after metric relabeling, the entire scrape is treated as failed. The default value is `0`, which means that no limit is set. |

Documentation/openshiftdocs/modules/prometheusrestrictedconfig.adoc (+2)

@@ -18,6 +18,8 @@ Appears in: link:userworkloadconfiguration.adoc[UserWorkloadConfiguration]
 [options="header"]
 |===
 | Property | Type | Description
+|scrapeInterval|string|Configures the default interval between consecutive scrapes in case the ServiceMonitor or PodMonitor resource does not specify any value. The default value is `30s`.
+
 |additionalAlertmanagerConfigs|[]link:additionalalertmanagerconfig.adoc[AdditionalAlertmanagerConfig]|Configures additional Alertmanager instances that receive alerts from the Prometheus component. By default, no additional Alertmanager instances are configured.
 
 |enforcedLabelLimit|*uint64|Specifies a per-scrape limit on the number of labels accepted for a sample. If the number of labels exceeds this limit after metric relabeling, the entire scrape is treated as failed. The default value is `0`, which means that no limit is set.

hack/build-jsonnet.sh (+1 −1)

@@ -37,7 +37,7 @@ for file in "${files[@]}"; do
     }&
 
     # wait for at least one of the jobs to finish if there are more than maxProc jobs
-    while [[ $(jobs -r | wc -l ) -ge "$maxProc" ]]; do wait -n; done
+    while [[ $(jobs -r | wc -l ) -ge "$maxProc" ]]; do wait; done
 done
 # wait for all jobs to finish
 wait
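
A note on this hunk: `wait -n` (bash 4.3+) returns as soon as any one background job exits, while plain `wait` blocks until all of them do, so the loop now drains the whole batch before launching more work. A minimal sketch of the difference (the `sleep` jobs are purely illustrative):

```bash
#!/usr/bin/env bash
# Three background jobs of different lengths.
sleep 1 & sleep 2 & sleep 3 &

wait -n   # returns after ~1s, when the first job finishes (bash 4.3+)
wait      # returns only once all remaining jobs have exited (~3s total)
```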

pkg/manifests/config.go (+25)

@@ -26,6 +26,7 @@ import (
 
     "github.com/alecthomas/units"
     configv1 "github.com/openshift/api/config/v1"
+    "github.com/prometheus/common/model"
     v1 "k8s.io/api/core/v1"
     k8syaml "k8s.io/apimachinery/pkg/util/yaml"
     auditv1 "k8s.io/apiserver/pkg/apis/audit/v1"

@@ -126,6 +127,27 @@ func (c Config) GetThanosRulerAlertmanagerConfigs() []AdditionalAlertmanagerConf
     return alertmanagerConfigs
 }
 
+func (c Config) HasInvalidScrapeIntervalDuration() bool {
+    if c.ClusterMonitoringConfiguration == nil || c.UserWorkloadConfiguration == nil {
+        return false
+    }
+
+    if c.UserWorkloadConfiguration.Prometheus == nil || c.UserWorkloadConfiguration.Prometheus.ScrapeInterval == "" {
+        return false
+    }
+
+    scrapeInterval, err := model.ParseDuration(c.UserWorkloadConfiguration.Prometheus.ScrapeInterval)
+    if err != nil {
+        return true
+    }
+
+    allowedLowerLimit, _ := model.ParseDuration("5s")
+    allowedUpperLimit, _ := model.ParseDuration("5m")
+
+    return (scrapeInterval < allowedLowerLimit) || (scrapeInterval > allowedUpperLimit)
+}
+
 type Images struct {
     MetricsServer string
     PromLabelProxy string

@@ -436,6 +458,9 @@ func (c *Config) LoadEnforcedBodySizeLimit(pcr PodCapacityReader, ctx context.Co
 }
 
 func (c *Config) Precheck() error {
+    if c.HasInvalidScrapeIntervalDuration() {
+        return fmt.Errorf("%w: scrapeInterval specified should be between 5s and 5m", ErrUserWorkloadInvalidConfiguration)
+    }
     if c.ClusterMonitoringConfiguration.PrometheusK8sConfig.CollectionProfile != FullCollectionProfile && !c.CollectionProfilesFeatureGateEnabled {
         return fmt.Errorf("%w: collectionProfiles is currently a TechPreview feature behind the \"MetricsCollectionProfiles\" feature-gate, to be able to use a profile different from the default (\"full\") please enable it first", ErrConfigValidation)
     }
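
For intuition, `model.ParseDuration` from `github.com/prometheus/common` accepts Prometheus-style duration strings such as `15s` or `2h` and rejects garbage like `1234www`; the precheck simply parses the value and bounds it to [5s, 5m]. A standalone sketch of that logic (the `invalidScrapeInterval` helper name is ours, not the commit's):

```go
package main

import (
	"fmt"

	"github.com/prometheus/common/model"
)

// invalidScrapeInterval mirrors the bounds logic in the commit:
// unparsable values and values outside [5s, 5m] are invalid.
func invalidScrapeInterval(interval string) bool {
	d, err := model.ParseDuration(interval)
	if err != nil {
		return true // not a valid Prometheus duration string
	}
	lower, _ := model.ParseDuration("5s")
	upper, _ := model.ParseDuration("5m")
	return d < lower || d > upper
}

func main() {
	for _, in := range []string{"15s", "2s", "10m", "1234www"} {
		fmt.Printf("%-8s invalid=%v\n", in, invalidScrapeInterval(in))
	}
	// Output:
	// 15s      invalid=false
	// 2s       invalid=true
	// 10m      invalid=true
	// 1234www  invalid=true
}
```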

pkg/manifests/config_test.go (+55)

@@ -318,6 +318,61 @@ func TestLoadEnforcedBodySizeLimit(t *testing.T) {
     }
 }
 
+func TestScrapeIntervalUWMPreCheck(t *testing.T) {
+    for _, tc := range []struct {
+        name          string
+        uwmconfig     string
+        expectedError bool
+    }{
+        {
+            name:          "default",
+            uwmconfig:     "",
+            expectedError: false,
+        },
+        {
+            name: "scrapeInterval valid within limits",
+            uwmconfig: `prometheus:
+  scrapeInterval: 15s
+`,
+            expectedError: false,
+        },
+        {
+            name: "scrapeInterval < allowed lower limit",
+            uwmconfig: `prometheus:
+  scrapeInterval: 2s
+`,
+            expectedError: true,
+        },
+        {
+            name: "scrapeInterval > allowed upper limit",
+            uwmconfig: `prometheus:
+  scrapeInterval: 10m
+`,
+            expectedError: true,
+        },
+        {
+            name: "incorrect scrape interval value",
+            uwmconfig: `prometheus:
+  scrapeInterval: 1234www
+`,
+            expectedError: true,
+        },
+    } {
+        t.Run(tc.name, func(t *testing.T) {
+            c := NewDefaultConfig()
+            uwc, err := NewUserConfigFromString(tc.uwmconfig)
+            require.NoError(t, err)
+            c.UserWorkloadConfiguration = uwc
+            err = c.Precheck()
+            if err != nil && tc.expectedError {
+                return
+            }
+
+            require.NoError(t, err)
+        })
+    }
+}
+
 func TestCollectionProfilePreCheck(t *testing.T) {
     for _, tc := range []struct {
         name string

pkg/manifests/manifests.go (+6 −1)

@@ -329,7 +329,8 @@ var (
 )
 
 var (
-    ErrConfigValidation = fmt.Errorf("invalid value for config")
+    ErrConfigValidation                 = fmt.Errorf("invalid value for config")
+    ErrUserWorkloadInvalidConfiguration = fmt.Errorf("invalid value for user-workload config")
 )
 
 type Factory struct {

@@ -1658,6 +1659,10 @@ func (f *Factory) PrometheusUserWorkload(grpcTLS *v1.Secret) (*monv1.Prometheus,
     if err != nil {
         return nil, err
     }
+    if f.config.UserWorkloadConfiguration.Prometheus.ScrapeInterval != "" {
+        p.Spec.ScrapeInterval = monv1.Duration(f.config.UserWorkloadConfiguration.Prometheus.ScrapeInterval)
+    }
+
     if f.config.UserWorkloadConfiguration.Prometheus.LogLevel != "" {
         p.Spec.LogLevel = f.config.UserWorkloadConfiguration.Prometheus.LogLevel
     }
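
Once the factory applies the value, the generated UWM Prometheus custom resource carries it in its spec; abridged, the result would look roughly like:

```yaml
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: user-workload
  namespace: openshift-user-workload-monitoring
spec:
  scrapeInterval: 15s
```

The e2e assertion added below polls exactly this object until `spec.scrapeInterval` matches.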

pkg/manifests/manifests_test.go (+5)

@@ -1643,6 +1643,7 @@ func TestPrometheusUserWorkloadConfiguration(t *testing.T) {
     c := NewDefaultConfig()
 
     uwc, err := NewUserConfigFromString(`prometheus:
+  scrapeInterval: 15s
   resources:
     requests:
       cpu: 100m

@@ -1670,6 +1671,10 @@ func TestPrometheusUserWorkloadConfiguration(t *testing.T) {
         t.Fatal(err)
     }
 
+    if p.Spec.ScrapeInterval != "15s" {
+        t.Fatal("Prometheus UWM scrapeInterval not configured correctly")
+    }
+
     if p.Spec.TopologySpreadConstraints[0].MaxSkew != 1 {
         t.Fatal("Prometheus UWM spread constraints MaxSkew not configured correctly")
     }

pkg/manifests/types.go (+3)

@@ -601,6 +601,9 @@ type AlertmanagerUserWorkloadConfig struct {
 // The `PrometheusRestrictedConfig` resource defines the settings for the
 // Prometheus component that monitors user-defined projects.
 type PrometheusRestrictedConfig struct {
+    // Configures the default interval between consecutive scrapes in case the ServiceMonitor or PodMonitor resource does not specify any value.
+    // The default value is `30s`.
+    ScrapeInterval string `json:"scrapeInterval,omitempty"`
     // Configures additional Alertmanager instances that receive alerts from
     // the Prometheus component. By default, no additional Alertmanager
     // instances are configured.

test/e2e/config_test.go (+31)

@@ -30,7 +30,9 @@ import (
     "github.com/openshift/cluster-monitoring-operator/test/e2e/framework"
     "github.com/stretchr/testify/require"
 
+    monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
     v1 "k8s.io/api/core/v1"
+
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )

@@ -585,6 +587,7 @@ func TestUserWorkloadMonitorPrometheusK8Config(t *testing.T) {
 
     uwmCM := f.BuildUserWorkloadConfigMap(t,
         fmt.Sprintf(`prometheus:
+  scrapeInterval: 15s
   enforcedTargetLimit: 10
   enforcedLabelLimit: 500
   enforcedLabelNameLengthLimit: 50

@@ -671,6 +674,10 @@ func TestUserWorkloadMonitorPrometheusK8Config(t *testing.T) {
             name:      "assert query log file value is set and correct",
             assertion: assertQueryLogValueEquals(f.UserWorkloadMonitoringNs, crName, "/tmp/test.log"),
         },
+        {
+            name:      "assert scrape interval is configured",
+            assertion: assertScrapeInterval("15s"),
+        },
     } {
         t.Run(tc.name, tc.assertion)
     }

@@ -1028,6 +1035,30 @@ func assertRemoteWriteWasSet(namespace, crName, urlValue string) func(t *testing
     }
 }
 
+func assertScrapeInterval(scrapeInterval string) func(*testing.T) {
+    ctx := context.Background()
+    return func(t *testing.T) {
+        err := framework.Poll(time.Second, 5*time.Minute, func() error {
+            p, err := f.MonitoringClient.Prometheuses(f.UserWorkloadMonitoringNs).Get(ctx, "user-workload", metav1.GetOptions{})
+            if err != nil {
+                return err
+            }
+
+            if p.Spec.ScrapeInterval == "" {
+                return errors.New("scrapeInterval is not set")
+            } else if p.Spec.ScrapeInterval != monv1.Duration(scrapeInterval) {
+                return fmt.Errorf("expected scrapeInterval to be %s, but got %s", scrapeInterval, p.Spec.ScrapeInterval)
+            }
+
+            return nil
+        })
+
+        if err != nil {
+            t.Fatalf("Timed out waiting for scrapeInterval configuration: %v", err)
+        }
+    }
+}
+
 func assertEnforcedTargetLimit(limit uint64) func(*testing.T) {
     ctx := context.Background()
     return func(t *testing.T) {