MON-4057: Expose scrapeInterval setting for UWM Prometheus

slashpai · slashpai · commit aaf668156826 · 2024-11-07T12:07:30.000+05:30
Allows configuring `scrapeInterval` between 5s and 5m
in the `openshift-user-workload-monitoring/user-workload-monitoring-config`
configmap under the `prometheus` field.

Signed-off-by: Jayapriya Pai <janantha@redhat.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Note: This CHANGELOG is only for the monitoring team to track all monitoring related changes. Please see OpenShift release notes for official changes.
 
+## 4.18
+
+- [#2503](https://github.com/openshift/cluster-monitoring-operator/issues/2503) Expose `scrapeInterval` setting for UWM Prometheus.
+
 ## 4.17
 
 - [#2409](https://github.com/openshift/cluster-monitoring-operator/issues/2409) Remove prometheus-adapter code from CMO
diff --git a/Documentation/api.md b/Documentation/api.md
@@ -484,6 +484,7 @@ The `PrometheusRestrictedConfig` resource defines the settings for the Prometheu
 
 | Property | Type | Description |
 | -------- | ---- | ----------- |
+| scrapeInterval | string | Configures the default interval between consecutive scrapes in case the ServiceMonitor or PodMonitor resource does not specify any value. The interval needs to be greater than or equal to 5 seconds and less than or equal to 5 minutes. The value can be expressed as: * seconds, for example `30s`. * minutes, for example `1m`. * a mix of minutes and seconds, for example `1m30s`. The default value is `30s`. |
 | additionalAlertmanagerConfigs | [][AdditionalAlertmanagerConfig](#additionalalertmanagerconfig) | Configures additional Alertmanager instances that receive alerts from the Prometheus component. By default, no additional Alertmanager instances are configured. |
 | enforcedLabelLimit | *uint64 | Specifies a per-scrape limit on the number of labels accepted for a sample. If the number of labels exceeds this limit after metric relabeling, the entire scrape is treated as failed. The default value is `0`, which means that no limit is set. |
 | enforcedLabelNameLengthLimit | *uint64 | Specifies a per-scrape limit on the length of a label name for a sample. If the length of a label name exceeds this limit after metric relabeling, the entire scrape is treated as failed. The default value is `0`, which means that no limit is set. |
diff --git a/Documentation/openshiftdocs/modules/prometheusrestrictedconfig.adoc b/Documentation/openshiftdocs/modules/prometheusrestrictedconfig.adoc
@@ -18,6 +18,8 @@ Appears in: link:userworkloadconfiguration.adoc[UserWorkloadConfiguration]
 [options="header"]
 |===
 | Property | Type | Description 
+|scrapeInterval|string|Configures the default interval between consecutive scrapes in case the ServiceMonitor or PodMonitor resource does not specify any value. The interval needs to be greater than or equal to 5 seconds and less than or equal to 5 minutes. The value can be expressed as: * seconds, for example `30s`. * minutes, for example `1m`. * a mix of minutes and seconds, for example `1m30s`. The default value is `30s`.
+
 |additionalAlertmanagerConfigs|[]link:additionalalertmanagerconfig.adoc[AdditionalAlertmanagerConfig]|Configures additional Alertmanager instances that receive alerts from the Prometheus component. By default, no additional Alertmanager instances are configured.
 
 |enforcedLabelLimit|*uint64|Specifies a per-scrape limit on the number of labels accepted for a sample. If the number of labels exceeds this limit after metric relabeling, the entire scrape is treated as failed. The default value is `0`, which means that no limit is set.
diff --git a/pkg/manifests/config.go b/pkg/manifests/config.go
@@ -26,6 +26,7 @@ import (
 
 	"github.com/alecthomas/units"
 	configv1 "github.com/openshift/api/config/v1"
+	"github.com/prometheus/common/model"
 	v1 "k8s.io/api/core/v1"
 	k8syaml "k8s.io/apimachinery/pkg/util/yaml"
 	auditv1 "k8s.io/apiserver/pkg/apis/audit/v1"
@@ -126,6 +127,35 @@ func (c Config) GetThanosRulerAlertmanagerConfigs() []AdditionalAlertmanagerConf
 	return alertmanagerConfigs
 }
 
+// scrapeIntervalLimits returns the lower and upper bounds (5s and 5m) that a
+// user-workload Prometheus scrape interval is validated against.
+func scrapeIntervalLimits() (model.Duration, model.Duration) {
+	// Both literals are well-formed durations, so the parse errors are discarded.
+	minInterval, _ := model.ParseDuration("5s")
+	maxInterval, _ := model.ParseDuration("5m")
+	return minInterval, maxInterval
+}
+
+// checkUserWorkloadPrometheusConfig validates the scrape interval set for the
+// user-workload Prometheus.
+//
+// It returns nil when either configuration is missing, when no Prometheus
+// section is present, or when the scrape interval is left empty. Otherwise it
+// returns an error if the value cannot be parsed as a duration or falls
+// outside the range given by scrapeIntervalLimits.
+func (c Config) checkUserWorkloadPrometheusConfig() error {
+	if c.ClusterMonitoringConfiguration == nil || c.UserWorkloadConfiguration == nil {
+		return nil
+	}
+
+	promCfg := c.UserWorkloadConfiguration.Prometheus
+	if promCfg == nil || promCfg.ScrapeInterval == "" {
+		return nil
+	}
+
+	parsed, err := model.ParseDuration(promCfg.ScrapeInterval)
+	if err != nil {
+		return fmt.Errorf("invalid scrape interval value: %w", err)
+	}
+
+	// Both bounds are inclusive: values equal to a limit are accepted.
+	minAllowed, maxAllowed := scrapeIntervalLimits()
+	if parsed >= minAllowed && parsed <= maxAllowed {
+		return nil
+	}
+	return fmt.Errorf("scrape interval value %q outside of the allowed range [%q, %q]", promCfg.ScrapeInterval, minAllowed, maxAllowed)
+}
+
 type Images struct {
 	MetricsServer                      string
 	PromLabelProxy                     string
@@ -465,6 +495,14 @@ func (c *Config) Precheck() error {
 	return nil
 }
 
+// UserWorkloadPrecheck validates the user-workload monitoring configuration
+// and returns the first validation error found, or nil when it is acceptable.
+func (c *Config) UserWorkloadPrecheck() error {
+	// Only the Prometheus settings are checked at the moment.
+	return c.checkUserWorkloadPrometheusConfig()
+}
+
 func calculateBodySizeLimit(podCapacity int) string {
 	const samplesPerPod = 400 // 400 samples per pod
 	const sizePerSample = 200 // 200 Bytes
diff --git a/pkg/manifests/config_test.go b/pkg/manifests/config_test.go
@@ -318,6 +318,68 @@ func TestLoadEnforcedBodySizeLimit(t *testing.T) {
 	}
 }
 
+// TestScrapeIntervalUWMPreCheck verifies that UserWorkloadPrecheck accepts
+// scrape intervals inside the allowed range and rejects values that are out of
+// range or cannot be parsed as a duration.
+func TestScrapeIntervalUWMPreCheck(t *testing.T) {
+	for _, tc := range []struct {
+		name          string
+		uwmconfig     string
+		expectedError bool
+	}{
+		{
+			name:          "default",
+			uwmconfig:     "",
+			expectedError: false,
+		},
+		{
+			name: "scrapeInterval valid within limits",
+			uwmconfig: `prometheus:
+  scrapeInterval: 15s
+  `,
+			expectedError: false,
+		},
+		{
+			name: "scrapeInterval valid within limits-mix-of-minutes-seconds",
+			uwmconfig: `prometheus:
+  scrapeInterval: 1m30s
+  `,
+			expectedError: false,
+		},
+		{
+			name: "scrapeInterval < allowed lower limit",
+			uwmconfig: `prometheus:
+  scrapeInterval: 2s
+  `,
+			expectedError: true,
+		},
+		{
+			name: "scrapeInterval > allowed upper limit",
+			uwmconfig: `prometheus:
+  scrapeInterval: 10m
+  `,
+			expectedError: true,
+		},
+		{
+			name: "incorrect scrape interval value",
+			uwmconfig: `prometheus:
+  scrapeInterval: 1234www
+  `,
+			expectedError: true,
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			c := NewDefaultConfig()
+			uwc, err := NewUserConfigFromString(tc.uwmconfig)
+			require.NoError(t, err)
+			c.UserWorkloadConfiguration = uwc
+			err = c.UserWorkloadPrecheck()
+			// Assert the error case explicitly: otherwise a case that expects
+			// an error would silently pass when the precheck returns nil.
+			if tc.expectedError {
+				require.Error(t, err)
+				return
+			}
+
+			require.NoError(t, err)
+		})
+	}
+}
+
 func TestCollectionProfilePreCheck(t *testing.T) {
 	for _, tc := range []struct {
 		name          string
diff --git a/pkg/manifests/manifests.go b/pkg/manifests/manifests.go
@@ -1658,6 +1658,10 @@ func (f *Factory) PrometheusUserWorkload(grpcTLS *v1.Secret) (*monv1.Prometheus,
 	if err != nil {
 		return nil, err
 	}
+	if f.config.UserWorkloadConfiguration.Prometheus.ScrapeInterval != "" {
+		p.Spec.ScrapeInterval = monv1.Duration(f.config.UserWorkloadConfiguration.Prometheus.ScrapeInterval)
+	}
+
 	if f.config.UserWorkloadConfiguration.Prometheus.LogLevel != "" {
 		p.Spec.LogLevel = f.config.UserWorkloadConfiguration.Prometheus.LogLevel
 	}
diff --git a/pkg/manifests/manifests_test.go b/pkg/manifests/manifests_test.go
@@ -1643,6 +1643,7 @@ func TestPrometheusUserWorkloadConfiguration(t *testing.T) {
 	c := NewDefaultConfig()
 
 	uwc, err := NewUserConfigFromString(`prometheus:
+  scrapeInterval: 15s
   resources:
     requests:
       cpu: 100m
@@ -1670,6 +1671,10 @@ func TestPrometheusUserWorkloadConfiguration(t *testing.T) {
 		t.Fatal(err)
 	}
 
+	if p.Spec.ScrapeInterval != "15s" {
+		t.Fatal("Prometheus UWM scrapeInterval not configured correctly")
+	}
+
 	if p.Spec.TopologySpreadConstraints[0].MaxSkew != 1 {
 		t.Fatal("Prometheus UWM spread constraints MaxSkew not configured correctly")
 	}
diff --git a/pkg/manifests/types.go b/pkg/manifests/types.go
@@ -601,6 +601,14 @@ type AlertmanagerUserWorkloadConfig struct {
 // The `PrometheusRestrictedConfig` resource defines the settings for the
 // Prometheus component that monitors user-defined projects.
 type PrometheusRestrictedConfig struct {
+	// Configures the default interval between consecutive scrapes in case the ServiceMonitor or PodMonitor resource does not specify any value.
+	// The interval needs to be greater than or equal to 5 seconds and less than or equal to 5 minutes.
+	// The value can be expressed as:
+	// * seconds, for example `30s`.
+	// * minutes, for example `1m`.
+	// * a mix of minutes and seconds, for example `1m30s`.
+	// The default value is `30s`.
+	ScrapeInterval string `json:"scrapeInterval,omitempty"`
 	// Configures additional Alertmanager instances that receive alerts from
 	// the Prometheus component. By default, no additional Alertmanager
 	// instances are configured.
diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go
@@ -1018,6 +1018,7 @@ func (o *Operator) Config(ctx context.Context, key string) (*manifests.Config, e
 	if err != nil {
 		return nil, err
 	}
+
 	err = c.Precheck()
 	if err != nil {
 		return nil, err
@@ -1034,6 +1035,11 @@ func (o *Operator) Config(ctx context.Context, key string) (*manifests.Config, e
 		}
 	}
 
+	err = c.UserWorkloadPrecheck()
+	if err != nil {
+		return nil, fmt.Errorf("%w: %w", ErrUserWorkloadInvalidConfiguration, err)
+	}
+
 	err = c.LoadEnforcedBodySizeLimit(o.client, ctx)
 	if err != nil {
 		c.ClusterMonitoringConfiguration.PrometheusK8sConfig.EnforcedBodySizeLimit = ""
diff --git a/test/e2e/config_test.go b/test/e2e/config_test.go
@@ -30,7 +30,9 @@ import (
 	"github.com/openshift/cluster-monitoring-operator/test/e2e/framework"
 	"github.com/stretchr/testify/require"
 
+	monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
 	v1 "k8s.io/api/core/v1"
+
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
@@ -585,6 +587,7 @@ func TestUserWorkloadMonitorPrometheusK8Config(t *testing.T) {
 
 	uwmCM := f.BuildUserWorkloadConfigMap(t,
 		fmt.Sprintf(`prometheus:
+  scrapeInterval: 15s
   enforcedTargetLimit: 10
   enforcedLabelLimit: 500
   enforcedLabelNameLengthLimit: 50
@@ -671,6 +674,10 @@ func TestUserWorkloadMonitorPrometheusK8Config(t *testing.T) {
 			name:      "assert query log file value is set and correct",
 			assertion: assertQueryLogValueEquals(f.UserWorkloadMonitoringNs, crName, "/tmp/test.log"),
 		},
+		{
+			name:      "assert scrape interval is configured",
+			assertion: assertScrapeInterval("15s"),
+		},
 	} {
 		t.Run(tc.name, tc.assertion)
 	}
@@ -1028,6 +1035,30 @@ func assertRemoteWriteWasSet(namespace, crName, urlValue string) func(t *testing
 	}
 }
 
+// assertScrapeInterval returns a test function that polls the "user-workload"
+// Prometheus resource in the user-workload monitoring namespace until its
+// spec.scrapeInterval equals the given value. It polls every second for up to
+// five minutes and fails the test if the value never matches.
+func assertScrapeInterval(scrapeInterval string) func(*testing.T) {
+	ctx := context.Background()
+	want := monv1.Duration(scrapeInterval)
+
+	return func(t *testing.T) {
+		check := func() error {
+			p, err := f.MonitoringClient.Prometheuses(f.UserWorkloadMonitoringNs).Get(ctx, "user-workload", metav1.GetOptions{})
+			if err != nil {
+				return err
+			}
+
+			got := p.Spec.ScrapeInterval
+			switch {
+			case got == "":
+				return errors.New("scrapeInterval is not set")
+			case got != want:
+				return fmt.Errorf("expected scrapeInterval to be %s, but got %s", scrapeInterval, got)
+			}
+			return nil
+		}
+
+		if err := framework.Poll(time.Second, 5*time.Minute, check); err != nil {
+			t.Fatalf("Timed out waiting for scrapeInterval configuration: %v", err)
+		}
+	}
+}
+
 func assertEnforcedTargetLimit(limit uint64) func(*testing.T) {
 	ctx := context.Background()
 	return func(t *testing.T) {

Original file line number	Diff line number	Diff line change
`@@ -1658,6 +1658,10 @@ func (f Factory) PrometheusUserWorkload(grpcTLS v1.Secret) (*monv1.Prometheus,`
`1658`	`1658`	`if err != nil {`
`1659`	`1659`	`return nil, err`
`1660`	`1660`	`}`
	`1661`	`+ if f.config.UserWorkloadConfiguration.Prometheus.ScrapeInterval != "" {`
	`1662`	`+ p.Spec.ScrapeInterval = monv1.Duration(f.config.UserWorkloadConfiguration.Prometheus.ScrapeInterval)`
	`1663`	`+ }`
	`1664`	`+`
`1661`	`1665`	`if f.config.UserWorkloadConfiguration.Prometheus.LogLevel != "" {`
`1662`	`1666`	`p.Spec.LogLevel = f.config.UserWorkloadConfiguration.Prometheus.LogLevel`
`1663`	`1667`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1018,6 +1018,7 @@ func (o Operator) Config(ctx context.Context, key string) (manifests.Config, e`
`1018`	`1018`	`if err != nil {`
`1019`	`1019`	`return nil, err`
`1020`	`1020`	`}`
	`1021`	`+`
`1021`	`1022`	`err = c.Precheck()`
`1022`	`1023`	`if err != nil {`
`1023`	`1024`	`return nil, err`
`@@ -1034,6 +1035,11 @@ func (o Operator) Config(ctx context.Context, key string) (manifests.Config, e`
`1034`	`1035`	`}`
`1035`	`1036`	`}`
`1036`	`1037`
	`1038`	`+ err = c.UserWorkloadPrecheck()`
	`1039`	`+ if err != nil {`
	`1040`	`+ return nil, fmt.Errorf("%w: %w", ErrUserWorkloadInvalidConfiguration, err)`
	`1041`	`+ }`
	`1042`	`+`
`1037`	`1043`	`err = c.LoadEnforcedBodySizeLimit(o.client, ctx)`
`1038`	`1044`	`if err != nil {`
`1039`	`1045`	`c.ClusterMonitoringConfiguration.PrometheusK8sConfig.EnforcedBodySizeLimit = ""`