Skip to content

Commit c3a4d77

Browse files
craig[bot] and herkolategan
committed
142206: microbench-ci: improve signal to noise ratio r=Darrylwong,srosenberg a=herkolategan This PR introduces several enhancements to the microbenchmarking process in CI. It modifies the microbenchmarks to require three consecutive runs to detect a regression, significantly reducing the chance of false positives. As a result, the total CI running time will dynamically adjust, ensuring that if a regression is detected, CI will take at most approximately 45 minutes to complete. Additionally, it adds configurable compare alpha thresholds to reduce noise during benchmark comparisons. This allows for better tuning and more accurate results. The metrics builder has also been updated to accept options for configuring these thresholds, improving flexibility. Lastly, the previous use of delta thresholds to filter out insignificant regressions has been removed. This change aims to lower the probability of false positives through alternative mechanisms. Epic: None Release note: None Co-authored-by: Herko Lategan <[email protected]>
2 parents 0ab93e3 + 03f2bcc commit c3a4d77

File tree

12 files changed

+897
-157
lines changed

12 files changed

+897
-157
lines changed

.github/workflows/microbenchmarks-ci.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ jobs:
4545
pkg: ${{ env.PACKAGE }}
4646
run-group-1:
4747
runs-on: [self-hosted, basic_microbench_runner_group]
48-
timeout-minutes: 30
48+
timeout-minutes: 60
4949
needs: [base, head]
5050
steps:
5151
- name: Checkout
@@ -58,7 +58,7 @@ jobs:
5858
group: 1
5959
run-group-2:
6060
runs-on: [self-hosted, basic_microbench_runner_group]
61-
timeout-minutes: 30
61+
timeout-minutes: 60
6262
needs: [base, head]
6363
steps:
6464
- name: Checkout

pkg/cmd/microbench-ci/benchmark.go

+10-9
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,16 @@ import (
1919

2020
type (
2121
Benchmark struct {
22-
DisplayName string `yaml:"display_name"`
23-
Package string `yaml:"package"`
24-
Labels []string `yaml:"labels"`
25-
Name string `yaml:"name"`
26-
RunnerGroup int `yaml:"runner_group"`
27-
Count int `yaml:"count"`
28-
Iterations int `yaml:"iterations"`
29-
30-
Thresholds map[string]float64 `yaml:"thresholds"`
22+
DisplayName string `yaml:"display_name"`
23+
Package string `yaml:"package"`
24+
Labels []string `yaml:"labels"`
25+
Name string `yaml:"name"`
26+
RunnerGroup int `yaml:"runner_group"`
27+
Count int `yaml:"count"`
28+
Iterations int `yaml:"iterations"`
29+
CompareAlpha float64 `yaml:"compare_alpha"`
30+
Retries int `yaml:"retries"`
31+
Metrics []string `yaml:"metrics"`
3132
}
3233
Benchmarks []Benchmark
3334
ProfileType string

pkg/cmd/microbench-ci/compare.go

+66-20
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,17 @@
66
package main
77

88
import (
9+
"bufio"
910
"bytes"
1011
"fmt"
11-
"math"
1212
"os"
1313
"path"
1414

1515
"github.com/cockroachdb/cockroach/pkg/cmd/roachprod-microbench/model"
1616
"github.com/cockroachdb/errors"
1717
"golang.org/x/exp/maps"
1818
"golang.org/x/perf/benchfmt"
19+
"golang.org/x/perf/benchmath"
1920
)
2021

2122
type (
@@ -31,11 +32,24 @@ type (
3132

3233
const (
3334
NoChange Status = iota
34-
Better
35-
Worse
36-
Regression
35+
Improved
36+
Regressed
3737
)
3838

39+
// String returns the string representation of the status.
40+
func (s Status) String() string {
41+
switch s {
42+
case NoChange:
43+
return "No Change"
44+
case Improved:
45+
return "Improved"
46+
case Regressed:
47+
return "Regressed"
48+
default:
49+
panic(fmt.Sprintf("unknown status: %d", s))
50+
}
51+
}
52+
3953
// status returns the status of a metric in the comparison.
4054
func (c *CompareResult) status(metricName string) Status {
4155
entry := c.MetricMap[metricName]
@@ -47,35 +61,36 @@ func (c *CompareResult) status(metricName string) Status {
4761
return NoChange
4862
}
4963
status := NoChange
50-
threshold := c.Benchmark.Thresholds[metricName] * 100.0
5164
if cc.Delta*float64(entry.Better) > 0 {
52-
status = Better
65+
status = Improved
5366
} else if cc.Delta*float64(entry.Better) < 0 {
54-
status = Worse
55-
if math.Abs(cc.Delta) >= threshold {
56-
status = Regression
57-
}
67+
status = Regressed
5868
}
5969
return status
6070
}
6171

62-
// regressed returns true if any metric in the comparison has regressed.
63-
func (c *CompareResult) regressed() bool {
72+
// top returns the top status of all metrics in the comparison.
73+
func (c *CompareResult) top() Status {
74+
topStatus := NoChange
6475
for metric := range c.MetricMap {
6576
status := c.status(metric)
66-
if status == Regression {
67-
return true
77+
if status > topStatus {
78+
topStatus = status
6879
}
6980
}
70-
return false
81+
return topStatus
7182
}
7283

73-
// compare compares the metrics of a benchmark between two revisions.
74-
func (b *Benchmark) compare() (*CompareResult, error) {
75-
builder := model.NewBuilder()
84+
// compare compares the metrics of a benchmark between two revisions. Only the
85+
// specified last number of lines of the benchmark logs are considered. If lines
86+
// is 0, it considers the entire logs.
87+
func (b *Benchmark) compare(lines int) (*CompareResult, error) {
88+
builder := model.NewBuilder(model.WithThresholds(&benchmath.Thresholds{
89+
CompareAlpha: b.CompareAlpha,
90+
}))
7691
compareResult := CompareResult{Benchmark: b}
7792
for _, revision := range []Revision{Old, New} {
78-
data, err := os.ReadFile(path.Join(suite.artifactsDir(revision), b.cleanLog()))
93+
data, err := logTail(path.Join(suite.artifactsDir(revision), b.cleanLog()), lines)
7994
if err != nil {
8095
return nil, err
8196
}
@@ -110,11 +125,42 @@ func (b *Benchmark) compare() (*CompareResult, error) {
110125
func (b Benchmarks) compareBenchmarks() (CompareResults, error) {
111126
compareResults := make(CompareResults, 0, len(b))
112127
for _, benchmark := range b {
113-
compareResult, err := benchmark.compare()
128+
compareResult, err := benchmark.compare(0)
114129
if err != nil {
115130
return nil, err
116131
}
117132
compareResults = append(compareResults, compareResult)
118133
}
119134
return compareResults, nil
120135
}
136+
137+
// logTail returns the last N lines of a file.
138+
// If N is 0, it returns the entire file.
139+
func logTail(filePath string, N int) ([]byte, error) {
140+
if N == 0 {
141+
return os.ReadFile(filePath)
142+
}
143+
file, err := os.Open(filePath)
144+
if err != nil {
145+
return nil, err
146+
}
147+
defer file.Close()
148+
149+
lines := make([]string, 0, N)
150+
scanner := bufio.NewScanner(file)
151+
for scanner.Scan() {
152+
lines = append(lines, scanner.Text())
153+
if len(lines) > N {
154+
lines = lines[1:]
155+
}
156+
}
157+
if err := scanner.Err(); err != nil {
158+
return nil, err
159+
}
160+
161+
var buffer bytes.Buffer
162+
for _, line := range lines {
163+
buffer.WriteString(line + "\n")
164+
}
165+
return buffer.Bytes(), nil
166+
}

pkg/cmd/microbench-ci/config/pull-request-suite.yml

+21-18
Original file line numberDiff line numberDiff line change
@@ -4,33 +4,36 @@ benchmarks:
44
name: "BenchmarkSysbench/SQL/3node/oltp_read_write"
55
package: "pkg/sql/tests"
66
runner_group: 1
7-
count: 10
8-
iterations: 3000
9-
thresholds:
10-
"sec/op": .03
11-
"B/op": .02
12-
"allocs/op": .02
7+
count: 15
8+
iterations: 1500
9+
compare_alpha: 0.025
10+
retries: 3
11+
metrics:
12+
- "sec/op"
13+
- "allocs/op"
1314

1415
- display_name: Sysbench
1516
labels: ["KV", "1node", "local", "oltp_read_only"]
1617
name: "BenchmarkSysbench/KV/1node_local/oltp_read_only"
1718
package: "pkg/sql/tests"
1819
runner_group: 2
19-
count: 10
20-
iterations: 12000
21-
thresholds:
22-
"sec/op": .02
23-
"B/op": .015
24-
"allocs/op": .015
20+
count: 20
21+
iterations: 6000
22+
compare_alpha: 0.025
23+
retries: 3
24+
metrics:
25+
- "sec/op"
26+
- "allocs/op"
2527

2628
- display_name: Sysbench
2729
labels: ["KV", "1node", "local", "oltp_write_only"]
2830
name: "BenchmarkSysbench/KV/1node_local/oltp_write_only"
2931
package: "pkg/sql/tests"
3032
runner_group: 2
31-
count: 10
32-
iterations: 12000
33-
thresholds:
34-
"sec/op": .025
35-
"B/op": .0175
36-
"allocs/op": .0175
33+
count: 20
34+
iterations: 6000
35+
compare_alpha: 0.025
36+
retries: 3
37+
metrics:
38+
- "sec/op"
39+
- "allocs/op"

pkg/cmd/microbench-ci/report.go

+32-14
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ import (
1212
"log"
1313
"math"
1414
"os"
15+
"regexp"
1516
"sort"
17+
"strconv"
1618
"strings"
1719
"text/template"
1820

@@ -57,14 +59,18 @@ func (c *CompareResult) generateSummaryData(
5759
statusTemplateFunc func(status Status) string,
5860
) []SummaryData {
5961
summaryData := make([]SummaryData, 0, len(c.MetricMap))
60-
for metricName, entry := range c.MetricMap {
62+
for _, metricName := range c.Benchmark.Metrics {
63+
entry := c.MetricMap[metricName]
64+
if entry == nil {
65+
log.Printf("WARN: no metric found for benchmark metric %q", metricName)
66+
continue
67+
}
6168
benchmark := entry.BenchmarkEntries[c.EntryName]
6269
cc := entry.ComputeComparison(c.EntryName, string(Old), string(New))
6370
if cc == nil {
6471
log.Printf("WARN: no comparison found for benchmark metric %q:%q", c.EntryName, metricName)
6572
continue
6673
}
67-
threshold := c.Benchmark.Thresholds[metricName] * 100.0
6874
status := statusTemplateFunc(c.status(metricName))
6975
oldSum := benchmark.Summaries[string(Old)]
7076
newSum := benchmark.Summaries[string(New)]
@@ -74,7 +80,6 @@ func (c *CompareResult) generateSummaryData(
7480
NewCenter: fmt.Sprintf("%s ±%s", formatValue(newSum.Center, metricName), newSum.PctRangeString()),
7581
Delta: cc.FormattedDelta,
7682
Note: cc.Distribution.String(),
77-
Threshold: fmt.Sprintf("%.1f%%", threshold),
7883
Status: status,
7984
})
8085
}
@@ -112,13 +117,6 @@ func (c *CompareResult) benchdiffData() BenchdiffData {
112117
// writeJSONSummary writes a JSON summary of the comparison results to the given
113118
// path.
114119
func (c CompareResults) writeJSONSummary(path string) error {
115-
file, err := os.Create(path)
116-
if err != nil {
117-
return err
118-
}
119-
defer file.Close()
120-
encoder := json.NewEncoder(file)
121-
encoder.SetIndent("", " ")
122120
type (
123121
Data struct {
124122
Metric string
@@ -161,13 +159,20 @@ func (c CompareResults) writeJSONSummary(path string) error {
161159
Data: data,
162160
}
163161
}
164-
return encoder.Encode(struct {
162+
163+
jsonData, err := json.MarshalIndent(struct {
165164
Entries []Entry
166165
Revisions Revisions
167166
}{
168167
Entries: entries,
169168
Revisions: suite.Revisions,
170-
})
169+
}, "", " ")
170+
if err != nil {
171+
return err
172+
}
173+
174+
formattedData := formatFloats(jsonData, 5)
175+
return os.WriteFile(path, formattedData, 0644)
171176
}
172177

173178
// writeGitHubSummary writes a markdown summary of the comparison results to the
@@ -192,7 +197,7 @@ func (c CompareResults) writeGitHubSummary(path string) error {
192197
if status > finalStatus {
193198
finalStatus = status
194199
}
195-
if status == Regression {
200+
if status == Regressed {
196201
regressionDetected = true
197202
}
198203
return statusToDot(status)
@@ -227,11 +232,24 @@ func (c CompareResults) writeGitHubSummary(path string) error {
227232
}
228233

229234
func statusToDot(status Status) string {
230-
return string([]rune("⚪🟢🟡🔴")[status])
235+
return string([]rune("⚪🟢🔴")[status])
231236
}
232237

233238
// formatValue formats a value according to the unit of the metric.
234239
func formatValue(val float64, metric string) string {
235240
cls := benchunit.ClassOf(metric)
236241
return benchunit.Scale(val, cls)
237242
}
243+
244+
// formatFloats formats all floating point numbers in the JSON data to the given
245+
// precision.
246+
func formatFloats(jsonData []byte, precision int) []byte {
247+
re := regexp.MustCompile(`\d+\.\d+`)
248+
return re.ReplaceAllFunc(jsonData, func(match []byte) []byte {
249+
f, err := strconv.ParseFloat(string(match), 64)
250+
if err != nil {
251+
return match
252+
}
253+
return []byte(strconv.FormatFloat(f, 'f', precision, 64))
254+
})
255+
}

0 commit comments

Comments (0)