diff --git a/.travis.yml b/.travis.yml index 409a5b6..e334612 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,6 @@ language: go go: - - "1.3" - "1.4" - "1.5" - "1.6" diff --git a/ewma_test.go b/ewma_test.go index 058ae2d..ab68c4d 100644 --- a/ewma_test.go +++ b/ewma_test.go @@ -48,67 +48,67 @@ func TestEWMA1(t *testing.T) { a := NewEWMA1() a.Update(3) a.Tick() - if rate := a.Rate(); 0.6 != rate { + if rate := a.Rate(); float64NotEqual(0.6, rate) { t.Errorf("initial a.Rate(): 0.6 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.22072766470286553 != rate { + if rate := a.Rate(); float64NotEqual(0.22072766470286553, rate) { t.Errorf("1 minute a.Rate(): 0.22072766470286553 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.08120116994196772 != rate { + if rate := a.Rate(); float64NotEqual(0.08120116994196772, rate) { t.Errorf("2 minute a.Rate(): 0.08120116994196772 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.029872241020718428 != rate { + if rate := a.Rate(); float64NotEqual(0.029872241020718428, rate) { t.Errorf("3 minute a.Rate(): 0.029872241020718428 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.01098938333324054 != rate { + if rate := a.Rate(); float64NotEqual(0.01098938333324054, rate) { t.Errorf("4 minute a.Rate(): 0.01098938333324054 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.004042768199451294 != rate { + if rate := a.Rate(); float64NotEqual(0.004042768199451294, rate) { t.Errorf("5 minute a.Rate(): 0.004042768199451294 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.0014872513059998212 != rate { + if rate := a.Rate(); float64NotEqual(0.0014872513059998212, rate) { t.Errorf("6 minute a.Rate(): 0.0014872513059998212 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.0005471291793327122 != rate { + if rate := a.Rate(); float64NotEqual(0.0005471291793327122, rate) { t.Errorf("7 minute a.Rate(): 0.0005471291793327122 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 
0.00020127757674150815 != rate { + if rate := a.Rate(); float64NotEqual(0.00020127757674150815, rate) { t.Errorf("8 minute a.Rate(): 0.00020127757674150815 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 7.404588245200814e-05 != rate { + if rate := a.Rate(); float64NotEqual(7.404588245200814e-05, rate) { t.Errorf("9 minute a.Rate(): 7.404588245200814e-05 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 2.7239957857491083e-05 != rate { + if rate := a.Rate(); float64NotEqual(2.7239957857491083e-05, rate) { t.Errorf("10 minute a.Rate(): 2.7239957857491083e-05 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 1.0021020474147462e-05 != rate { + if rate := a.Rate(); float64NotEqual(1.0021020474147462e-05, rate) { t.Errorf("11 minute a.Rate(): 1.0021020474147462e-05 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 3.6865274119969525e-06 != rate { + if rate := a.Rate(); float64NotEqual(3.6865274119969525e-06, rate) { t.Errorf("12 minute a.Rate(): 3.6865274119969525e-06 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 1.3561976441886433e-06 != rate { + if rate := a.Rate(); float64NotEqual(1.3561976441886433e-06, rate) { t.Errorf("13 minute a.Rate(): 1.3561976441886433e-06 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 4.989172314621449e-07 != rate { + if rate := a.Rate(); float64NotEqual(4.989172314621449e-07, rate) { t.Errorf("14 minute a.Rate(): 4.989172314621449e-07 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 1.8354139230109722e-07 != rate { + if rate := a.Rate(); float64NotEqual(1.8354139230109722e-07, rate) { t.Errorf("15 minute a.Rate(): 1.8354139230109722e-07 != %v\n", rate) } } @@ -117,67 +117,67 @@ func TestEWMA5(t *testing.T) { a := NewEWMA5() a.Update(3) a.Tick() - if rate := a.Rate(); 0.6 != rate { + if rate := a.Rate(); float64NotEqual(0.6, rate) { t.Errorf("initial a.Rate(): 0.6 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.49123845184678905 != rate { + if rate := a.Rate(); 
float64NotEqual(0.49123845184678905, rate) { t.Errorf("1 minute a.Rate(): 0.49123845184678905 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.4021920276213837 != rate { + if rate := a.Rate(); float64NotEqual(0.4021920276213837, rate) { t.Errorf("2 minute a.Rate(): 0.4021920276213837 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.32928698165641596 != rate { + if rate := a.Rate(); float64NotEqual(0.32928698165641596, rate) { t.Errorf("3 minute a.Rate(): 0.32928698165641596 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.269597378470333 != rate { + if rate := a.Rate(); float64NotEqual(0.269597378470333, rate) { t.Errorf("4 minute a.Rate(): 0.269597378470333 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.2207276647028654 != rate { + if rate := a.Rate(); float64NotEqual(0.2207276647028654, rate) { t.Errorf("5 minute a.Rate(): 0.2207276647028654 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.18071652714732128 != rate { + if rate := a.Rate(); float64NotEqual(0.18071652714732128, rate) { t.Errorf("6 minute a.Rate(): 0.18071652714732128 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.14795817836496392 != rate { + if rate := a.Rate(); float64NotEqual(0.14795817836496392, rate) { t.Errorf("7 minute a.Rate(): 0.14795817836496392 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.12113791079679326 != rate { + if rate := a.Rate(); float64NotEqual(0.12113791079679326, rate) { t.Errorf("8 minute a.Rate(): 0.12113791079679326 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.09917933293295193 != rate { + if rate := a.Rate(); float64NotEqual(0.09917933293295193, rate) { t.Errorf("9 minute a.Rate(): 0.09917933293295193 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.08120116994196763 != rate { + if rate := a.Rate(); float64NotEqual(0.08120116994196763, rate) { t.Errorf("10 minute a.Rate(): 0.08120116994196763 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.06648189501740036 != 
rate { + if rate := a.Rate(); float64NotEqual(0.06648189501740036, rate) { t.Errorf("11 minute a.Rate(): 0.06648189501740036 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.05443077197364752 != rate { + if rate := a.Rate(); float64NotEqual(0.05443077197364752, rate) { t.Errorf("12 minute a.Rate(): 0.05443077197364752 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.04456414692860035 != rate { + if rate := a.Rate(); float64NotEqual(0.04456414692860035, rate) { t.Errorf("13 minute a.Rate(): 0.04456414692860035 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.03648603757513079 != rate { + if rate := a.Rate(); float64NotEqual(0.03648603757513079, rate) { t.Errorf("14 minute a.Rate(): 0.03648603757513079 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.0298722410207183831020718428 != rate { + if rate := a.Rate(); float64NotEqual(0.0298722410207183831020718428, rate) { t.Errorf("15 minute a.Rate(): 0.0298722410207183831020718428 != %v\n", rate) } } @@ -186,67 +186,67 @@ func TestEWMA15(t *testing.T) { a := NewEWMA15() a.Update(3) a.Tick() - if rate := a.Rate(); 0.6 != rate { + if rate := a.Rate(); float64NotEqual(0.6, rate) { t.Errorf("initial a.Rate(): 0.6 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.5613041910189706 != rate { + if rate := a.Rate(); float64NotEqual(0.5613041910189706, rate) { t.Errorf("1 minute a.Rate(): 0.5613041910189706 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.5251039914257684 != rate { + if rate := a.Rate(); float64NotEqual(0.5251039914257684, rate) { t.Errorf("2 minute a.Rate(): 0.5251039914257684 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.4912384518467888184678905 != rate { + if rate := a.Rate(); float64NotEqual(0.4912384518467888184678905, rate) { t.Errorf("3 minute a.Rate(): 0.4912384518467888184678905 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.459557003018789 != rate { + if rate := a.Rate(); float64NotEqual(0.459557003018789, rate) { 
t.Errorf("4 minute a.Rate(): 0.459557003018789 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.4299187863442732 != rate { + if rate := a.Rate(); float64NotEqual(0.4299187863442732, rate) { t.Errorf("5 minute a.Rate(): 0.4299187863442732 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.4021920276213831 != rate { + if rate := a.Rate(); float64NotEqual(0.4021920276213831, rate) { t.Errorf("6 minute a.Rate(): 0.4021920276213831 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.37625345116383313 != rate { + if rate := a.Rate(); float64NotEqual(0.37625345116383313, rate) { t.Errorf("7 minute a.Rate(): 0.37625345116383313 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.3519877317060185 != rate { + if rate := a.Rate(); float64NotEqual(0.3519877317060185, rate) { t.Errorf("8 minute a.Rate(): 0.3519877317060185 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.3292869816564153165641596 != rate { + if rate := a.Rate(); float64NotEqual(0.3292869816564153165641596, rate) { t.Errorf("9 minute a.Rate(): 0.3292869816564153165641596 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.3080502714195546 != rate { + if rate := a.Rate(); float64NotEqual(0.3080502714195546, rate) { t.Errorf("10 minute a.Rate(): 0.3080502714195546 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.2881831806538789 != rate { + if rate := a.Rate(); float64NotEqual(0.2881831806538789, rate) { t.Errorf("11 minute a.Rate(): 0.2881831806538789 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.26959737847033216 != rate { + if rate := a.Rate(); float64NotEqual(0.26959737847033216, rate) { t.Errorf("12 minute a.Rate(): 0.26959737847033216 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.2522102307052083 != rate { + if rate := a.Rate(); float64NotEqual(0.2522102307052083, rate) { t.Errorf("13 minute a.Rate(): 0.2522102307052083 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.23594443252115815 != rate { + if rate := 
a.Rate(); float64NotEqual(0.23594443252115815, rate) { t.Errorf("14 minute a.Rate(): 0.23594443252115815 != %v\n", rate) } elapseMinute(a) - if rate := a.Rate(); 0.2207276647028646247028654470286553 != rate { + if rate := a.Rate(); float64NotEqual(0.2207276647028646247028654470286553, rate) { t.Errorf("15 minute a.Rate(): 0.2207276647028646247028654470286553 != %v\n", rate) } } diff --git a/gauge_float64_test.go b/gauge_float64_test.go index 6769b95..d5b601b 100644 --- a/gauge_float64_test.go +++ b/gauge_float64_test.go @@ -23,7 +23,7 @@ func BenchmarkGuageFloat64Parallel(b *testing.B) { func TestGaugeFloat64(t *testing.T) { g := NewGaugeFloat64() g.Update(float64(47.0)) - if v := g.Value(); float64(47.0) != v { + if v := g.Value(); float64NotEqual(float64(47.0), v) { t.Errorf("g.Value(): 47.0 != %v\n", v) } } @@ -33,7 +33,7 @@ func TestGaugeFloat64Snapshot(t *testing.T) { g.Update(float64(47.0)) snapshot := g.Snapshot() g.Update(float64(0)) - if v := snapshot.Value(); float64(47.0) != v { + if v := snapshot.Value(); float64NotEqual(float64(47.0), v) { t.Errorf("g.Value(): 47.0 != %v\n", v) } } @@ -42,7 +42,7 @@ func TestGetOrRegisterGaugeFloat64(t *testing.T) { r := NewRegistry() NewRegisteredGaugeFloat64("foo", r).Update(float64(47.0)) t.Logf("registry: %v", r) - if g := GetOrRegisterGaugeFloat64("foo", r); float64(47.0) != g.Value() { + if g := GetOrRegisterGaugeFloat64("foo", r); float64NotEqual(float64(47.0), g.Value()) { t.Fatal(g) } } @@ -63,7 +63,7 @@ func TestFunctionalGaugeFloat64(t *testing.T) { func TestGetOrRegisterFunctionalGaugeFloat64(t *testing.T) { r := NewRegistry() NewRegisteredFunctionalGaugeFloat64("foo", r, func() float64 { return 47 }) - if g := GetOrRegisterGaugeFloat64("foo", r); 47 != g.Value() { + if g := GetOrRegisterGaugeFloat64("foo", r); float64NotEqual(47, g.Value()) { t.Fatal(g) } } diff --git a/histogram_test.go b/histogram_test.go index d7f4f01..f2a2b00 100644 --- a/histogram_test.go +++ b/histogram_test.go @@ -38,20 +38,20 @@ 
func TestHistogramEmpty(t *testing.T) { if max := h.Max(); 0 != max { t.Errorf("h.Max(): 0 != %v\n", max) } - if mean := h.Mean(); 0.0 != mean { + if mean := h.Mean(); float64NotEqual(0.0, mean) { t.Errorf("h.Mean(): 0.0 != %v\n", mean) } - if stdDev := h.StdDev(); 0.0 != stdDev { + if stdDev := h.StdDev(); float64NotEqual(0.0, stdDev) { t.Errorf("h.StdDev(): 0.0 != %v\n", stdDev) } ps := h.Percentiles([]float64{0.5, 0.75, 0.99}) - if 0.0 != ps[0] { + if float64NotEqual(0.0, ps[0]) { t.Errorf("median: 0.0 != %v\n", ps[0]) } - if 0.0 != ps[1] { + if float64NotEqual(0.0, ps[1]) { t.Errorf("75th percentile: 0.0 != %v\n", ps[1]) } - if 0.0 != ps[2] { + if float64NotEqual(0.0, ps[2]) { t.Errorf("99th percentile: 0.0 != %v\n", ps[2]) } } @@ -76,20 +76,20 @@ func testHistogram10000(t *testing.T, h Histogram) { if max := h.Max(); 10000 != max { t.Errorf("h.Max(): 10000 != %v\n", max) } - if mean := h.Mean(); 5000.5 != mean { + if mean := h.Mean(); float64NotEqual(5000.5, mean) { t.Errorf("h.Mean(): 5000.5 != %v\n", mean) } - if stdDev := h.StdDev(); 2886.751331514372 != stdDev { + if stdDev := h.StdDev(); float64NotEqual(2886.751331514372, stdDev) { t.Errorf("h.StdDev(): 2886.751331514372 != %v\n", stdDev) } ps := h.Percentiles([]float64{0.5, 0.75, 0.99}) - if 5000.5 != ps[0] { + if float64NotEqual(5000.5, ps[0]) { t.Errorf("median: 5000.5 != %v\n", ps[0]) } - if 7500.75 != ps[1] { + if float64NotEqual(7500.75, ps[1]) { t.Errorf("75th percentile: 7500.75 != %v\n", ps[1]) } - if 9900.99 != ps[2] { + if float64NotEqual(9900.99, ps[2]) { t.Errorf("99th percentile: 9900.99 != %v\n", ps[2]) } } diff --git a/sample.go b/sample.go index fecee5e..b566392 100644 --- a/sample.go +++ b/sample.go @@ -6,10 +6,20 @@ import ( "sort" "sync" "time" + + "golang.org/x/sys/cpu" ) const rescaleThreshold = time.Hour +var x86HasSSE42, x86HasAVX, x86HasAVX2 bool + +func init() { + x86HasSSE42 = cpu.X86.HasSSE42 + x86HasAVX = cpu.X86.HasAVX + x86HasAVX2 = cpu.X86.HasAVX2 +} + // Samples maintain a 
statistically-significant selection of values from // a stream. type Sample interface { @@ -233,18 +243,8 @@ func (NilSample) Values() []int64 { return []int64{} } func (NilSample) Variance() float64 { return 0.0 } // SampleMax returns the maximum value of the slice of int64. -func SampleMax(values []int64) int64 { - if 0 == len(values) { - return 0 - } - var max int64 = math.MinInt64 - for _, v := range values { - if max < v { - max = v - } - } - return max -} +//go:noescape +func SampleMax(values []int64) int64 // SampleMean returns the mean value of the slice of int64. func SampleMean(values []int64) float64 { @@ -255,18 +255,8 @@ func SampleMean(values []int64) float64 { } // SampleMin returns the minimum value of the slice of int64. -func SampleMin(values []int64) int64 { - if 0 == len(values) { - return 0 - } - var min int64 = math.MaxInt64 - for _, v := range values { - if min > v { - min = v - } - } - return min -} +//go:noescape +func SampleMin(values []int64) int64 // SamplePercentiles returns an arbitrary percentile of the slice of int64. func SamplePercentile(values int64Slice, p float64) float64 { @@ -372,27 +362,12 @@ func SampleStdDev(values []int64) float64 { } // SampleSum returns the sum of the slice of int64. -func SampleSum(values []int64) int64 { - var sum int64 - for _, v := range values { - sum += v - } - return sum -} +//go:noescape +func SampleSum(values []int64) int64 // SampleVariance returns the variance of the slice of int64. -func SampleVariance(values []int64) float64 { - if 0 == len(values) { - return 0.0 - } - m := SampleMean(values) - var sum float64 - for _, v := range values { - d := float64(v) - m - sum += d * d - } - return sum / float64(len(values)) -} +//go:noescape +func SampleVariance(values []int64) float64 // A uniform sample using Vitter's Algorithm R. 
// @@ -614,3 +589,54 @@ type int64Slice []int64 func (p int64Slice) Len() int { return len(p) } func (p int64Slice) Less(i, j int) bool { return p[i] < p[j] } func (p int64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } + +// sampleMax returns the maximum value of the slice of int64. +func sampleMax(values []int64) int64 { + if 0 == len(values) { + return 0 + } + var max int64 = math.MinInt64 + for _, v := range values { + if max < v { + max = v + } + } + return max +} + +// sampleMin returns the minimum value of the slice of int64. +func sampleMin(values []int64) int64 { + if 0 == len(values) { + return 0 + } + var min int64 = math.MaxInt64 + for _, v := range values { + if min > v { + min = v + } + } + return min +} + +// sampleSum returns the sum of the slice of int64. +func sampleSum(values []int64) int64 { + var sum int64 + for _, v := range values { + sum += v + } + return sum +} + +// sampleVariance returns the variance of the slice of int64. +func sampleVariance(values []int64) float64 { + if 0 == len(values) { + return 0.0 + } + m := SampleMean(values) + var sum float64 + for _, v := range values { + d := float64(v) - m + sum += d * d + } + return sum / float64(len(values)) +} diff --git a/sample_amd64.s b/sample_amd64.s new file mode 100644 index 0000000..f432f25 --- /dev/null +++ b/sample_amd64.s @@ -0,0 +1,1952 @@ +// +build !gccgo + +#include "textflag.h" + +TEXT ·SampleSum(SB), NOSPLIT, $0-32 + CMPB ·x86HasAVX2(SB), $0 + JNE HasAVX2 + CMPB ·x86HasAVX(SB), $0 + JNE HasAVX + CMPB ·x86HasSSE42(SB), $0 + JNE HasSSE42 + JMP ·sampleSum(SB) + +HasAVX2: + JMP sampleSumAVX2<>(SB) + +HasAVX: + JMP sampleSumAVX<>(SB) + +HasSSE42: + JMP sampleSumSSE42<>(SB) + +TEXT ·SampleVariance(SB), NOSPLIT, $0-32 + CMPB ·x86HasAVX2(SB), $0 + JNE HasAVX2 + CMPB ·x86HasAVX(SB), $0 + JNE HasAVX + CMPB ·x86HasSSE42(SB), $0 + JNE HasSSE42 + JMP ·sampleVariance(SB) + +HasAVX2: + JMP sampleVarianceAVX2<>(SB) + +HasAVX: + JMP sampleVarianceAVX<>(SB) + +HasSSE42: + JMP 
sampleVarianceSSE42<>(SB) + +TEXT ·SampleMin(SB), NOSPLIT, $0-32 + CMPB ·x86HasAVX2(SB), $0 + JNE HasAVX2 + CMPB ·x86HasAVX(SB), $0 + JNE HasAVX + CMPB ·x86HasSSE42(SB), $0 + JNE HasSSE42 + JMP ·sampleMin(SB) + +HasAVX2: + JMP sampleMinAVX2<>(SB) + +HasAVX: + JMP sampleMinAVX<>(SB) + +HasSSE42: + JMP sampleMinSSE42<>(SB) + +TEXT ·SampleMax(SB), NOSPLIT, $0-32 + CMPB ·x86HasAVX2(SB), $0 + JNE HasAVX2 + CMPB ·x86HasAVX(SB), $0 + JNE HasAVX + CMPB ·x86HasSSE42(SB), $0 + JNE HasSSE42 + JMP ·sampleMax(SB) + +HasAVX2: + JMP sampleMaxAVX2<>(SB) + +HasAVX: + JMP sampleMaxAVX<>(SB) + +HasSSE42: + JMP sampleMaxSSE42<>(SB) + +TEXT sampleSumAVX2<>(SB), NOSPLIT, $0-32 + + MOVQ addr+0(FP), DI + MOVQ len+8(FP), SI + MOVQ cap+16(FP), DX + + WORD $0x8548; BYTE $0xf6 // test rsi, rsi + JLE LBB0_1 + LONG $0x10fe8348 // cmp rsi, 16 + JAE LBB0_4 + WORD $0xc931 // xor ecx, ecx + WORD $0xc031 // xor eax, eax + JMP LBB0_11 + +LBB0_1: + WORD $0xc031 // xor eax, eax + MOVQ AX, x+24(FP) + RET + +LBB0_4: + WORD $0x8948; BYTE $0xf1 // mov rcx, rsi + LONG $0xf0e18348 // and rcx, -16 + LONG $0xf0518d48 // lea rdx, [rcx - 16] + WORD $0x8948; BYTE $0xd0 // mov rax, rdx + LONG $0x04e8c148 // shr rax, 4 + LONG $0x01c08348 // add rax, 1 + WORD $0x8941; BYTE $0xc0 // mov r8d, eax + LONG $0x01e08341 // and r8d, 1 + WORD $0x8548; BYTE $0xd2 // test rdx, rdx + JE LBB0_5 + LONG $0x000001ba; BYTE $0x00 // mov edx, 1 + WORD $0x2948; BYTE $0xc2 // sub rdx, rax + LONG $0x10048d49 // lea rax, [r8 + rdx] + LONG $0xffc08348 // add rax, -1 + LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 + WORD $0xd231 // xor edx, edx + LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 + LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 + LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 + +LBB0_7: + LONG $0x04d4fdc5; BYTE $0xd7 // vpaddq ymm0, ymm0, yword [rdi + 8*rdx] + LONG $0x4cd4f5c5; WORD $0x20d7 // vpaddq ymm1, ymm1, yword [rdi + 8*rdx + 32] + LONG $0x54d4edc5; WORD $0x40d7 // vpaddq ymm2, ymm2, yword [rdi + 8*rdx + 64] + LONG $0x5cd4e5c5; WORD 
$0x60d7 // vpaddq ymm3, ymm3, yword [rdi + 8*rdx + 96] + QUAD $0x000080d784d4fdc5; BYTE $0x00 // vpaddq ymm0, ymm0, yword [rdi + 8*rdx + 128] + QUAD $0x0000a0d78cd4f5c5; BYTE $0x00 // vpaddq ymm1, ymm1, yword [rdi + 8*rdx + 160] + QUAD $0x0000c0d794d4edc5; BYTE $0x00 // vpaddq ymm2, ymm2, yword [rdi + 8*rdx + 192] + QUAD $0x0000e0d79cd4e5c5; BYTE $0x00 // vpaddq ymm3, ymm3, yword [rdi + 8*rdx + 224] + LONG $0x20c28348 // add rdx, 32 + LONG $0x02c08348 // add rax, 2 + JNE LBB0_7 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JE LBB0_10 + +LBB0_9: + LONG $0x5cd4e5c5; WORD $0x60d7 // vpaddq ymm3, ymm3, yword [rdi + 8*rdx + 96] + LONG $0x54d4edc5; WORD $0x40d7 // vpaddq ymm2, ymm2, yword [rdi + 8*rdx + 64] + LONG $0x4cd4f5c5; WORD $0x20d7 // vpaddq ymm1, ymm1, yword [rdi + 8*rdx + 32] + LONG $0x04d4fdc5; BYTE $0xd7 // vpaddq ymm0, ymm0, yword [rdi + 8*rdx] + +LBB0_10: + LONG $0xcbd4f5c5 // vpaddq ymm1, ymm1, ymm3 + LONG $0xc2d4fdc5 // vpaddq ymm0, ymm0, ymm2 + LONG $0xc1d4fdc5 // vpaddq ymm0, ymm0, ymm1 + LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 + LONG $0xc1d4fdc5 // vpaddq ymm0, ymm0, ymm1 + LONG $0xc870f9c5; BYTE $0x4e // vpshufd xmm1, xmm0, 78 + LONG $0xc1d4fdc5 // vpaddq ymm0, ymm0, ymm1 + LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 + WORD $0x3948; BYTE $0xf1 // cmp rcx, rsi + JE LBB0_12 + +LBB0_11: + LONG $0xcf040348 // add rax, qword [rdi + 8*rcx] + LONG $0x01c18348 // add rcx, 1 + WORD $0x3948; BYTE $0xce // cmp rsi, rcx + JNE LBB0_11 + +LBB0_12: + BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER + MOVQ AX, x+24(FP) + RET + +LBB0_5: + LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 + WORD $0xd231 // xor edx, edx + LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 + LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 + LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JNE LBB0_9 + JMP LBB0_10 + +TEXT sampleSumAVX<>(SB), NOSPLIT, $0-32 + + MOVQ addr+0(FP), DI + MOVQ len+8(FP), SI + MOVQ cap+16(FP), DX + + WORD $0x8548; BYTE 
$0xf6 // test rsi, rsi + JLE LBB0_1 + LONG $0x10fe8348 // cmp rsi, 16 + JAE LBB0_4 + WORD $0xc931 // xor ecx, ecx + WORD $0xc031 // xor eax, eax + JMP LBB0_11 + +LBB0_1: + WORD $0xc031 // xor eax, eax + MOVQ AX, x+24(FP) + RET + +LBB0_4: + WORD $0x8948; BYTE $0xf1 // mov rcx, rsi + LONG $0xf0e18348 // and rcx, -16 + LONG $0xf0518d48 // lea rdx, [rcx - 16] + WORD $0x8948; BYTE $0xd0 // mov rax, rdx + LONG $0x04e8c148 // shr rax, 4 + LONG $0x01c08348 // add rax, 1 + WORD $0x8941; BYTE $0xc0 // mov r8d, eax + LONG $0x01e08341 // and r8d, 1 + WORD $0x8548; BYTE $0xd2 // test rdx, rdx + JE LBB0_5 + LONG $0x000001ba; BYTE $0x00 // mov edx, 1 + WORD $0x2948; BYTE $0xc2 // sub rdx, rax + LONG $0x10048d49 // lea rax, [r8 + rdx] + LONG $0xffc08348 // add rax, -1 + LONG $0xef3941c4; BYTE $0xc0 // vpxor xmm8, xmm8, xmm8 + WORD $0xd231 // xor edx, edx + LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 + LONG $0xef3141c4; BYTE $0xc9 // vpxor xmm9, xmm9, xmm9 + LONG $0xef2941c4; BYTE $0xd2 // vpxor xmm10, xmm10, xmm10 + +LBB0_7: + LONG $0x246ffec5; BYTE $0xd7 // vmovdqu ymm4, yword [rdi + 8*rdx] + LONG $0x6c6ffec5; WORD $0x20d7 // vmovdqu ymm5, yword [rdi + 8*rdx + 32] + LONG $0x746ffec5; WORD $0x40d7 // vmovdqu ymm6, yword [rdi + 8*rdx + 64] + LONG $0x7c6ffec5; WORD $0x60d7 // vmovdqu ymm7, yword [rdi + 8*rdx + 96] + LONG $0xd45941c4; BYTE $0xd8 // vpaddq xmm11, xmm4, xmm8 + LONG $0x197de3c4; WORD $0x01e4 // vextractf128 xmm4, ymm4, 1 + LONG $0x197d63c4; WORD $0x01c1 // vextractf128 xmm1, ymm8, 1 + LONG $0xc9d4d9c5 // vpaddq xmm1, xmm4, xmm1 + LONG $0xebd451c5 // vpaddq xmm13, xmm5, xmm3 + LONG $0x197de3c4; WORD $0x01ed // vextractf128 xmm5, ymm5, 1 + LONG $0x197de3c4; WORD $0x01db // vextractf128 xmm3, ymm3, 1 + LONG $0xdbd4d1c5 // vpaddq xmm3, xmm5, xmm3 + LONG $0xd449c1c4; BYTE $0xe9 // vpaddq xmm5, xmm6, xmm9 + LONG $0x197de3c4; WORD $0x01f6 // vextractf128 xmm6, ymm6, 1 + LONG $0x197d63c4; WORD $0x01ca // vextractf128 xmm2, ymm9, 1 + LONG $0xd2d4c9c5 // vpaddq xmm2, xmm6, xmm2 + 
LONG $0xd441c1c4; BYTE $0xf2 // vpaddq xmm6, xmm7, xmm10 + LONG $0x197de3c4; WORD $0x01ff // vextractf128 xmm7, ymm7, 1 + LONG $0x197d63c4; WORD $0x01d0 // vextractf128 xmm0, ymm10, 1 + LONG $0xc0d4c1c5 // vpaddq xmm0, xmm7, xmm0 + QUAD $0x000080d7bc6ffec5; BYTE $0x00 // vmovdqu ymm7, yword [rdi + 8*rdx + 128] + QUAD $0x0000a0d78c6f7ec5; BYTE $0x00 // vmovdqu ymm9, yword [rdi + 8*rdx + 160] + QUAD $0x0000c0d7946f7ec5; BYTE $0x00 // vmovdqu ymm10, yword [rdi + 8*rdx + 192] + QUAD $0x0000e0d7a46f7ec5; BYTE $0x00 // vmovdqu ymm12, yword [rdi + 8*rdx + 224] + LONG $0x197de3c4; WORD $0x01fc // vextractf128 xmm4, ymm7, 1 + LONG $0xc9d4d9c5 // vpaddq xmm1, xmm4, xmm1 + LONG $0xd441c1c4; BYTE $0xe3 // vpaddq xmm4, xmm7, xmm11 + LONG $0x185d63c4; WORD $0x01c1 // vinsertf128 ymm8, ymm4, xmm1, 1 + LONG $0x197d63c4; WORD $0x01c9 // vextractf128 xmm1, ymm9, 1 + LONG $0xcbd4f1c5 // vpaddq xmm1, xmm1, xmm3 + LONG $0xd431c1c4; BYTE $0xdd // vpaddq xmm3, xmm9, xmm13 + LONG $0x1865e3c4; WORD $0x01d9 // vinsertf128 ymm3, ymm3, xmm1, 1 + LONG $0x197d63c4; WORD $0x01d1 // vextractf128 xmm1, ymm10, 1 + LONG $0xcad4f1c5 // vpaddq xmm1, xmm1, xmm2 + LONG $0xd5d4a9c5 // vpaddq xmm2, xmm10, xmm5 + LONG $0x186d63c4; WORD $0x01c9 // vinsertf128 ymm9, ymm2, xmm1, 1 + LONG $0x197d63c4; WORD $0x01e1 // vextractf128 xmm1, ymm12, 1 + LONG $0xc0d4f1c5 // vpaddq xmm0, xmm1, xmm0 + LONG $0xced499c5 // vpaddq xmm1, xmm12, xmm6 + LONG $0x187563c4; WORD $0x01d0 // vinsertf128 ymm10, ymm1, xmm0, 1 + LONG $0x20c28348 // add rdx, 32 + LONG $0x02c08348 // add rax, 2 + JNE LBB0_7 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JE LBB0_10 + +LBB0_9: + LONG $0x246ffec5; BYTE $0xd7 // vmovdqu ymm4, yword [rdi + 8*rdx] + LONG $0x446ffec5; WORD $0x20d7 // vmovdqu ymm0, yword [rdi + 8*rdx + 32] + LONG $0x4c6ffec5; WORD $0x40d7 // vmovdqu ymm1, yword [rdi + 8*rdx + 64] + LONG $0x546ffec5; WORD $0x60d7 // vmovdqu ymm2, yword [rdi + 8*rdx + 96] + LONG $0x197de3c4; WORD $0x01d5 // vextractf128 xmm5, ymm2, 1 + LONG 
$0x197d63c4; WORD $0x01d6 // vextractf128 xmm6, ymm10, 1 + LONG $0xeed4d1c5 // vpaddq xmm5, xmm5, xmm6 + LONG $0xd469c1c4; BYTE $0xd2 // vpaddq xmm2, xmm2, xmm10 + LONG $0x186d63c4; WORD $0x01d5 // vinsertf128 ymm10, ymm2, xmm5, 1 + LONG $0x197de3c4; WORD $0x01ca // vextractf128 xmm2, ymm1, 1 + LONG $0x197d63c4; WORD $0x01cd // vextractf128 xmm5, ymm9, 1 + LONG $0xd5d4e9c5 // vpaddq xmm2, xmm2, xmm5 + LONG $0xd471c1c4; BYTE $0xc9 // vpaddq xmm1, xmm1, xmm9 + LONG $0x187563c4; WORD $0x01ca // vinsertf128 ymm9, ymm1, xmm2, 1 + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x197de3c4; WORD $0x01da // vextractf128 xmm2, ymm3, 1 + LONG $0xcad4f1c5 // vpaddq xmm1, xmm1, xmm2 + LONG $0xc3d4f9c5 // vpaddq xmm0, xmm0, xmm3 + LONG $0x187de3c4; WORD $0x01d9 // vinsertf128 ymm3, ymm0, xmm1, 1 + LONG $0x197de3c4; WORD $0x01e0 // vextractf128 xmm0, ymm4, 1 + LONG $0x197d63c4; WORD $0x01c1 // vextractf128 xmm1, ymm8, 1 + LONG $0xc1d4f9c5 // vpaddq xmm0, xmm0, xmm1 + LONG $0xd459c1c4; BYTE $0xc8 // vpaddq xmm1, xmm4, xmm8 + LONG $0x187563c4; WORD $0x01c0 // vinsertf128 ymm8, ymm1, xmm0, 1 + +LBB0_10: + LONG $0x197d63c4; WORD $0x01c0 // vextractf128 xmm0, ymm8, 1 + LONG $0x197de3c4; WORD $0x01d9 // vextractf128 xmm1, ymm3, 1 + LONG $0xc0d4f1c5 // vpaddq xmm0, xmm1, xmm0 + LONG $0xd461c1c4; BYTE $0xc8 // vpaddq xmm1, xmm3, xmm8 + LONG $0x197d63c4; WORD $0x01ca // vextractf128 xmm2, ymm9, 1 + LONG $0x197d63c4; WORD $0x01d3 // vextractf128 xmm3, ymm10, 1 + LONG $0xd3d4e9c5 // vpaddq xmm2, xmm2, xmm3 + LONG $0xc2d4f9c5 // vpaddq xmm0, xmm0, xmm2 + LONG $0xd431c1c4; BYTE $0xd2 // vpaddq xmm2, xmm9, xmm10 + LONG $0xcad4f1c5 // vpaddq xmm1, xmm1, xmm2 + LONG $0xc0d4f1c5 // vpaddq xmm0, xmm1, xmm0 + LONG $0xc870f9c5; BYTE $0x4e // vpshufd xmm1, xmm0, 78 + LONG $0xc1d4f9c5 // vpaddq xmm0, xmm0, xmm1 + LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 + WORD $0x3948; BYTE $0xf1 // cmp rcx, rsi + JE LBB0_12 + +LBB0_11: + LONG $0xcf040348 // add rax, qword [rdi + 8*rcx] + 
LONG $0x01c18348 // add rcx, 1 + WORD $0x3948; BYTE $0xce // cmp rsi, rcx + JNE LBB0_11 + +LBB0_12: + BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER + MOVQ AX, x+24(FP) + RET + +LBB0_5: + LONG $0xef3941c4; BYTE $0xc0 // vpxor xmm8, xmm8, xmm8 + WORD $0xd231 // xor edx, edx + LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 + LONG $0xef3141c4; BYTE $0xc9 // vpxor xmm9, xmm9, xmm9 + LONG $0xef2941c4; BYTE $0xd2 // vpxor xmm10, xmm10, xmm10 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JNE LBB0_9 + JMP LBB0_10 + +TEXT sampleSumSSE42<>(SB), NOSPLIT, $0-32 + + MOVQ addr+0(FP), DI + MOVQ len+8(FP), SI + MOVQ cap+16(FP), DX + + WORD $0x8548; BYTE $0xf6 // test rsi, rsi + JLE LBB0_1 + LONG $0x04fe8348 // cmp rsi, 4 + JAE LBB0_4 + WORD $0xc931 // xor ecx, ecx + WORD $0xc031 // xor eax, eax + JMP LBB0_12 + +LBB0_1: + WORD $0xc031 // xor eax, eax + JMP LBB0_13 + +LBB0_4: + WORD $0x8948; BYTE $0xf1 // mov rcx, rsi + LONG $0xfce18348 // and rcx, -4 + LONG $0xfc518d48 // lea rdx, [rcx - 4] + WORD $0x8948; BYTE $0xd0 // mov rax, rdx + LONG $0x02e8c148 // shr rax, 2 + LONG $0x01c08348 // add rax, 1 + WORD $0x8941; BYTE $0xc0 // mov r8d, eax + LONG $0x03e08341 // and r8d, 3 + LONG $0x0cfa8348 // cmp rdx, 12 + JAE LBB0_6 + LONG $0xc0ef0f66 // pxor xmm0, xmm0 + WORD $0xd231 // xor edx, edx + LONG $0xc9ef0f66 // pxor xmm1, xmm1 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JNE LBB0_9 + JMP LBB0_11 + +LBB0_6: + LONG $0x000001ba; BYTE $0x00 // mov edx, 1 + WORD $0x2948; BYTE $0xc2 // sub rdx, rax + LONG $0x10048d49 // lea rax, [r8 + rdx] + LONG $0xffc08348 // add rax, -1 + LONG $0xc0ef0f66 // pxor xmm0, xmm0 + WORD $0xd231 // xor edx, edx + LONG $0xc9ef0f66 // pxor xmm1, xmm1 + +LBB0_7: + LONG $0x146f0ff3; BYTE $0xd7 // movdqu xmm2, oword [rdi + 8*rdx] + LONG $0xd0d40f66 // paddq xmm2, xmm0 + LONG $0x446f0ff3; WORD $0x10d7 // movdqu xmm0, oword [rdi + 8*rdx + 16] + LONG $0xc1d40f66 // paddq xmm0, xmm1 + LONG $0x4c6f0ff3; WORD $0x20d7 // movdqu xmm1, oword [rdi + 8*rdx + 32] + LONG $0x5c6f0ff3; 
WORD $0x30d7 // movdqu xmm3, oword [rdi + 8*rdx + 48] + LONG $0x646f0ff3; WORD $0x40d7 // movdqu xmm4, oword [rdi + 8*rdx + 64] + LONG $0xe1d40f66 // paddq xmm4, xmm1 + LONG $0xe2d40f66 // paddq xmm4, xmm2 + LONG $0x546f0ff3; WORD $0x50d7 // movdqu xmm2, oword [rdi + 8*rdx + 80] + LONG $0xd3d40f66 // paddq xmm2, xmm3 + LONG $0xd0d40f66 // paddq xmm2, xmm0 + LONG $0x446f0ff3; WORD $0x60d7 // movdqu xmm0, oword [rdi + 8*rdx + 96] + LONG $0xc4d40f66 // paddq xmm0, xmm4 + LONG $0x4c6f0ff3; WORD $0x70d7 // movdqu xmm1, oword [rdi + 8*rdx + 112] + LONG $0xcad40f66 // paddq xmm1, xmm2 + LONG $0x10c28348 // add rdx, 16 + LONG $0x04c08348 // add rax, 4 + JNE LBB0_7 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JE LBB0_11 + +LBB0_9: + LONG $0xd7048d48 // lea rax, [rdi + 8*rdx] + LONG $0x10c08348 // add rax, 16 + WORD $0xf749; BYTE $0xd8 // neg r8 + +LBB0_10: + LONG $0x506f0ff3; BYTE $0xf0 // movdqu xmm2, oword [rax - 16] + LONG $0xc2d40f66 // paddq xmm0, xmm2 + LONG $0x106f0ff3 // movdqu xmm2, oword [rax] + LONG $0xcad40f66 // paddq xmm1, xmm2 + LONG $0x20c08348 // add rax, 32 + LONG $0x01c08349 // add r8, 1 + JNE LBB0_10 + +LBB0_11: + LONG $0xc1d40f66 // paddq xmm0, xmm1 + LONG $0xc8700f66; BYTE $0x4e // pshufd xmm1, xmm0, 78 + LONG $0xc8d40f66 // paddq xmm1, xmm0 + LONG $0x7e0f4866; BYTE $0xc8 // movq rax, xmm1 + WORD $0x3948; BYTE $0xf1 // cmp rcx, rsi + JE LBB0_13 + +LBB0_12: + LONG $0xcf040348 // add rax, qword [rdi + 8*rcx] + LONG $0x01c18348 // add rcx, 1 + WORD $0x3948; BYTE $0xce // cmp rsi, rcx + JNE LBB0_12 + +LBB0_13: + MOVQ AX, x+24(FP) + RET + +TEXT sampleVarianceAVX2<>(SB), NOSPLIT, $0-32 + + MOVQ addr+0(FP), DI + MOVQ len+8(FP), SI + MOVQ cap+16(FP), DX + + WORD $0x8548; BYTE $0xf6 // test rsi, rsi + JE LBB3_1 + JLE LBB3_23 + LONG $0x0ffe8348 // cmp rsi, 15 + JA LBB3_5 + WORD $0xc031 // xor eax, eax + WORD $0xc931 // xor ecx, ecx + JMP LBB3_12 + +LBB3_1: + LONG $0xc057f8c5 // vxorps xmm0, xmm0, xmm0 + MOVQ X0, x+24(FP) + RET + +LBB3_23: + LONG $0x2afb61c4; BYTE 
$0xde // vcvtsi2sd xmm11, xmm0, rsi + LONG $0xd257e9c5 // vxorpd xmm2, xmm2, xmm2 + LONG $0x5e6bc1c4; BYTE $0xc3 // vdivsd xmm0, xmm2, xmm11 + JMP LBB3_22 + +LBB3_5: + WORD $0x8948; BYTE $0xf0 // mov rax, rsi + LONG $0xf0e08348 // and rax, -16 + LONG $0xf0508d48 // lea rdx, [rax - 16] + WORD $0x8948; BYTE $0xd1 // mov rcx, rdx + LONG $0x04e9c148 // shr rcx, 4 + LONG $0x01c18348 // add rcx, 1 + WORD $0x8941; BYTE $0xc8 // mov r8d, ecx + LONG $0x01e08341 // and r8d, 1 + WORD $0x8548; BYTE $0xd2 // test rdx, rdx + JE LBB3_6 + LONG $0x000001ba; BYTE $0x00 // mov edx, 1 + WORD $0x2948; BYTE $0xca // sub rdx, rcx + LONG $0x100c8d49 // lea rcx, [r8 + rdx] + LONG $0xffc18348 // add rcx, -1 + LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 + WORD $0xd231 // xor edx, edx + LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 + LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 + LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 + +LBB3_8: + LONG $0x04d4fdc5; BYTE $0xd7 // vpaddq ymm0, ymm0, yword [rdi + 8*rdx] + LONG $0x4cd4f5c5; WORD $0x20d7 // vpaddq ymm1, ymm1, yword [rdi + 8*rdx + 32] + LONG $0x54d4edc5; WORD $0x40d7 // vpaddq ymm2, ymm2, yword [rdi + 8*rdx + 64] + LONG $0x5cd4e5c5; WORD $0x60d7 // vpaddq ymm3, ymm3, yword [rdi + 8*rdx + 96] + QUAD $0x000080d784d4fdc5; BYTE $0x00 // vpaddq ymm0, ymm0, yword [rdi + 8*rdx + 128] + QUAD $0x0000a0d78cd4f5c5; BYTE $0x00 // vpaddq ymm1, ymm1, yword [rdi + 8*rdx + 160] + QUAD $0x0000c0d794d4edc5; BYTE $0x00 // vpaddq ymm2, ymm2, yword [rdi + 8*rdx + 192] + QUAD $0x0000e0d79cd4e5c5; BYTE $0x00 // vpaddq ymm3, ymm3, yword [rdi + 8*rdx + 224] + LONG $0x20c28348 // add rdx, 32 + LONG $0x02c18348 // add rcx, 2 + JNE LBB3_8 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JE LBB3_11 + +LBB3_10: + LONG $0x5cd4e5c5; WORD $0x60d7 // vpaddq ymm3, ymm3, yword [rdi + 8*rdx + 96] + LONG $0x54d4edc5; WORD $0x40d7 // vpaddq ymm2, ymm2, yword [rdi + 8*rdx + 64] + LONG $0x4cd4f5c5; WORD $0x20d7 // vpaddq ymm1, ymm1, yword [rdi + 8*rdx + 32] + LONG $0x04d4fdc5; BYTE $0xd7 // 
vpaddq ymm0, ymm0, yword [rdi + 8*rdx] + +LBB3_11: + LONG $0xcbd4f5c5 // vpaddq ymm1, ymm1, ymm3 + LONG $0xc2d4fdc5 // vpaddq ymm0, ymm0, ymm2 + LONG $0xc1d4fdc5 // vpaddq ymm0, ymm0, ymm1 + LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 + LONG $0xc1d4fdc5 // vpaddq ymm0, ymm0, ymm1 + LONG $0xc870f9c5; BYTE $0x4e // vpshufd xmm1, xmm0, 78 + LONG $0xc1d4fdc5 // vpaddq ymm0, ymm0, ymm1 + LONG $0x7ef9e1c4; BYTE $0xc1 // vmovq rcx, xmm0 + WORD $0x3948; BYTE $0xf0 // cmp rax, rsi + JE LBB3_13 + +LBB3_12: + LONG $0xc70c0348 // add rcx, qword [rdi + 8*rax] + LONG $0x01c08348 // add rax, 1 + WORD $0x3948; BYTE $0xc6 // cmp rsi, rax + JNE LBB3_12 + +LBB3_13: + LONG $0x2adb61c4; BYTE $0xde // vcvtsi2sd xmm11, xmm4, rsi + WORD $0x8548; BYTE $0xf6 // test rsi, rsi + JLE LBB3_14 + LONG $0x2adbe1c4; BYTE $0xc9 // vcvtsi2sd xmm1, xmm4, rcx + LONG $0x5e7341c4; BYTE $0xe3 // vdivsd xmm12, xmm1, xmm11 + LONG $0x10fe8348 // cmp rsi, 16 + JAE LBB3_17 + LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 + WORD $0xc031 // xor eax, eax + JMP LBB3_20 + +LBB3_14: + LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 + LONG $0x5e6bc1c4; BYTE $0xc3 // vdivsd xmm0, xmm2, xmm11 + JMP LBB3_22 + +LBB3_17: + WORD $0x8948; BYTE $0xf0 // mov rax, rsi + LONG $0xf0e08348 // and rax, -16 + LONG $0x197d42c4; BYTE $0xec // vbroadcastsd ymm13, xmm12 + LONG $0x570941c4; BYTE $0xf6 // vxorpd xmm14, xmm14, xmm14 + WORD $0xc931 // xor ecx, ecx + LONG $0xe457d9c5 // vxorpd xmm4, xmm4, xmm4 + LONG $0xed57d1c5 // vxorpd xmm5, xmm5, xmm5 + LONG $0xf657c9c5 // vxorpd xmm6, xmm6, xmm6 + +LBB3_18: + LONG $0x046f7ec5; BYTE $0xcf // vmovdqu ymm8, yword [rdi + 8*rcx] + LONG $0x546f7ec5; WORD $0x20cf // vmovdqu ymm10, yword [rdi + 8*rcx + 32] + LONG $0x4c6f7ec5; WORD $0x40cf // vmovdqu ymm9, yword [rdi + 8*rcx + 64] + LONG $0x397d63c4; WORD $0x01c0 // vextracti128 xmm0, ymm8, 1 + LONG $0x16f9e3c4; WORD $0x01c2 // vpextrq rdx, xmm0, 1 + LONG $0x7c6ffec5; WORD $0x60cf // vmovdqu ymm7, yword [rdi + 8*rcx + 96] + LONG 
$0x2a83e1c4; BYTE $0xca // vcvtsi2sd xmm1, xmm15, rdx + LONG $0x7ef9e1c4; BYTE $0xc2 // vmovq rdx, xmm0 + LONG $0x2a83e1c4; BYTE $0xc2 // vcvtsi2sd xmm0, xmm15, rdx + LONG $0x16f963c4; WORD $0x01c2 // vpextrq rdx, xmm8, 1 + LONG $0xc114f9c5 // vunpcklpd xmm0, xmm0, xmm1 + LONG $0x2a83e1c4; BYTE $0xca // vcvtsi2sd xmm1, xmm15, rdx + LONG $0x7ef961c4; BYTE $0xc2 // vmovq rdx, xmm8 + LONG $0x2a83e1c4; BYTE $0xda // vcvtsi2sd xmm3, xmm15, rdx + LONG $0xc914e1c5 // vunpcklpd xmm1, xmm3, xmm1 + LONG $0x397d63c4; WORD $0x01d3 // vextracti128 xmm3, ymm10, 1 + LONG $0x16f9e3c4; WORD $0x01da // vpextrq rdx, xmm3, 1 + LONG $0x2a83e1c4; BYTE $0xd2 // vcvtsi2sd xmm2, xmm15, rdx + LONG $0x187563c4; WORD $0x01c0 // vinsertf128 ymm8, ymm1, xmm0, 1 + LONG $0x7ef9e1c4; BYTE $0xda // vmovq rdx, xmm3 + LONG $0x2a83e1c4; BYTE $0xc2 // vcvtsi2sd xmm0, xmm15, rdx + LONG $0x16f963c4; WORD $0x01d2 // vpextrq rdx, xmm10, 1 + LONG $0x2a83e1c4; BYTE $0xca // vcvtsi2sd xmm1, xmm15, rdx + LONG $0xc214f9c5 // vunpcklpd xmm0, xmm0, xmm2 + LONG $0x7ef961c4; BYTE $0xd2 // vmovq rdx, xmm10 + LONG $0x2a83e1c4; BYTE $0xd2 // vcvtsi2sd xmm2, xmm15, rdx + LONG $0xc914e9c5 // vunpcklpd xmm1, xmm2, xmm1 + LONG $0x397d63c4; WORD $0x01ca // vextracti128 xmm2, ymm9, 1 + LONG $0x16f9e3c4; WORD $0x01d2 // vpextrq rdx, xmm2, 1 + LONG $0x187563c4; WORD $0x01d0 // vinsertf128 ymm10, ymm1, xmm0, 1 + LONG $0x2a83e1c4; BYTE $0xca // vcvtsi2sd xmm1, xmm15, rdx + LONG $0x7ef9e1c4; BYTE $0xd2 // vmovq rdx, xmm2 + LONG $0x2a83e1c4; BYTE $0xd2 // vcvtsi2sd xmm2, xmm15, rdx + LONG $0x16f963c4; WORD $0x01ca // vpextrq rdx, xmm9, 1 + LONG $0xc914e9c5 // vunpcklpd xmm1, xmm2, xmm1 + LONG $0x2a83e1c4; BYTE $0xd2 // vcvtsi2sd xmm2, xmm15, rdx + LONG $0x7ef961c4; BYTE $0xca // vmovq rdx, xmm9 + LONG $0x2a83e1c4; BYTE $0xda // vcvtsi2sd xmm3, xmm15, rdx + LONG $0xd214e1c5 // vunpcklpd xmm2, xmm3, xmm2 + LONG $0x397de3c4; WORD $0x01fb // vextracti128 xmm3, ymm7, 1 + LONG $0x16f9e3c4; WORD $0x01da // vpextrq rdx, xmm3, 1 + LONG 
$0x2a83e1c4; BYTE $0xc2 // vcvtsi2sd xmm0, xmm15, rdx + LONG $0x186de3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm2, xmm1, 1 + LONG $0x7ef9e1c4; BYTE $0xda // vmovq rdx, xmm3 + LONG $0x2a83e1c4; BYTE $0xd2 // vcvtsi2sd xmm2, xmm15, rdx + LONG $0x16f9e3c4; WORD $0x01fa // vpextrq rdx, xmm7, 1 + LONG $0x2a83e1c4; BYTE $0xda // vcvtsi2sd xmm3, xmm15, rdx + LONG $0xc014e9c5 // vunpcklpd xmm0, xmm2, xmm0 + LONG $0x7ef9e1c4; BYTE $0xfa // vmovq rdx, xmm7 + LONG $0x2a83e1c4; BYTE $0xd2 // vcvtsi2sd xmm2, xmm15, rdx + LONG $0xd314e9c5 // vunpcklpd xmm2, xmm2, xmm3 + LONG $0x186de3c4; WORD $0x01c0 // vinsertf128 ymm0, ymm2, xmm0, 1 + LONG $0x5c3dc1c4; BYTE $0xd5 // vsubpd ymm2, ymm8, ymm13 + LONG $0x5c2dc1c4; BYTE $0xdd // vsubpd ymm3, ymm10, ymm13 + LONG $0x5c75c1c4; BYTE $0xcd // vsubpd ymm1, ymm1, ymm13 + LONG $0x5c7dc1c4; BYTE $0xc5 // vsubpd ymm0, ymm0, ymm13 + LONG $0xd259edc5 // vmulpd ymm2, ymm2, ymm2 + LONG $0x586d41c4; BYTE $0xf6 // vaddpd ymm14, ymm2, ymm14 + LONG $0xd359e5c5 // vmulpd ymm2, ymm3, ymm3 + LONG $0xe458edc5 // vaddpd ymm4, ymm2, ymm4 + LONG $0xc959f5c5 // vmulpd ymm1, ymm1, ymm1 + LONG $0xed58f5c5 // vaddpd ymm5, ymm1, ymm5 + LONG $0xc059fdc5 // vmulpd ymm0, ymm0, ymm0 + LONG $0xf658fdc5 // vaddpd ymm6, ymm0, ymm6 + LONG $0x10c18348 // add rcx, 16 + WORD $0x3948; BYTE $0xc8 // cmp rax, rcx + JNE LBB3_18 + LONG $0x585dc1c4; BYTE $0xc6 // vaddpd ymm0, ymm4, ymm14 + LONG $0xc058d5c5 // vaddpd ymm0, ymm5, ymm0 + LONG $0xc058cdc5 // vaddpd ymm0, ymm6, ymm0 + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0xc158fdc5 // vaddpd ymm0, ymm0, ymm1 + LONG $0xd07cfdc5 // vhaddpd ymm2, ymm0, ymm0 + WORD $0x3948; BYTE $0xf0 // cmp rax, rsi + JE LBB3_21 + +LBB3_20: + LONG $0x2a83e1c4; WORD $0xc704 // vcvtsi2sd xmm0, xmm15, qword [rdi + 8*rax] + LONG $0x5c7bc1c4; BYTE $0xc4 // vsubsd xmm0, xmm0, xmm12 + LONG $0xc059fbc5 // vmulsd xmm0, xmm0, xmm0 + LONG $0xd258fbc5 // vaddsd xmm2, xmm0, xmm2 + LONG $0x01c08348 // add rax, 1 + WORD $0x3948; BYTE 
$0xc6 // cmp rsi, rax + JNE LBB3_20 + +LBB3_21: + LONG $0x5e6bc1c4; BYTE $0xc3 // vdivsd xmm0, xmm2, xmm11 + +LBB3_22: + BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER + MOVQ X0, x+24(FP) + RET + +LBB3_6: + LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 + WORD $0xd231 // xor edx, edx + LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 + LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 + LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JNE LBB3_10 + JMP LBB3_11 + +TEXT sampleVarianceAVX<>(SB), NOSPLIT, $0-32 + + MOVQ addr+0(FP), DI + MOVQ len+8(FP), SI + MOVQ cap+16(FP), DX + + WORD $0x8548; BYTE $0xf6 // test rsi, rsi + JE LBB3_1 + JLE LBB3_23 + LONG $0x0ffe8348 // cmp rsi, 15 + JA LBB3_5 + WORD $0xc031 // xor eax, eax + WORD $0xc931 // xor ecx, ecx + JMP LBB3_12 + +LBB3_1: + LONG $0xc057f8c5 // vxorps xmm0, xmm0, xmm0 + MOVQ X0, x+24(FP) + RET + +LBB3_23: + LONG $0x2afb61c4; BYTE $0xde // vcvtsi2sd xmm11, xmm0, rsi + LONG $0xd257e9c5 // vxorpd xmm2, xmm2, xmm2 + LONG $0x5e6bc1c4; BYTE $0xc3 // vdivsd xmm0, xmm2, xmm11 + JMP LBB3_22 + +LBB3_5: + WORD $0x8948; BYTE $0xf0 // mov rax, rsi + LONG $0xf0e08348 // and rax, -16 + LONG $0xf0508d48 // lea rdx, [rax - 16] + WORD $0x8948; BYTE $0xd1 // mov rcx, rdx + LONG $0x04e9c148 // shr rcx, 4 + LONG $0x01c18348 // add rcx, 1 + WORD $0x8941; BYTE $0xc8 // mov r8d, ecx + LONG $0x01e08341 // and r8d, 1 + WORD $0x8548; BYTE $0xd2 // test rdx, rdx + JE LBB3_6 + LONG $0x000001ba; BYTE $0x00 // mov edx, 1 + WORD $0x2948; BYTE $0xca // sub rdx, rcx + LONG $0x100c8d49 // lea rcx, [r8 + rdx] + LONG $0xffc18348 // add rcx, -1 + LONG $0xef3941c4; BYTE $0xc0 // vpxor xmm8, xmm8, xmm8 + WORD $0xd231 // xor edx, edx + LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 + LONG $0xef3141c4; BYTE $0xc9 // vpxor xmm9, xmm9, xmm9 + LONG $0xef2941c4; BYTE $0xd2 // vpxor xmm10, xmm10, xmm10 + +LBB3_8: + LONG $0x246ffec5; BYTE $0xd7 // vmovdqu ymm4, yword [rdi + 8*rdx] + LONG $0x6c6ffec5; WORD $0x20d7 // vmovdqu ymm5, yword [rdi + 
8*rdx + 32] + LONG $0x746ffec5; WORD $0x40d7 // vmovdqu ymm6, yword [rdi + 8*rdx + 64] + LONG $0x7c6ffec5; WORD $0x60d7 // vmovdqu ymm7, yword [rdi + 8*rdx + 96] + LONG $0xd45941c4; BYTE $0xd8 // vpaddq xmm11, xmm4, xmm8 + LONG $0x197de3c4; WORD $0x01e4 // vextractf128 xmm4, ymm4, 1 + LONG $0x197d63c4; WORD $0x01c1 // vextractf128 xmm1, ymm8, 1 + LONG $0xc9d4d9c5 // vpaddq xmm1, xmm4, xmm1 + LONG $0xebd451c5 // vpaddq xmm13, xmm5, xmm3 + LONG $0x197de3c4; WORD $0x01ed // vextractf128 xmm5, ymm5, 1 + LONG $0x197de3c4; WORD $0x01db // vextractf128 xmm3, ymm3, 1 + LONG $0xdbd4d1c5 // vpaddq xmm3, xmm5, xmm3 + LONG $0xd449c1c4; BYTE $0xe9 // vpaddq xmm5, xmm6, xmm9 + LONG $0x197de3c4; WORD $0x01f6 // vextractf128 xmm6, ymm6, 1 + LONG $0x197d63c4; WORD $0x01ca // vextractf128 xmm2, ymm9, 1 + LONG $0xd2d4c9c5 // vpaddq xmm2, xmm6, xmm2 + LONG $0xd441c1c4; BYTE $0xf2 // vpaddq xmm6, xmm7, xmm10 + LONG $0x197de3c4; WORD $0x01ff // vextractf128 xmm7, ymm7, 1 + LONG $0x197d63c4; WORD $0x01d0 // vextractf128 xmm0, ymm10, 1 + LONG $0xc0d4c1c5 // vpaddq xmm0, xmm7, xmm0 + QUAD $0x000080d7bc6ffec5; BYTE $0x00 // vmovdqu ymm7, yword [rdi + 8*rdx + 128] + QUAD $0x0000a0d78c6f7ec5; BYTE $0x00 // vmovdqu ymm9, yword [rdi + 8*rdx + 160] + QUAD $0x0000c0d7946f7ec5; BYTE $0x00 // vmovdqu ymm10, yword [rdi + 8*rdx + 192] + QUAD $0x0000e0d7a46f7ec5; BYTE $0x00 // vmovdqu ymm12, yword [rdi + 8*rdx + 224] + LONG $0x197de3c4; WORD $0x01fc // vextractf128 xmm4, ymm7, 1 + LONG $0xc9d4d9c5 // vpaddq xmm1, xmm4, xmm1 + LONG $0xd441c1c4; BYTE $0xe3 // vpaddq xmm4, xmm7, xmm11 + LONG $0x185d63c4; WORD $0x01c1 // vinsertf128 ymm8, ymm4, xmm1, 1 + LONG $0x197d63c4; WORD $0x01c9 // vextractf128 xmm1, ymm9, 1 + LONG $0xcbd4f1c5 // vpaddq xmm1, xmm1, xmm3 + LONG $0xd431c1c4; BYTE $0xdd // vpaddq xmm3, xmm9, xmm13 + LONG $0x1865e3c4; WORD $0x01d9 // vinsertf128 ymm3, ymm3, xmm1, 1 + LONG $0x197d63c4; WORD $0x01d1 // vextractf128 xmm1, ymm10, 1 + LONG $0xcad4f1c5 // vpaddq xmm1, xmm1, xmm2 + LONG 
$0xd5d4a9c5 // vpaddq xmm2, xmm10, xmm5 + LONG $0x186d63c4; WORD $0x01c9 // vinsertf128 ymm9, ymm2, xmm1, 1 + LONG $0x197d63c4; WORD $0x01e1 // vextractf128 xmm1, ymm12, 1 + LONG $0xc0d4f1c5 // vpaddq xmm0, xmm1, xmm0 + LONG $0xced499c5 // vpaddq xmm1, xmm12, xmm6 + LONG $0x187563c4; WORD $0x01d0 // vinsertf128 ymm10, ymm1, xmm0, 1 + LONG $0x20c28348 // add rdx, 32 + LONG $0x02c18348 // add rcx, 2 + JNE LBB3_8 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JE LBB3_11 + +LBB3_10: + LONG $0x246ffec5; BYTE $0xd7 // vmovdqu ymm4, yword [rdi + 8*rdx] + LONG $0x446ffec5; WORD $0x20d7 // vmovdqu ymm0, yword [rdi + 8*rdx + 32] + LONG $0x4c6ffec5; WORD $0x40d7 // vmovdqu ymm1, yword [rdi + 8*rdx + 64] + LONG $0x546ffec5; WORD $0x60d7 // vmovdqu ymm2, yword [rdi + 8*rdx + 96] + LONG $0x197de3c4; WORD $0x01d5 // vextractf128 xmm5, ymm2, 1 + LONG $0x197d63c4; WORD $0x01d6 // vextractf128 xmm6, ymm10, 1 + LONG $0xeed4d1c5 // vpaddq xmm5, xmm5, xmm6 + LONG $0xd469c1c4; BYTE $0xd2 // vpaddq xmm2, xmm2, xmm10 + LONG $0x186d63c4; WORD $0x01d5 // vinsertf128 ymm10, ymm2, xmm5, 1 + LONG $0x197de3c4; WORD $0x01ca // vextractf128 xmm2, ymm1, 1 + LONG $0x197d63c4; WORD $0x01cd // vextractf128 xmm5, ymm9, 1 + LONG $0xd5d4e9c5 // vpaddq xmm2, xmm2, xmm5 + LONG $0xd471c1c4; BYTE $0xc9 // vpaddq xmm1, xmm1, xmm9 + LONG $0x187563c4; WORD $0x01ca // vinsertf128 ymm9, ymm1, xmm2, 1 + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x197de3c4; WORD $0x01da // vextractf128 xmm2, ymm3, 1 + LONG $0xcad4f1c5 // vpaddq xmm1, xmm1, xmm2 + LONG $0xc3d4f9c5 // vpaddq xmm0, xmm0, xmm3 + LONG $0x187de3c4; WORD $0x01d9 // vinsertf128 ymm3, ymm0, xmm1, 1 + LONG $0x197de3c4; WORD $0x01e0 // vextractf128 xmm0, ymm4, 1 + LONG $0x197d63c4; WORD $0x01c1 // vextractf128 xmm1, ymm8, 1 + LONG $0xc1d4f9c5 // vpaddq xmm0, xmm0, xmm1 + LONG $0xd459c1c4; BYTE $0xc8 // vpaddq xmm1, xmm4, xmm8 + LONG $0x187563c4; WORD $0x01c0 // vinsertf128 ymm8, ymm1, xmm0, 1 + +LBB3_11: + LONG $0x197d63c4; WORD 
$0x01c0 // vextractf128 xmm0, ymm8, 1 + LONG $0x197de3c4; WORD $0x01d9 // vextractf128 xmm1, ymm3, 1 + LONG $0xc0d4f1c5 // vpaddq xmm0, xmm1, xmm0 + LONG $0xd461c1c4; BYTE $0xc8 // vpaddq xmm1, xmm3, xmm8 + LONG $0x197d63c4; WORD $0x01ca // vextractf128 xmm2, ymm9, 1 + LONG $0x197d63c4; WORD $0x01d3 // vextractf128 xmm3, ymm10, 1 + LONG $0xd3d4e9c5 // vpaddq xmm2, xmm2, xmm3 + LONG $0xc2d4f9c5 // vpaddq xmm0, xmm0, xmm2 + LONG $0xd431c1c4; BYTE $0xd2 // vpaddq xmm2, xmm9, xmm10 + LONG $0xcad4f1c5 // vpaddq xmm1, xmm1, xmm2 + LONG $0xc0d4f1c5 // vpaddq xmm0, xmm1, xmm0 + LONG $0xc870f9c5; BYTE $0x4e // vpshufd xmm1, xmm0, 78 + LONG $0xc1d4f9c5 // vpaddq xmm0, xmm0, xmm1 + LONG $0x7ef9e1c4; BYTE $0xc1 // vmovq rcx, xmm0 + WORD $0x3948; BYTE $0xf0 // cmp rax, rsi + JE LBB3_13 + +LBB3_12: + LONG $0xc70c0348 // add rcx, qword [rdi + 8*rax] + LONG $0x01c08348 // add rax, 1 + WORD $0x3948; BYTE $0xc6 // cmp rsi, rax + JNE LBB3_12 + +LBB3_13: + LONG $0x2a8b61c4; BYTE $0xde // vcvtsi2sd xmm11, xmm14, rsi + WORD $0x8548; BYTE $0xf6 // test rsi, rsi + JLE LBB3_14 + LONG $0x2a8be1c4; BYTE $0xc9 // vcvtsi2sd xmm1, xmm14, rcx + LONG $0x5e7341c4; BYTE $0xe3 // vdivsd xmm12, xmm1, xmm11 + LONG $0x10fe8348 // cmp rsi, 16 + JAE LBB3_17 + LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 + WORD $0xc031 // xor eax, eax + JMP LBB3_20 + +LBB3_14: + LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 + LONG $0x5e6bc1c4; BYTE $0xc3 // vdivsd xmm0, xmm2, xmm11 + JMP LBB3_22 + +LBB3_17: + WORD $0x8948; BYTE $0xf0 // mov rax, rsi + LONG $0xf0e08348 // and rax, -16 + LONG $0x127bc1c4; BYTE $0xd4 // vmovddup xmm2, xmm12 + LONG $0x186d63c4; WORD $0x01ea // vinsertf128 ymm13, ymm2, xmm2, 1 + LONG $0x570941c4; BYTE $0xf6 // vxorpd xmm14, xmm14, xmm14 + WORD $0xc931 // xor ecx, ecx + LONG $0xe4efd9c5 // vpxor xmm4, xmm4, xmm4 + LONG $0xedefd1c5 // vpxor xmm5, xmm5, xmm5 + LONG $0xf6efc9c5 // vpxor xmm6, xmm6, xmm6 + +LBB3_18: + LONG $0x046f7ec5; BYTE $0xcf // vmovdqu ymm8, yword [rdi + 8*rcx] + LONG $0x546f7ec5; 
WORD $0x20cf // vmovdqu ymm10, yword [rdi + 8*rcx + 32] + LONG $0x4c6f7ec5; WORD $0x40cf // vmovdqu ymm9, yword [rdi + 8*rcx + 64] + LONG $0x197d63c4; WORD $0x01c0 // vextractf128 xmm0, ymm8, 1 + LONG $0x16f9e3c4; WORD $0x01c2 // vpextrq rdx, xmm0, 1 + LONG $0x7c6ffec5; WORD $0x60cf // vmovdqu ymm7, yword [rdi + 8*rcx + 96] + LONG $0x2a83e1c4; BYTE $0xca // vcvtsi2sd xmm1, xmm15, rdx + LONG $0x7ef9e1c4; BYTE $0xc2 // vmovq rdx, xmm0 + LONG $0x2a83e1c4; BYTE $0xc2 // vcvtsi2sd xmm0, xmm15, rdx + LONG $0x16f963c4; WORD $0x01c2 // vpextrq rdx, xmm8, 1 + LONG $0xc116f8c5 // vmovlhps xmm0, xmm0, xmm1 + LONG $0x2a83e1c4; BYTE $0xca // vcvtsi2sd xmm1, xmm15, rdx + LONG $0x7ef961c4; BYTE $0xc2 // vmovq rdx, xmm8 + LONG $0x2a83e1c4; BYTE $0xda // vcvtsi2sd xmm3, xmm15, rdx + LONG $0xc916e0c5 // vmovlhps xmm1, xmm3, xmm1 + LONG $0x197d63c4; WORD $0x01d3 // vextractf128 xmm3, ymm10, 1 + LONG $0x16f9e3c4; WORD $0x01da // vpextrq rdx, xmm3, 1 + LONG $0x2a83e1c4; BYTE $0xd2 // vcvtsi2sd xmm2, xmm15, rdx + LONG $0x187563c4; WORD $0x01c0 // vinsertf128 ymm8, ymm1, xmm0, 1 + LONG $0x7ef9e1c4; BYTE $0xda // vmovq rdx, xmm3 + LONG $0x2a83e1c4; BYTE $0xc2 // vcvtsi2sd xmm0, xmm15, rdx + LONG $0x16f963c4; WORD $0x01d2 // vpextrq rdx, xmm10, 1 + LONG $0x2a83e1c4; BYTE $0xca // vcvtsi2sd xmm1, xmm15, rdx + LONG $0xc216f8c5 // vmovlhps xmm0, xmm0, xmm2 + LONG $0x7ef961c4; BYTE $0xd2 // vmovq rdx, xmm10 + LONG $0x2a83e1c4; BYTE $0xd2 // vcvtsi2sd xmm2, xmm15, rdx + LONG $0xc916e8c5 // vmovlhps xmm1, xmm2, xmm1 + LONG $0x197d63c4; WORD $0x01ca // vextractf128 xmm2, ymm9, 1 + LONG $0x16f9e3c4; WORD $0x01d2 // vpextrq rdx, xmm2, 1 + LONG $0x187563c4; WORD $0x01d0 // vinsertf128 ymm10, ymm1, xmm0, 1 + LONG $0x2a83e1c4; BYTE $0xca // vcvtsi2sd xmm1, xmm15, rdx + LONG $0x7ef9e1c4; BYTE $0xd2 // vmovq rdx, xmm2 + LONG $0x2a83e1c4; BYTE $0xd2 // vcvtsi2sd xmm2, xmm15, rdx + LONG $0x16f963c4; WORD $0x01ca // vpextrq rdx, xmm9, 1 + LONG $0xc916e8c5 // vmovlhps xmm1, xmm2, xmm1 + LONG $0x2a83e1c4; 
BYTE $0xd2 // vcvtsi2sd xmm2, xmm15, rdx + LONG $0x7ef961c4; BYTE $0xca // vmovq rdx, xmm9 + LONG $0x2a83e1c4; BYTE $0xda // vcvtsi2sd xmm3, xmm15, rdx + LONG $0xd216e0c5 // vmovlhps xmm2, xmm3, xmm2 + LONG $0x197de3c4; WORD $0x01fb // vextractf128 xmm3, ymm7, 1 + LONG $0x16f9e3c4; WORD $0x01da // vpextrq rdx, xmm3, 1 + LONG $0x2a83e1c4; BYTE $0xc2 // vcvtsi2sd xmm0, xmm15, rdx + LONG $0x186de3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm2, xmm1, 1 + LONG $0x7ef9e1c4; BYTE $0xda // vmovq rdx, xmm3 + LONG $0x2a83e1c4; BYTE $0xd2 // vcvtsi2sd xmm2, xmm15, rdx + LONG $0x16f9e3c4; WORD $0x01fa // vpextrq rdx, xmm7, 1 + LONG $0x2a83e1c4; BYTE $0xda // vcvtsi2sd xmm3, xmm15, rdx + LONG $0xc016e8c5 // vmovlhps xmm0, xmm2, xmm0 + LONG $0x7ef9e1c4; BYTE $0xfa // vmovq rdx, xmm7 + LONG $0x2a83e1c4; BYTE $0xd2 // vcvtsi2sd xmm2, xmm15, rdx + LONG $0xd316e8c5 // vmovlhps xmm2, xmm2, xmm3 + LONG $0x186de3c4; WORD $0x01c0 // vinsertf128 ymm0, ymm2, xmm0, 1 + LONG $0x5c3dc1c4; BYTE $0xd5 // vsubpd ymm2, ymm8, ymm13 + LONG $0x5c2dc1c4; BYTE $0xdd // vsubpd ymm3, ymm10, ymm13 + LONG $0x5c75c1c4; BYTE $0xcd // vsubpd ymm1, ymm1, ymm13 + LONG $0x5c7dc1c4; BYTE $0xc5 // vsubpd ymm0, ymm0, ymm13 + LONG $0xd259edc5 // vmulpd ymm2, ymm2, ymm2 + LONG $0x586d41c4; BYTE $0xf6 // vaddpd ymm14, ymm2, ymm14 + LONG $0xd359e5c5 // vmulpd ymm2, ymm3, ymm3 + LONG $0xe458edc5 // vaddpd ymm4, ymm2, ymm4 + LONG $0xc959f5c5 // vmulpd ymm1, ymm1, ymm1 + LONG $0xed58f5c5 // vaddpd ymm5, ymm1, ymm5 + LONG $0xc059fdc5 // vmulpd ymm0, ymm0, ymm0 + LONG $0xf658fdc5 // vaddpd ymm6, ymm0, ymm6 + LONG $0x10c18348 // add rcx, 16 + WORD $0x3948; BYTE $0xc8 // cmp rax, rcx + JNE LBB3_18 + LONG $0x585dc1c4; BYTE $0xc6 // vaddpd ymm0, ymm4, ymm14 + LONG $0xc058d5c5 // vaddpd ymm0, ymm5, ymm0 + LONG $0xc058cdc5 // vaddpd ymm0, ymm6, ymm0 + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0xc158fdc5 // vaddpd ymm0, ymm0, ymm1 + LONG $0xd07cfdc5 // vhaddpd ymm2, ymm0, ymm0 + WORD $0x3948; BYTE $0xf0 // 
cmp rax, rsi + JE LBB3_21 + +LBB3_20: + LONG $0x2a83e1c4; WORD $0xc704 // vcvtsi2sd xmm0, xmm15, qword [rdi + 8*rax] + LONG $0x5c7bc1c4; BYTE $0xc4 // vsubsd xmm0, xmm0, xmm12 + LONG $0xc059fbc5 // vmulsd xmm0, xmm0, xmm0 + LONG $0xd258fbc5 // vaddsd xmm2, xmm0, xmm2 + LONG $0x01c08348 // add rax, 1 + WORD $0x3948; BYTE $0xc6 // cmp rsi, rax + JNE LBB3_20 + +LBB3_21: + LONG $0x5e6bc1c4; BYTE $0xc3 // vdivsd xmm0, xmm2, xmm11 + +LBB3_22: + BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER + MOVQ X0, x+24(FP) + RET + +LBB3_6: + LONG $0xef3941c4; BYTE $0xc0 // vpxor xmm8, xmm8, xmm8 + WORD $0xd231 // xor edx, edx + LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 + LONG $0xef3141c4; BYTE $0xc9 // vpxor xmm9, xmm9, xmm9 + LONG $0xef2941c4; BYTE $0xd2 // vpxor xmm10, xmm10, xmm10 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JNE LBB3_10 + JMP LBB3_11 + +TEXT sampleVarianceSSE42<>(SB), NOSPLIT, $0-32 + + MOVQ addr+0(FP), DI + MOVQ len+8(FP), SI + MOVQ cap+16(FP), DX + + WORD $0x8548; BYTE $0xf6 // test rsi, rsi + JE LBB3_1 + JLE LBB3_25 + LONG $0x03fe8348 // cmp rsi, 3 + JA LBB3_5 + WORD $0xc031 // xor eax, eax + WORD $0xc931 // xor ecx, ecx + JMP LBB3_13 + +LBB3_1: + WORD $0x570f; BYTE $0xc0 // xorps xmm0, xmm0 + JMP LBB3_24 + +LBB3_25: + LONG $0x2a0f48f2; BYTE $0xce // cvtsi2sd xmm1, rsi + LONG $0xc0570f66 // xorpd xmm0, xmm0 + LONG $0xc15e0ff2 // divsd xmm0, xmm1 + JMP LBB3_24 + +LBB3_5: + WORD $0x8948; BYTE $0xf0 // mov rax, rsi + LONG $0xfce08348 // and rax, -4 + LONG $0xfc508d48 // lea rdx, [rax - 4] + WORD $0x8948; BYTE $0xd1 // mov rcx, rdx + LONG $0x02e9c148 // shr rcx, 2 + LONG $0x01c18348 // add rcx, 1 + WORD $0x8941; BYTE $0xc8 // mov r8d, ecx + LONG $0x03e08341 // and r8d, 3 + LONG $0x0cfa8348 // cmp rdx, 12 + JAE LBB3_7 + LONG $0xc0ef0f66 // pxor xmm0, xmm0 + WORD $0xd231 // xor edx, edx + LONG $0xc9ef0f66 // pxor xmm1, xmm1 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JNE LBB3_10 + JMP LBB3_12 + +LBB3_7: + LONG $0x000001ba; BYTE $0x00 // mov edx, 1 + WORD $0x2948; 
BYTE $0xca // sub rdx, rcx + LONG $0x100c8d49 // lea rcx, [r8 + rdx] + LONG $0xffc18348 // add rcx, -1 + LONG $0xc0ef0f66 // pxor xmm0, xmm0 + WORD $0xd231 // xor edx, edx + LONG $0xc9ef0f66 // pxor xmm1, xmm1 + +LBB3_8: + LONG $0x146f0ff3; BYTE $0xd7 // movdqu xmm2, oword [rdi + 8*rdx] + LONG $0xd0d40f66 // paddq xmm2, xmm0 + LONG $0x446f0ff3; WORD $0x10d7 // movdqu xmm0, oword [rdi + 8*rdx + 16] + LONG $0xc1d40f66 // paddq xmm0, xmm1 + LONG $0x4c6f0ff3; WORD $0x20d7 // movdqu xmm1, oword [rdi + 8*rdx + 32] + LONG $0x5c6f0ff3; WORD $0x30d7 // movdqu xmm3, oword [rdi + 8*rdx + 48] + LONG $0x646f0ff3; WORD $0x40d7 // movdqu xmm4, oword [rdi + 8*rdx + 64] + LONG $0xe1d40f66 // paddq xmm4, xmm1 + LONG $0xe2d40f66 // paddq xmm4, xmm2 + LONG $0x546f0ff3; WORD $0x50d7 // movdqu xmm2, oword [rdi + 8*rdx + 80] + LONG $0xd3d40f66 // paddq xmm2, xmm3 + LONG $0xd0d40f66 // paddq xmm2, xmm0 + LONG $0x446f0ff3; WORD $0x60d7 // movdqu xmm0, oword [rdi + 8*rdx + 96] + LONG $0xc4d40f66 // paddq xmm0, xmm4 + LONG $0x4c6f0ff3; WORD $0x70d7 // movdqu xmm1, oword [rdi + 8*rdx + 112] + LONG $0xcad40f66 // paddq xmm1, xmm2 + LONG $0x10c28348 // add rdx, 16 + LONG $0x04c18348 // add rcx, 4 + JNE LBB3_8 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JE LBB3_12 + +LBB3_10: + LONG $0xd70c8d48 // lea rcx, [rdi + 8*rdx] + LONG $0x10c18348 // add rcx, 16 + WORD $0xf749; BYTE $0xd8 // neg r8 + +LBB3_11: + LONG $0x516f0ff3; BYTE $0xf0 // movdqu xmm2, oword [rcx - 16] + LONG $0xc2d40f66 // paddq xmm0, xmm2 + LONG $0x116f0ff3 // movdqu xmm2, oword [rcx] + LONG $0xcad40f66 // paddq xmm1, xmm2 + LONG $0x20c18348 // add rcx, 32 + LONG $0x01c08349 // add r8, 1 + JNE LBB3_11 + +LBB3_12: + LONG $0xc1d40f66 // paddq xmm0, xmm1 + LONG $0xc8700f66; BYTE $0x4e // pshufd xmm1, xmm0, 78 + LONG $0xc8d40f66 // paddq xmm1, xmm0 + LONG $0x7e0f4866; BYTE $0xc9 // movq rcx, xmm1 + WORD $0x3948; BYTE $0xf0 // cmp rax, rsi + JE LBB3_14 + +LBB3_13: + LONG $0xc70c0348 // add rcx, qword [rdi + 8*rax] + LONG $0x01c08348 // 
add rax, 1 + WORD $0x3948; BYTE $0xc6 // cmp rsi, rax + JNE LBB3_13 + +LBB3_14: + WORD $0x570f; BYTE $0xc9 // xorps xmm1, xmm1 + LONG $0x2a0f48f2; BYTE $0xce // cvtsi2sd xmm1, rsi + WORD $0x8548; BYTE $0xf6 // test rsi, rsi + JLE LBB3_15 + WORD $0x570f; BYTE $0xd2 // xorps xmm2, xmm2 + LONG $0x2a0f48f2; BYTE $0xd1 // cvtsi2sd xmm2, rcx + LONG $0xd15e0ff2 // divsd xmm2, xmm1 + LONG $0xff4e8d48 // lea rcx, [rsi - 1] + WORD $0xf089 // mov eax, esi + WORD $0xe083; BYTE $0x03 // and eax, 3 + LONG $0x03f98348 // cmp rcx, 3 + JAE LBB3_18 + LONG $0xc0ef0f66 // pxor xmm0, xmm0 + WORD $0xc931 // xor ecx, ecx + WORD $0x8548; BYTE $0xc0 // test rax, rax + JNE LBB3_21 + JMP LBB3_23 + +LBB3_15: + LONG $0xc0ef0f66 // pxor xmm0, xmm0 + LONG $0xc15e0ff2 // divsd xmm0, xmm1 + JMP LBB3_24 + +LBB3_18: + WORD $0x2948; BYTE $0xc6 // sub rsi, rax + LONG $0xc0ef0f66 // pxor xmm0, xmm0 + WORD $0xc931 // xor ecx, ecx + +LBB3_19: + WORD $0x570f; BYTE $0xdb // xorps xmm3, xmm3 + LONG $0x2a0f48f2; WORD $0xcf1c // cvtsi2sd xmm3, qword [rdi + 8*rcx] + LONG $0xda5c0ff2 // subsd xmm3, xmm2 + LONG $0xdb590ff2 // mulsd xmm3, xmm3 + WORD $0x570f; BYTE $0xe4 // xorps xmm4, xmm4 + LONG $0x2a0f48f2; WORD $0xcf64; BYTE $0x08 // cvtsi2sd xmm4, qword [rdi + 8*rcx + 8] + LONG $0xd8580ff2 // addsd xmm3, xmm0 + LONG $0xe25c0ff2 // subsd xmm4, xmm2 + WORD $0x570f; BYTE $0xed // xorps xmm5, xmm5 + LONG $0x2a0f48f2; WORD $0xcf6c; BYTE $0x10 // cvtsi2sd xmm5, qword [rdi + 8*rcx + 16] + LONG $0xe4590ff2 // mulsd xmm4, xmm4 + LONG $0xea5c0ff2 // subsd xmm5, xmm2 + LONG $0xed590ff2 // mulsd xmm5, xmm5 + LONG $0xec580ff2 // addsd xmm5, xmm4 + LONG $0xeb580ff2 // addsd xmm5, xmm3 + WORD $0x570f; BYTE $0xc0 // xorps xmm0, xmm0 + LONG $0x2a0f48f2; WORD $0xcf44; BYTE $0x18 // cvtsi2sd xmm0, qword [rdi + 8*rcx + 24] + LONG $0xc25c0ff2 // subsd xmm0, xmm2 + LONG $0xc0590ff2 // mulsd xmm0, xmm0 + LONG $0xc5580ff2 // addsd xmm0, xmm5 + LONG $0x04c18348 // add rcx, 4 + WORD $0x3948; BYTE $0xce // cmp rsi, rcx + JNE LBB3_19 + 
WORD $0x8548; BYTE $0xc0 // test rax, rax + JE LBB3_23 + +LBB3_21: + LONG $0xcf0c8d48 // lea rcx, [rdi + 8*rcx] + WORD $0xd231 // xor edx, edx + +LBB3_22: + WORD $0x570f; BYTE $0xdb // xorps xmm3, xmm3 + LONG $0x2a0f48f2; WORD $0xd11c // cvtsi2sd xmm3, qword [rcx + 8*rdx] + LONG $0xda5c0ff2 // subsd xmm3, xmm2 + LONG $0xdb590ff2 // mulsd xmm3, xmm3 + LONG $0xc3580ff2 // addsd xmm0, xmm3 + LONG $0x01c28348 // add rdx, 1 + WORD $0x3948; BYTE $0xd0 // cmp rax, rdx + JNE LBB3_22 + +LBB3_23: + LONG $0xc15e0ff2 // divsd xmm0, xmm1 + +LBB3_24: + MOVQ X0, x+24(FP) + RET + +DATA LCDATA1<>+0x000(SB)/8, $0x8000000000000000 +DATA LCDATA1<>+0x008(SB)/8, $0x8000000000000000 +DATA LCDATA1<>+0x010(SB)/8, $0x8000000000000000 +DATA LCDATA1<>+0x018(SB)/8, $0x8000000000000000 +GLOBL LCDATA1<>(SB), 8, $32 + +TEXT sampleMaxAVX2<>(SB), NOSPLIT, $0-32 + + MOVQ addr+0(FP), DI + MOVQ len+8(FP), SI + MOVQ cap+16(FP), DX + LEAQ LCDATA1<>(SB), BP + + WORD $0x8548; BYTE $0xf6 // test rsi, rsi + JE LBB1_1 + QUAD $0x000000000000b848; WORD $0x8000 // mov rax, -9223372036854775808 + JLE LBB1_13 + LONG $0x10fe8348 // cmp rsi, 16 + JAE LBB1_5 + WORD $0xc931 // xor ecx, ecx + JMP LBB1_12 + +LBB1_1: + WORD $0xc031 // xor eax, eax + MOVQ AX, x+24(FP) + RET + +LBB1_5: + WORD $0x8948; BYTE $0xf1 // mov rcx, rsi + LONG $0xf0e18348 // and rcx, -16 + LONG $0xf0518d48 // lea rdx, [rcx - 16] + WORD $0x8948; BYTE $0xd0 // mov rax, rdx + LONG $0x04e8c148 // shr rax, 4 + LONG $0x01c08348 // add rax, 1 + WORD $0x8941; BYTE $0xc0 // mov r8d, eax + LONG $0x01e08341 // and r8d, 1 + WORD $0x8548; BYTE $0xd2 // test rdx, rdx + JE LBB1_6 + LONG $0x000001ba; BYTE $0x00 // mov edx, 1 + WORD $0x2948; BYTE $0xc2 // sub rdx, rax + WORD $0x014c; BYTE $0xc2 // add rdx, r8 + LONG $0xffc28348 // add rdx, -1 + LONG $0x597de2c4; WORD $0x0045 // vpbroadcastq ymm0, qword 0[rbp] /* [rip + .LCPI1_0] */ + WORD $0xc031 // xor eax, eax + LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 + LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 + LONG $0xd86ffdc5 // 
vmovdqa ymm3, ymm0 + +LBB1_8: + LONG $0x246ffec5; BYTE $0xc7 // vmovdqu ymm4, yword [rdi + 8*rax] + LONG $0x6c6ffec5; WORD $0x20c7 // vmovdqu ymm5, yword [rdi + 8*rax + 32] + LONG $0x746ffec5; WORD $0x40c7 // vmovdqu ymm6, yword [rdi + 8*rax + 64] + LONG $0x375de2c4; BYTE $0xf8 // vpcmpgtq ymm7, ymm4, ymm0 + LONG $0x4b7de3c4; WORD $0x70c4 // vblendvpd ymm0, ymm0, ymm4, ymm7 + LONG $0x646ffec5; WORD $0x60c7 // vmovdqu ymm4, yword [rdi + 8*rax + 96] + LONG $0x3755e2c4; BYTE $0xf9 // vpcmpgtq ymm7, ymm5, ymm1 + LONG $0x4b75e3c4; WORD $0x70cd // vblendvpd ymm1, ymm1, ymm5, ymm7 + LONG $0x374de2c4; BYTE $0xea // vpcmpgtq ymm5, ymm6, ymm2 + LONG $0x4b6de3c4; WORD $0x50d6 // vblendvpd ymm2, ymm2, ymm6, ymm5 + LONG $0x375de2c4; BYTE $0xeb // vpcmpgtq ymm5, ymm4, ymm3 + LONG $0x4b65e3c4; WORD $0x50dc // vblendvpd ymm3, ymm3, ymm4, ymm5 + QUAD $0x000080c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 128] + QUAD $0x0000a0c7ac6ffec5; BYTE $0x00 // vmovdqu ymm5, yword [rdi + 8*rax + 160] + QUAD $0x0000c0c7b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 8*rax + 192] + LONG $0x375de2c4; BYTE $0xf8 // vpcmpgtq ymm7, ymm4, ymm0 + LONG $0x4b7de3c4; WORD $0x70c4 // vblendvpd ymm0, ymm0, ymm4, ymm7 + QUAD $0x0000e0c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 224] + LONG $0x3755e2c4; BYTE $0xf9 // vpcmpgtq ymm7, ymm5, ymm1 + LONG $0x4b75e3c4; WORD $0x70cd // vblendvpd ymm1, ymm1, ymm5, ymm7 + LONG $0x374de2c4; BYTE $0xea // vpcmpgtq ymm5, ymm6, ymm2 + LONG $0x4b6de3c4; WORD $0x50d6 // vblendvpd ymm2, ymm2, ymm6, ymm5 + LONG $0x375de2c4; BYTE $0xeb // vpcmpgtq ymm5, ymm4, ymm3 + LONG $0x4b65e3c4; WORD $0x50dc // vblendvpd ymm3, ymm3, ymm4, ymm5 + LONG $0x20c08348 // add rax, 32 + LONG $0x02c28348 // add rdx, 2 + JNE LBB1_8 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JE LBB1_11 + +LBB1_10: + LONG $0x646ffec5; WORD $0x60c7 // vmovdqu ymm4, yword [rdi + 8*rax + 96] + LONG $0x375de2c4; BYTE $0xeb // vpcmpgtq ymm5, ymm4, ymm3 + LONG $0x4b65e3c4; WORD $0x50dc 
// vblendvpd ymm3, ymm3, ymm4, ymm5 + LONG $0x646ffec5; WORD $0x40c7 // vmovdqu ymm4, yword [rdi + 8*rax + 64] + LONG $0x375de2c4; BYTE $0xea // vpcmpgtq ymm5, ymm4, ymm2 + LONG $0x4b6de3c4; WORD $0x50d4 // vblendvpd ymm2, ymm2, ymm4, ymm5 + LONG $0x646ffec5; WORD $0x20c7 // vmovdqu ymm4, yword [rdi + 8*rax + 32] + LONG $0x375de2c4; BYTE $0xe9 // vpcmpgtq ymm5, ymm4, ymm1 + LONG $0x4b75e3c4; WORD $0x50cc // vblendvpd ymm1, ymm1, ymm4, ymm5 + LONG $0x246ffec5; BYTE $0xc7 // vmovdqu ymm4, yword [rdi + 8*rax] + LONG $0x375de2c4; BYTE $0xe8 // vpcmpgtq ymm5, ymm4, ymm0 + LONG $0x4b7de3c4; WORD $0x50c4 // vblendvpd ymm0, ymm0, ymm4, ymm5 + +LBB1_11: + LONG $0x377de2c4; BYTE $0xe1 // vpcmpgtq ymm4, ymm0, ymm1 + LONG $0x4b75e3c4; WORD $0x40c0 // vblendvpd ymm0, ymm1, ymm0, ymm4 + LONG $0x377de2c4; BYTE $0xca // vpcmpgtq ymm1, ymm0, ymm2 + LONG $0x4b6de3c4; WORD $0x10c0 // vblendvpd ymm0, ymm2, ymm0, ymm1 + LONG $0x377de2c4; BYTE $0xcb // vpcmpgtq ymm1, ymm0, ymm3 + LONG $0x4b65e3c4; WORD $0x10c0 // vblendvpd ymm0, ymm3, ymm0, ymm1 + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x377de2c4; BYTE $0xd1 // vpcmpgtq ymm2, ymm0, ymm1 + LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2 + LONG $0x0479e3c4; WORD $0x4ec8 // vpermilps xmm1, xmm0, 78 + LONG $0x377de2c4; BYTE $0xd1 // vpcmpgtq ymm2, ymm0, ymm1 + LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2 + LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 + WORD $0x3948; BYTE $0xf1 // cmp rcx, rsi + JE LBB1_13 + +LBB1_12: + LONG $0xcf148b48 // mov rdx, qword [rdi + 8*rcx] + WORD $0x3948; BYTE $0xc2 // cmp rdx, rax + LONG $0xc24d0f48 // cmovge rax, rdx + LONG $0x01c18348 // add rcx, 1 + WORD $0x3948; BYTE $0xce // cmp rsi, rcx + JNE LBB1_12 + +LBB1_13: + BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER + MOVQ AX, x+24(FP) + RET + +LBB1_6: + LONG $0x597de2c4; WORD $0x0045 // vpbroadcastq ymm0, qword 0[rbp] /* [rip + .LCPI1_0] */ + WORD $0xc031 // xor eax, eax + LONG 
$0xc86ffdc5 // vmovdqa ymm1, ymm0 + LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 + LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JNE LBB1_10 + JMP LBB1_11 + +TEXT sampleMaxAVX<>(SB), NOSPLIT, $0-32 + + MOVQ addr+0(FP), DI + MOVQ len+8(FP), SI + MOVQ cap+16(FP), DX + LEAQ LCDATA1<>(SB), BP + + WORD $0x8548; BYTE $0xf6 // test rsi, rsi + JE LBB1_1 + QUAD $0x000000000000b848; WORD $0x8000 // mov rax, -9223372036854775808 + JLE LBB1_13 + LONG $0x10fe8348 // cmp rsi, 16 + JAE LBB1_5 + WORD $0xc931 // xor ecx, ecx + JMP LBB1_12 + +LBB1_1: + WORD $0xc031 // xor eax, eax + MOVQ AX, x+24(FP) + RET + +LBB1_5: + WORD $0x8948; BYTE $0xf1 // mov rcx, rsi + LONG $0xf0e18348 // and rcx, -16 + LONG $0xf0518d48 // lea rdx, [rcx - 16] + WORD $0x8948; BYTE $0xd0 // mov rax, rdx + LONG $0x04e8c148 // shr rax, 4 + LONG $0x01c08348 // add rax, 1 + WORD $0x8941; BYTE $0xc0 // mov r8d, eax + LONG $0x01e08341 // and r8d, 1 + WORD $0x8548; BYTE $0xd2 // test rdx, rdx + JE LBB1_6 + LONG $0x000001ba; BYTE $0x00 // mov edx, 1 + WORD $0x2948; BYTE $0xc2 // sub rdx, rax + WORD $0x014c; BYTE $0xc2 // add rdx, r8 + LONG $0xffc28348 // add rdx, -1 + LONG $0x4d6f7dc5; BYTE $0x00 // vmovdqa ymm9, yword 0[rbp] /* [rip + .LCPI1_0] */ + WORD $0xc031 // xor eax, eax + LONG $0x6f7dc1c4; BYTE $0xd9 // vmovdqa ymm3, ymm9 + LONG $0x6f7dc1c4; BYTE $0xd1 // vmovdqa ymm2, ymm9 + LONG $0x6f7d41c4; BYTE $0xc1 // vmovdqa ymm8, ymm9 + +LBB1_8: + LONG $0x246ffec5; BYTE $0xc7 // vmovdqu ymm4, yword [rdi + 8*rax] + LONG $0x6c6ffec5; WORD $0x20c7 // vmovdqu ymm5, yword [rdi + 8*rax + 32] + LONG $0x746ffec5; WORD $0x40c7 // vmovdqu ymm6, yword [rdi + 8*rax + 64] + LONG $0x197de3c4; WORD $0x01e7 // vextractf128 xmm7, ymm4, 1 + LONG $0x197d63c4; WORD $0x01c9 // vextractf128 xmm1, ymm9, 1 + LONG $0x3741e2c4; BYTE $0xc9 // vpcmpgtq xmm1, xmm7, xmm1 + LONG $0x3759c2c4; BYTE $0xf9 // vpcmpgtq xmm7, xmm4, xmm9 + LONG $0x1845e3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm7, xmm1, 1 + LONG $0x4b3563c4; 
WORD $0x10cc // vblendvpd ymm9, ymm9, ymm4, ymm1 + LONG $0x4c6ffec5; WORD $0x60c7 // vmovdqu ymm1, yword [rdi + 8*rax + 96] + LONG $0x197de3c4; WORD $0x01ec // vextractf128 xmm4, ymm5, 1 + LONG $0x197de3c4; WORD $0x01df // vextractf128 xmm7, ymm3, 1 + LONG $0x3759e2c4; BYTE $0xe7 // vpcmpgtq xmm4, xmm4, xmm7 + LONG $0x3751e2c4; BYTE $0xfb // vpcmpgtq xmm7, xmm5, xmm3 + LONG $0x1845e3c4; WORD $0x01e4 // vinsertf128 ymm4, ymm7, xmm4, 1 + LONG $0x4b65e3c4; WORD $0x40dd // vblendvpd ymm3, ymm3, ymm5, ymm4 + LONG $0x197de3c4; WORD $0x01f4 // vextractf128 xmm4, ymm6, 1 + LONG $0x197de3c4; WORD $0x01d5 // vextractf128 xmm5, ymm2, 1 + LONG $0x3759e2c4; BYTE $0xe5 // vpcmpgtq xmm4, xmm4, xmm5 + LONG $0x3749e2c4; BYTE $0xea // vpcmpgtq xmm5, xmm6, xmm2 + LONG $0x1855e3c4; WORD $0x01e4 // vinsertf128 ymm4, ymm5, xmm4, 1 + LONG $0x4b6de3c4; WORD $0x40d6 // vblendvpd ymm2, ymm2, ymm6, ymm4 + LONG $0x197de3c4; WORD $0x01cc // vextractf128 xmm4, ymm1, 1 + LONG $0x197d63c4; WORD $0x01c5 // vextractf128 xmm5, ymm8, 1 + LONG $0x3759e2c4; BYTE $0xe5 // vpcmpgtq xmm4, xmm4, xmm5 + LONG $0x3771c2c4; BYTE $0xe8 // vpcmpgtq xmm5, xmm1, xmm8 + LONG $0x1855e3c4; WORD $0x01e4 // vinsertf128 ymm4, ymm5, xmm4, 1 + LONG $0x4b3de3c4; WORD $0x40c9 // vblendvpd ymm1, ymm8, ymm1, ymm4 + QUAD $0x000080c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 128] + QUAD $0x0000a0c7ac6ffec5; BYTE $0x00 // vmovdqu ymm5, yword [rdi + 8*rax + 160] + QUAD $0x0000c0c7b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 8*rax + 192] + LONG $0x197d63c4; WORD $0x01cf // vextractf128 xmm7, ymm9, 1 + LONG $0x197de3c4; WORD $0x01e0 // vextractf128 xmm0, ymm4, 1 + LONG $0x3779e2c4; BYTE $0xc7 // vpcmpgtq xmm0, xmm0, xmm7 + LONG $0x3759c2c4; BYTE $0xf9 // vpcmpgtq xmm7, xmm4, xmm9 + LONG $0x1845e3c4; WORD $0x01c0 // vinsertf128 ymm0, ymm7, xmm0, 1 + LONG $0x4b3563c4; WORD $0x00cc // vblendvpd ymm9, ymm9, ymm4, ymm0 + QUAD $0x0000e0c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 224] + LONG 
$0x197de3c4; WORD $0x01df // vextractf128 xmm7, ymm3, 1 + LONG $0x197de3c4; WORD $0x01e8 // vextractf128 xmm0, ymm5, 1 + LONG $0x3779e2c4; BYTE $0xc7 // vpcmpgtq xmm0, xmm0, xmm7 + LONG $0x3751e2c4; BYTE $0xfb // vpcmpgtq xmm7, xmm5, xmm3 + LONG $0x1845e3c4; WORD $0x01c0 // vinsertf128 ymm0, ymm7, xmm0, 1 + LONG $0x4b65e3c4; WORD $0x00dd // vblendvpd ymm3, ymm3, ymm5, ymm0 + LONG $0x197de3c4; WORD $0x01d0 // vextractf128 xmm0, ymm2, 1 + LONG $0x197de3c4; WORD $0x01f5 // vextractf128 xmm5, ymm6, 1 + LONG $0x3751e2c4; BYTE $0xc0 // vpcmpgtq xmm0, xmm5, xmm0 + LONG $0x3749e2c4; BYTE $0xea // vpcmpgtq xmm5, xmm6, xmm2 + LONG $0x1855e3c4; WORD $0x01c0 // vinsertf128 ymm0, ymm5, xmm0, 1 + LONG $0x4b6de3c4; WORD $0x00d6 // vblendvpd ymm2, ymm2, ymm6, ymm0 + LONG $0x197de3c4; WORD $0x01c8 // vextractf128 xmm0, ymm1, 1 + LONG $0x197de3c4; WORD $0x01e5 // vextractf128 xmm5, ymm4, 1 + LONG $0x3751e2c4; BYTE $0xc0 // vpcmpgtq xmm0, xmm5, xmm0 + LONG $0x3759e2c4; BYTE $0xe9 // vpcmpgtq xmm5, xmm4, xmm1 + LONG $0x1855e3c4; WORD $0x01c0 // vinsertf128 ymm0, ymm5, xmm0, 1 + LONG $0x4b7563c4; WORD $0x00c4 // vblendvpd ymm8, ymm1, ymm4, ymm0 + LONG $0x20c08348 // add rax, 32 + LONG $0x02c28348 // add rdx, 2 + JNE LBB1_8 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JE LBB1_11 + +LBB1_10: + LONG $0x446ffec5; WORD $0x60c7 // vmovdqu ymm0, yword [rdi + 8*rax + 96] + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x197d63c4; WORD $0x01c4 // vextractf128 xmm4, ymm8, 1 + LONG $0x3771e2c4; BYTE $0xcc // vpcmpgtq xmm1, xmm1, xmm4 + LONG $0x3779c2c4; BYTE $0xe0 // vpcmpgtq xmm4, xmm0, xmm8 + LONG $0x185de3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm4, xmm1, 1 + LONG $0x4b3d63c4; WORD $0x10c0 // vblendvpd ymm8, ymm8, ymm0, ymm1 + LONG $0x446ffec5; WORD $0x40c7 // vmovdqu ymm0, yword [rdi + 8*rax + 64] + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x197de3c4; WORD $0x01d4 // vextractf128 xmm4, ymm2, 1 + LONG $0x3771e2c4; BYTE $0xcc // vpcmpgtq xmm1, 
xmm1, xmm4 + LONG $0x3779e2c4; BYTE $0xe2 // vpcmpgtq xmm4, xmm0, xmm2 + LONG $0x185de3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm4, xmm1, 1 + LONG $0x4b6de3c4; WORD $0x10d0 // vblendvpd ymm2, ymm2, ymm0, ymm1 + LONG $0x446ffec5; WORD $0x20c7 // vmovdqu ymm0, yword [rdi + 8*rax + 32] + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x197de3c4; WORD $0x01dc // vextractf128 xmm4, ymm3, 1 + LONG $0x3771e2c4; BYTE $0xcc // vpcmpgtq xmm1, xmm1, xmm4 + LONG $0x3779e2c4; BYTE $0xe3 // vpcmpgtq xmm4, xmm0, xmm3 + LONG $0x185de3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm4, xmm1, 1 + LONG $0x4b65e3c4; WORD $0x10d8 // vblendvpd ymm3, ymm3, ymm0, ymm1 + LONG $0x046ffec5; BYTE $0xc7 // vmovdqu ymm0, yword [rdi + 8*rax] + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x197d63c4; WORD $0x01cc // vextractf128 xmm4, ymm9, 1 + LONG $0x3771e2c4; BYTE $0xcc // vpcmpgtq xmm1, xmm1, xmm4 + LONG $0x3779c2c4; BYTE $0xe1 // vpcmpgtq xmm4, xmm0, xmm9 + LONG $0x185de3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm4, xmm1, 1 + LONG $0x4b3563c4; WORD $0x10c8 // vblendvpd ymm9, ymm9, ymm0, ymm1 + +LBB1_11: + LONG $0x197de3c4; WORD $0x01d8 // vextractf128 xmm0, ymm3, 1 + LONG $0x197d63c4; WORD $0x01c9 // vextractf128 xmm1, ymm9, 1 + LONG $0x3771e2c4; BYTE $0xc0 // vpcmpgtq xmm0, xmm1, xmm0 + LONG $0x3731e2c4; BYTE $0xcb // vpcmpgtq xmm1, xmm9, xmm3 + LONG $0x1875e3c4; WORD $0x01c0 // vinsertf128 ymm0, ymm1, xmm0, 1 + LONG $0x4b65c3c4; WORD $0x00c1 // vblendvpd ymm0, ymm3, ymm9, ymm0 + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x197de3c4; WORD $0x01d3 // vextractf128 xmm3, ymm2, 1 + LONG $0x3771e2c4; BYTE $0xcb // vpcmpgtq xmm1, xmm1, xmm3 + LONG $0x3779e2c4; BYTE $0xda // vpcmpgtq xmm3, xmm0, xmm2 + LONG $0x1865e3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm3, xmm1, 1 + LONG $0x4b6de3c4; WORD $0x10c0 // vblendvpd ymm0, ymm2, ymm0, ymm1 + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x197d63c4; WORD $0x01c2 // 
vextractf128 xmm2, ymm8, 1 + LONG $0x3771e2c4; BYTE $0xca // vpcmpgtq xmm1, xmm1, xmm2 + LONG $0x3779c2c4; BYTE $0xd0 // vpcmpgtq xmm2, xmm0, xmm8 + LONG $0x186de3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm2, xmm1, 1 + LONG $0x4b3de3c4; WORD $0x10c0 // vblendvpd ymm0, ymm8, ymm0, ymm1 + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x3779e2c4; BYTE $0xd1 // vpcmpgtq xmm2, xmm0, xmm1 + LONG $0x3771e2c4; BYTE $0xd8 // vpcmpgtq xmm3, xmm1, xmm0 + LONG $0x186de3c4; WORD $0x01d3 // vinsertf128 ymm2, ymm2, xmm3, 1 + LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2 + LONG $0x0479e3c4; WORD $0x4ec8 // vpermilps xmm1, xmm0, 78 + LONG $0x3779e2c4; BYTE $0xd1 // vpcmpgtq xmm2, xmm0, xmm1 + LONG $0x197de3c4; WORD $0x01c3 // vextractf128 xmm3, ymm0, 1 + LONG $0x3761e2c4; BYTE $0xd8 // vpcmpgtq xmm3, xmm3, xmm0 + LONG $0x186de3c4; WORD $0x01d3 // vinsertf128 ymm2, ymm2, xmm3, 1 + LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2 + LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 + WORD $0x3948; BYTE $0xf1 // cmp rcx, rsi + JE LBB1_13 + +LBB1_12: + LONG $0xcf148b48 // mov rdx, qword [rdi + 8*rcx] + WORD $0x3948; BYTE $0xc2 // cmp rdx, rax + LONG $0xc24d0f48 // cmovge rax, rdx + LONG $0x01c18348 // add rcx, 1 + WORD $0x3948; BYTE $0xce // cmp rsi, rcx + JNE LBB1_12 + +LBB1_13: + BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER + MOVQ AX, x+24(FP) + RET + +LBB1_6: + LONG $0x4d6f7dc5; BYTE $0x00 // vmovdqa ymm9, yword 0[rbp] /* [rip + .LCPI1_0] */ + WORD $0xc031 // xor eax, eax + LONG $0x6f7dc1c4; BYTE $0xd9 // vmovdqa ymm3, ymm9 + LONG $0x6f7dc1c4; BYTE $0xd1 // vmovdqa ymm2, ymm9 + LONG $0x6f7d41c4; BYTE $0xc1 // vmovdqa ymm8, ymm9 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JNE LBB1_10 + JMP LBB1_11 + +TEXT sampleMaxSSE42<>(SB), NOSPLIT, $0-32 + + MOVQ addr+0(FP), DI + MOVQ len+8(FP), SI + MOVQ cap+16(FP), DX + LEAQ LCDATA1<>(SB), BP + + WORD $0x8548; BYTE $0xf6 // test rsi, rsi + JE LBB1_1 + QUAD $0x000000000000b848; WORD 
$0x8000 // mov rax, -9223372036854775808 + JLE LBB1_13 + LONG $0x04fe8348 // cmp rsi, 4 + JAE LBB1_5 + WORD $0xc931 // xor ecx, ecx + JMP LBB1_12 + +LBB1_1: + WORD $0xc031 // xor eax, eax + JMP LBB1_13 + +LBB1_5: + WORD $0x8948; BYTE $0xf1 // mov rcx, rsi + LONG $0xfce18348 // and rcx, -4 + LONG $0xfc518d48 // lea rdx, [rcx - 4] + WORD $0x8948; BYTE $0xd0 // mov rax, rdx + LONG $0x02e8c148 // shr rax, 2 + LONG $0x01c08348 // add rax, 1 + WORD $0x8941; BYTE $0xc0 // mov r8d, eax + LONG $0x01e08341 // and r8d, 1 + WORD $0x8548; BYTE $0xd2 // test rdx, rdx + JE LBB1_6 + LONG $0x000001ba; BYTE $0x00 // mov edx, 1 + WORD $0x2948; BYTE $0xc2 // sub rdx, rax + LONG $0x10048d49 // lea rax, [r8 + rdx] + LONG $0xffc08348 // add rax, -1 + LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI1_0] */ + WORD $0xd231 // xor edx, edx + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + +LBB1_8: + LONG $0x1c6f0ff3; BYTE $0xd7 // movdqu xmm3, oword [rdi + 8*rdx] + LONG $0x646f0ff3; WORD $0x10d7 // movdqu xmm4, oword [rdi + 8*rdx + 16] + LONG $0x6c6f0ff3; WORD $0x20d7 // movdqu xmm5, oword [rdi + 8*rdx + 32] + LONG $0xc36f0f66 // movdqa xmm0, xmm3 + LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1 + LONG $0x15380f66; BYTE $0xcb // blendvpd xmm1, xmm3, xmm0 + LONG $0x5c6f0ff3; WORD $0x30d7 // movdqu xmm3, oword [rdi + 8*rdx + 48] + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 + LONG $0x15380f66; BYTE $0xd4 // blendvpd xmm2, xmm4, xmm0 + LONG $0xc56f0f66 // movdqa xmm0, xmm5 + LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1 + LONG $0x15380f66; BYTE $0xcd // blendvpd xmm1, xmm5, xmm0 + LONG $0xc36f0f66 // movdqa xmm0, xmm3 + LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 + LONG $0x15380f66; BYTE $0xd3 // blendvpd xmm2, xmm3, xmm0 + LONG $0x08c28348 // add rdx, 8 + LONG $0x02c08348 // add rax, 2 + JNE LBB1_8 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JE LBB1_11 + +LBB1_10: + LONG $0x5c6f0ff3; WORD $0x10d7 // movdqu xmm3, 
oword [rdi + 8*rdx + 16] + LONG $0xc36f0f66 // movdqa xmm0, xmm3 + LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 + LONG $0x15380f66; BYTE $0xd3 // blendvpd xmm2, xmm3, xmm0 + LONG $0x1c6f0ff3; BYTE $0xd7 // movdqu xmm3, oword [rdi + 8*rdx] + LONG $0xc36f0f66 // movdqa xmm0, xmm3 + LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1 + LONG $0x15380f66; BYTE $0xcb // blendvpd xmm1, xmm3, xmm0 + +LBB1_11: + LONG $0xc16f0f66 // movdqa xmm0, xmm1 + LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 + LONG $0x15380f66; BYTE $0xd1 // blendvpd xmm2, xmm1, xmm0 + LONG $0xca700f66; BYTE $0x4e // pshufd xmm1, xmm2, 78 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1 + LONG $0x15380f66; BYTE $0xca // blendvpd xmm1, xmm2, xmm0 + LONG $0x7e0f4866; BYTE $0xc8 // movq rax, xmm1 + WORD $0x3948; BYTE $0xf1 // cmp rcx, rsi + JE LBB1_13 + +LBB1_12: + LONG $0xcf148b48 // mov rdx, qword [rdi + 8*rcx] + WORD $0x3948; BYTE $0xc2 // cmp rdx, rax + LONG $0xc24d0f48 // cmovge rax, rdx + LONG $0x01c18348 // add rcx, 1 + WORD $0x3948; BYTE $0xce // cmp rsi, rcx + JNE LBB1_12 + +LBB1_13: + MOVQ AX, x+24(FP) + RET + +LBB1_6: + LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI1_0] */ + WORD $0xd231 // xor edx, edx + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JNE LBB1_10 + JMP LBB1_11 + +DATA LCDATA2<>+0x000(SB)/8, $0x7fffffffffffffff +DATA LCDATA2<>+0x008(SB)/8, $0x7fffffffffffffff +DATA LCDATA2<>+0x010(SB)/8, $0x7fffffffffffffff +DATA LCDATA2<>+0x018(SB)/8, $0x7fffffffffffffff +GLOBL LCDATA2<>(SB), 8, $32 + +TEXT sampleMinAVX2<>(SB), NOSPLIT, $0-32 + + MOVQ addr+0(FP), DI + MOVQ len+8(FP), SI + MOVQ cap+16(FP), DX + LEAQ LCDATA2<>(SB), BP + + WORD $0x8548; BYTE $0xf6 // test rsi, rsi + JE LBB2_1 + QUAD $0xffffffffffffb848; WORD $0x7fff // mov rax, 9223372036854775807 + JLE LBB2_13 + LONG $0x10fe8348 // cmp rsi, 16 + JAE LBB2_5 + WORD $0xc931 // xor ecx, ecx + JMP LBB2_12 + +LBB2_1: + 
WORD $0xc031 // xor eax, eax + MOVQ AX, x+24(FP) + RET + +LBB2_5: + WORD $0x8948; BYTE $0xf1 // mov rcx, rsi + LONG $0xf0e18348 // and rcx, -16 + LONG $0xf0518d48 // lea rdx, [rcx - 16] + WORD $0x8948; BYTE $0xd0 // mov rax, rdx + LONG $0x04e8c148 // shr rax, 4 + LONG $0x01c08348 // add rax, 1 + WORD $0x8941; BYTE $0xc0 // mov r8d, eax + LONG $0x01e08341 // and r8d, 1 + WORD $0x8548; BYTE $0xd2 // test rdx, rdx + JE LBB2_6 + LONG $0x000001ba; BYTE $0x00 // mov edx, 1 + WORD $0x2948; BYTE $0xc2 // sub rdx, rax + WORD $0x014c; BYTE $0xc2 // add rdx, r8 + LONG $0xffc28348 // add rdx, -1 + LONG $0x597de2c4; WORD $0x0045 // vpbroadcastq ymm0, qword 0[rbp] /* [rip + .LCPI2_0] */ + WORD $0xc031 // xor eax, eax + LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 + LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 + LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 + +LBB2_8: + LONG $0x246ffec5; BYTE $0xc7 // vmovdqu ymm4, yword [rdi + 8*rax] + LONG $0x6c6ffec5; WORD $0x20c7 // vmovdqu ymm5, yword [rdi + 8*rax + 32] + LONG $0x746ffec5; WORD $0x40c7 // vmovdqu ymm6, yword [rdi + 8*rax + 64] + LONG $0x377de2c4; BYTE $0xfc // vpcmpgtq ymm7, ymm0, ymm4 + LONG $0x4b7de3c4; WORD $0x70c4 // vblendvpd ymm0, ymm0, ymm4, ymm7 + LONG $0x646ffec5; WORD $0x60c7 // vmovdqu ymm4, yword [rdi + 8*rax + 96] + LONG $0x3775e2c4; BYTE $0xfd // vpcmpgtq ymm7, ymm1, ymm5 + LONG $0x4b75e3c4; WORD $0x70cd // vblendvpd ymm1, ymm1, ymm5, ymm7 + LONG $0x376de2c4; BYTE $0xee // vpcmpgtq ymm5, ymm2, ymm6 + LONG $0x4b6de3c4; WORD $0x50d6 // vblendvpd ymm2, ymm2, ymm6, ymm5 + LONG $0x3765e2c4; BYTE $0xec // vpcmpgtq ymm5, ymm3, ymm4 + LONG $0x4b65e3c4; WORD $0x50dc // vblendvpd ymm3, ymm3, ymm4, ymm5 + QUAD $0x000080c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 128] + QUAD $0x0000a0c7ac6ffec5; BYTE $0x00 // vmovdqu ymm5, yword [rdi + 8*rax + 160] + QUAD $0x0000c0c7b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 8*rax + 192] + LONG $0x377de2c4; BYTE $0xfc // vpcmpgtq ymm7, ymm0, ymm4 + LONG $0x4b7de3c4; WORD $0x70c4 // 
vblendvpd ymm0, ymm0, ymm4, ymm7 + QUAD $0x0000e0c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 224] + LONG $0x3775e2c4; BYTE $0xfd // vpcmpgtq ymm7, ymm1, ymm5 + LONG $0x4b75e3c4; WORD $0x70cd // vblendvpd ymm1, ymm1, ymm5, ymm7 + LONG $0x376de2c4; BYTE $0xee // vpcmpgtq ymm5, ymm2, ymm6 + LONG $0x4b6de3c4; WORD $0x50d6 // vblendvpd ymm2, ymm2, ymm6, ymm5 + LONG $0x3765e2c4; BYTE $0xec // vpcmpgtq ymm5, ymm3, ymm4 + LONG $0x4b65e3c4; WORD $0x50dc // vblendvpd ymm3, ymm3, ymm4, ymm5 + LONG $0x20c08348 // add rax, 32 + LONG $0x02c28348 // add rdx, 2 + JNE LBB2_8 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JE LBB2_11 + +LBB2_10: + LONG $0x646ffec5; WORD $0x60c7 // vmovdqu ymm4, yword [rdi + 8*rax + 96] + LONG $0x3765e2c4; BYTE $0xec // vpcmpgtq ymm5, ymm3, ymm4 + LONG $0x4b65e3c4; WORD $0x50dc // vblendvpd ymm3, ymm3, ymm4, ymm5 + LONG $0x646ffec5; WORD $0x40c7 // vmovdqu ymm4, yword [rdi + 8*rax + 64] + LONG $0x376de2c4; BYTE $0xec // vpcmpgtq ymm5, ymm2, ymm4 + LONG $0x4b6de3c4; WORD $0x50d4 // vblendvpd ymm2, ymm2, ymm4, ymm5 + LONG $0x646ffec5; WORD $0x20c7 // vmovdqu ymm4, yword [rdi + 8*rax + 32] + LONG $0x3775e2c4; BYTE $0xec // vpcmpgtq ymm5, ymm1, ymm4 + LONG $0x4b75e3c4; WORD $0x50cc // vblendvpd ymm1, ymm1, ymm4, ymm5 + LONG $0x246ffec5; BYTE $0xc7 // vmovdqu ymm4, yword [rdi + 8*rax] + LONG $0x377de2c4; BYTE $0xec // vpcmpgtq ymm5, ymm0, ymm4 + LONG $0x4b7de3c4; WORD $0x50c4 // vblendvpd ymm0, ymm0, ymm4, ymm5 + +LBB2_11: + LONG $0x3775e2c4; BYTE $0xe0 // vpcmpgtq ymm4, ymm1, ymm0 + LONG $0x4b75e3c4; WORD $0x40c0 // vblendvpd ymm0, ymm1, ymm0, ymm4 + LONG $0x376de2c4; BYTE $0xc8 // vpcmpgtq ymm1, ymm2, ymm0 + LONG $0x4b6de3c4; WORD $0x10c0 // vblendvpd ymm0, ymm2, ymm0, ymm1 + LONG $0x3765e2c4; BYTE $0xc8 // vpcmpgtq ymm1, ymm3, ymm0 + LONG $0x4b65e3c4; WORD $0x10c0 // vblendvpd ymm0, ymm3, ymm0, ymm1 + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x3775e2c4; BYTE $0xd0 // vpcmpgtq ymm2, ymm1, ymm0 + LONG $0x4b75e3c4; 
WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2 + LONG $0x0479e3c4; WORD $0x4ec8 // vpermilps xmm1, xmm0, 78 + LONG $0x3775e2c4; BYTE $0xd0 // vpcmpgtq ymm2, ymm1, ymm0 + LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2 + LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 + WORD $0x3948; BYTE $0xf1 // cmp rcx, rsi + JE LBB2_13 + +LBB2_12: + LONG $0xcf148b48 // mov rdx, qword [rdi + 8*rcx] + WORD $0x3948; BYTE $0xc2 // cmp rdx, rax + LONG $0xc24e0f48 // cmovle rax, rdx + LONG $0x01c18348 // add rcx, 1 + WORD $0x3948; BYTE $0xce // cmp rsi, rcx + JNE LBB2_12 + +LBB2_13: + BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER + MOVQ AX, x+24(FP) + RET + +LBB2_6: + LONG $0x597de2c4; WORD $0x0045 // vpbroadcastq ymm0, qword 0[rbp] /* [rip + .LCPI2_0] */ + WORD $0xc031 // xor eax, eax + LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 + LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 + LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JNE LBB2_10 + JMP LBB2_11 + +TEXT sampleMinAVX<>(SB), NOSPLIT, $0-32 + + MOVQ addr+0(FP), DI + MOVQ len+8(FP), SI + MOVQ cap+16(FP), DX + LEAQ LCDATA2<>(SB), BP + + WORD $0x8548; BYTE $0xf6 // test rsi, rsi + JE LBB2_1 + QUAD $0xffffffffffffb848; WORD $0x7fff // mov rax, 9223372036854775807 + JLE LBB2_13 + LONG $0x10fe8348 // cmp rsi, 16 + JAE LBB2_5 + WORD $0xc931 // xor ecx, ecx + JMP LBB2_12 + +LBB2_1: + WORD $0xc031 // xor eax, eax + MOVQ AX, x+24(FP) + RET + +LBB2_5: + WORD $0x8948; BYTE $0xf1 // mov rcx, rsi + LONG $0xf0e18348 // and rcx, -16 + LONG $0xf0518d48 // lea rdx, [rcx - 16] + WORD $0x8948; BYTE $0xd0 // mov rax, rdx + LONG $0x04e8c148 // shr rax, 4 + LONG $0x01c08348 // add rax, 1 + WORD $0x8941; BYTE $0xc0 // mov r8d, eax + LONG $0x01e08341 // and r8d, 1 + WORD $0x8548; BYTE $0xd2 // test rdx, rdx + JE LBB2_6 + LONG $0x000001ba; BYTE $0x00 // mov edx, 1 + WORD $0x2948; BYTE $0xc2 // sub rdx, rax + WORD $0x014c; BYTE $0xc2 // add rdx, r8 + LONG $0xffc28348 // add rdx, -1 + LONG $0x4d6f7dc5; BYTE 
$0x00 // vmovdqa ymm9, yword 0[rbp] /* [rip + .LCPI2_0] */ + WORD $0xc031 // xor eax, eax + LONG $0x6f7dc1c4; BYTE $0xd9 // vmovdqa ymm3, ymm9 + LONG $0x6f7dc1c4; BYTE $0xd1 // vmovdqa ymm2, ymm9 + LONG $0x6f7d41c4; BYTE $0xc1 // vmovdqa ymm8, ymm9 + +LBB2_8: + LONG $0x246ffec5; BYTE $0xc7 // vmovdqu ymm4, yword [rdi + 8*rax] + LONG $0x6c6ffec5; WORD $0x20c7 // vmovdqu ymm5, yword [rdi + 8*rax + 32] + LONG $0x746ffec5; WORD $0x40c7 // vmovdqu ymm6, yword [rdi + 8*rax + 64] + LONG $0x197de3c4; WORD $0x01e7 // vextractf128 xmm7, ymm4, 1 + LONG $0x197d63c4; WORD $0x01c9 // vextractf128 xmm1, ymm9, 1 + LONG $0x3771e2c4; BYTE $0xcf // vpcmpgtq xmm1, xmm1, xmm7 + LONG $0x3731e2c4; BYTE $0xfc // vpcmpgtq xmm7, xmm9, xmm4 + LONG $0x1845e3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm7, xmm1, 1 + LONG $0x4b3563c4; WORD $0x10cc // vblendvpd ymm9, ymm9, ymm4, ymm1 + LONG $0x4c6ffec5; WORD $0x60c7 // vmovdqu ymm1, yword [rdi + 8*rax + 96] + LONG $0x197de3c4; WORD $0x01ec // vextractf128 xmm4, ymm5, 1 + LONG $0x197de3c4; WORD $0x01df // vextractf128 xmm7, ymm3, 1 + LONG $0x3741e2c4; BYTE $0xe4 // vpcmpgtq xmm4, xmm7, xmm4 + LONG $0x3761e2c4; BYTE $0xfd // vpcmpgtq xmm7, xmm3, xmm5 + LONG $0x1845e3c4; WORD $0x01e4 // vinsertf128 ymm4, ymm7, xmm4, 1 + LONG $0x4b65e3c4; WORD $0x40dd // vblendvpd ymm3, ymm3, ymm5, ymm4 + LONG $0x197de3c4; WORD $0x01f4 // vextractf128 xmm4, ymm6, 1 + LONG $0x197de3c4; WORD $0x01d5 // vextractf128 xmm5, ymm2, 1 + LONG $0x3751e2c4; BYTE $0xe4 // vpcmpgtq xmm4, xmm5, xmm4 + LONG $0x3769e2c4; BYTE $0xee // vpcmpgtq xmm5, xmm2, xmm6 + LONG $0x1855e3c4; WORD $0x01e4 // vinsertf128 ymm4, ymm5, xmm4, 1 + LONG $0x4b6de3c4; WORD $0x40d6 // vblendvpd ymm2, ymm2, ymm6, ymm4 + LONG $0x197de3c4; WORD $0x01cc // vextractf128 xmm4, ymm1, 1 + LONG $0x197d63c4; WORD $0x01c5 // vextractf128 xmm5, ymm8, 1 + LONG $0x3751e2c4; BYTE $0xe4 // vpcmpgtq xmm4, xmm5, xmm4 + LONG $0x3739e2c4; BYTE $0xe9 // vpcmpgtq xmm5, xmm8, xmm1 + LONG $0x1855e3c4; WORD $0x01e4 // vinsertf128 
ymm4, ymm5, xmm4, 1 + LONG $0x4b3de3c4; WORD $0x40c9 // vblendvpd ymm1, ymm8, ymm1, ymm4 + QUAD $0x000080c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 128] + QUAD $0x0000a0c7ac6ffec5; BYTE $0x00 // vmovdqu ymm5, yword [rdi + 8*rax + 160] + QUAD $0x0000c0c7b46ffec5; BYTE $0x00 // vmovdqu ymm6, yword [rdi + 8*rax + 192] + LONG $0x197d63c4; WORD $0x01cf // vextractf128 xmm7, ymm9, 1 + LONG $0x197de3c4; WORD $0x01e0 // vextractf128 xmm0, ymm4, 1 + LONG $0x3741e2c4; BYTE $0xc0 // vpcmpgtq xmm0, xmm7, xmm0 + LONG $0x3731e2c4; BYTE $0xfc // vpcmpgtq xmm7, xmm9, xmm4 + LONG $0x1845e3c4; WORD $0x01c0 // vinsertf128 ymm0, ymm7, xmm0, 1 + LONG $0x4b3563c4; WORD $0x00cc // vblendvpd ymm9, ymm9, ymm4, ymm0 + QUAD $0x0000e0c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 224] + LONG $0x197de3c4; WORD $0x01df // vextractf128 xmm7, ymm3, 1 + LONG $0x197de3c4; WORD $0x01e8 // vextractf128 xmm0, ymm5, 1 + LONG $0x3741e2c4; BYTE $0xc0 // vpcmpgtq xmm0, xmm7, xmm0 + LONG $0x3761e2c4; BYTE $0xfd // vpcmpgtq xmm7, xmm3, xmm5 + LONG $0x1845e3c4; WORD $0x01c0 // vinsertf128 ymm0, ymm7, xmm0, 1 + LONG $0x4b65e3c4; WORD $0x00dd // vblendvpd ymm3, ymm3, ymm5, ymm0 + LONG $0x197de3c4; WORD $0x01d0 // vextractf128 xmm0, ymm2, 1 + LONG $0x197de3c4; WORD $0x01f5 // vextractf128 xmm5, ymm6, 1 + LONG $0x3779e2c4; BYTE $0xc5 // vpcmpgtq xmm0, xmm0, xmm5 + LONG $0x3769e2c4; BYTE $0xee // vpcmpgtq xmm5, xmm2, xmm6 + LONG $0x1855e3c4; WORD $0x01c0 // vinsertf128 ymm0, ymm5, xmm0, 1 + LONG $0x4b6de3c4; WORD $0x00d6 // vblendvpd ymm2, ymm2, ymm6, ymm0 + LONG $0x197de3c4; WORD $0x01c8 // vextractf128 xmm0, ymm1, 1 + LONG $0x197de3c4; WORD $0x01e5 // vextractf128 xmm5, ymm4, 1 + LONG $0x3779e2c4; BYTE $0xc5 // vpcmpgtq xmm0, xmm0, xmm5 + LONG $0x3771e2c4; BYTE $0xec // vpcmpgtq xmm5, xmm1, xmm4 + LONG $0x1855e3c4; WORD $0x01c0 // vinsertf128 ymm0, ymm5, xmm0, 1 + LONG $0x4b7563c4; WORD $0x00c4 // vblendvpd ymm8, ymm1, ymm4, ymm0 + LONG $0x20c08348 // add rax, 32 + LONG 
$0x02c28348 // add rdx, 2 + JNE LBB2_8 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JE LBB2_11 + +LBB2_10: + LONG $0x446ffec5; WORD $0x60c7 // vmovdqu ymm0, yword [rdi + 8*rax + 96] + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x197d63c4; WORD $0x01c4 // vextractf128 xmm4, ymm8, 1 + LONG $0x3759e2c4; BYTE $0xc9 // vpcmpgtq xmm1, xmm4, xmm1 + LONG $0x3739e2c4; BYTE $0xe0 // vpcmpgtq xmm4, xmm8, xmm0 + LONG $0x185de3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm4, xmm1, 1 + LONG $0x4b3d63c4; WORD $0x10c0 // vblendvpd ymm8, ymm8, ymm0, ymm1 + LONG $0x446ffec5; WORD $0x40c7 // vmovdqu ymm0, yword [rdi + 8*rax + 64] + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x197de3c4; WORD $0x01d4 // vextractf128 xmm4, ymm2, 1 + LONG $0x3759e2c4; BYTE $0xc9 // vpcmpgtq xmm1, xmm4, xmm1 + LONG $0x3769e2c4; BYTE $0xe0 // vpcmpgtq xmm4, xmm2, xmm0 + LONG $0x185de3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm4, xmm1, 1 + LONG $0x4b6de3c4; WORD $0x10d0 // vblendvpd ymm2, ymm2, ymm0, ymm1 + LONG $0x446ffec5; WORD $0x20c7 // vmovdqu ymm0, yword [rdi + 8*rax + 32] + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x197de3c4; WORD $0x01dc // vextractf128 xmm4, ymm3, 1 + LONG $0x3759e2c4; BYTE $0xc9 // vpcmpgtq xmm1, xmm4, xmm1 + LONG $0x3761e2c4; BYTE $0xe0 // vpcmpgtq xmm4, xmm3, xmm0 + LONG $0x185de3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm4, xmm1, 1 + LONG $0x4b65e3c4; WORD $0x10d8 // vblendvpd ymm3, ymm3, ymm0, ymm1 + LONG $0x046ffec5; BYTE $0xc7 // vmovdqu ymm0, yword [rdi + 8*rax] + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x197d63c4; WORD $0x01cc // vextractf128 xmm4, ymm9, 1 + LONG $0x3759e2c4; BYTE $0xc9 // vpcmpgtq xmm1, xmm4, xmm1 + LONG $0x3731e2c4; BYTE $0xe0 // vpcmpgtq xmm4, xmm9, xmm0 + LONG $0x185de3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm4, xmm1, 1 + LONG $0x4b3563c4; WORD $0x10c8 // vblendvpd ymm9, ymm9, ymm0, ymm1 + +LBB2_11: + LONG $0x197d63c4; WORD $0x01c8 // vextractf128 xmm0, 
ymm9, 1 + LONG $0x197de3c4; WORD $0x01d9 // vextractf128 xmm1, ymm3, 1 + LONG $0x3771e2c4; BYTE $0xc0 // vpcmpgtq xmm0, xmm1, xmm0 + LONG $0x3761c2c4; BYTE $0xc9 // vpcmpgtq xmm1, xmm3, xmm9 + LONG $0x1875e3c4; WORD $0x01c0 // vinsertf128 ymm0, ymm1, xmm0, 1 + LONG $0x4b65c3c4; WORD $0x00c1 // vblendvpd ymm0, ymm3, ymm9, ymm0 + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x197de3c4; WORD $0x01d3 // vextractf128 xmm3, ymm2, 1 + LONG $0x3761e2c4; BYTE $0xc9 // vpcmpgtq xmm1, xmm3, xmm1 + LONG $0x3769e2c4; BYTE $0xd8 // vpcmpgtq xmm3, xmm2, xmm0 + LONG $0x1865e3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm3, xmm1, 1 + LONG $0x4b6de3c4; WORD $0x10c0 // vblendvpd ymm0, ymm2, ymm0, ymm1 + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x197d63c4; WORD $0x01c2 // vextractf128 xmm2, ymm8, 1 + LONG $0x3769e2c4; BYTE $0xc9 // vpcmpgtq xmm1, xmm2, xmm1 + LONG $0x3739e2c4; BYTE $0xd0 // vpcmpgtq xmm2, xmm8, xmm0 + LONG $0x186de3c4; WORD $0x01c9 // vinsertf128 ymm1, ymm2, xmm1, 1 + LONG $0x4b3de3c4; WORD $0x10c0 // vblendvpd ymm0, ymm8, ymm0, ymm1 + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x3771e2c4; BYTE $0xd0 // vpcmpgtq xmm2, xmm1, xmm0 + LONG $0x3779e2c4; BYTE $0xd9 // vpcmpgtq xmm3, xmm0, xmm1 + LONG $0x186de3c4; WORD $0x01d3 // vinsertf128 ymm2, ymm2, xmm3, 1 + LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2 + LONG $0x0479e3c4; WORD $0x4ec8 // vpermilps xmm1, xmm0, 78 + LONG $0x3771e2c4; BYTE $0xd0 // vpcmpgtq xmm2, xmm1, xmm0 + LONG $0x197de3c4; WORD $0x01c3 // vextractf128 xmm3, ymm0, 1 + LONG $0x3779e2c4; BYTE $0xdb // vpcmpgtq xmm3, xmm0, xmm3 + LONG $0x186de3c4; WORD $0x01d3 // vinsertf128 ymm2, ymm2, xmm3, 1 + LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2 + LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 + WORD $0x3948; BYTE $0xf1 // cmp rcx, rsi + JE LBB2_13 + +LBB2_12: + LONG $0xcf148b48 // mov rdx, qword [rdi + 8*rcx] + WORD $0x3948; BYTE $0xc2 // cmp 
rdx, rax + LONG $0xc24e0f48 // cmovle rax, rdx + LONG $0x01c18348 // add rcx, 1 + WORD $0x3948; BYTE $0xce // cmp rsi, rcx + JNE LBB2_12 + +LBB2_13: + BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER + MOVQ AX, x+24(FP) + RET + +LBB2_6: + LONG $0x4d6f7dc5; BYTE $0x00 // vmovdqa ymm9, yword 0[rbp] /* [rip + .LCPI2_0] */ + WORD $0xc031 // xor eax, eax + LONG $0x6f7dc1c4; BYTE $0xd9 // vmovdqa ymm3, ymm9 + LONG $0x6f7dc1c4; BYTE $0xd1 // vmovdqa ymm2, ymm9 + LONG $0x6f7d41c4; BYTE $0xc1 // vmovdqa ymm8, ymm9 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JNE LBB2_10 + JMP LBB2_11 + +TEXT sampleMinSSE42<>(SB), NOSPLIT, $0-32 + + MOVQ addr+0(FP), DI + MOVQ len+8(FP), SI + MOVQ cap+16(FP), DX + LEAQ LCDATA2<>(SB), BP + + WORD $0x8548; BYTE $0xf6 // test rsi, rsi + JE LBB2_1 + QUAD $0xffffffffffffb848; WORD $0x7fff // mov rax, 9223372036854775807 + JLE LBB2_13 + LONG $0x04fe8348 // cmp rsi, 4 + JAE LBB2_5 + WORD $0xc931 // xor ecx, ecx + JMP LBB2_12 + +LBB2_1: + WORD $0xc031 // xor eax, eax + JMP LBB2_13 + +LBB2_5: + WORD $0x8948; BYTE $0xf1 // mov rcx, rsi + LONG $0xfce18348 // and rcx, -4 + LONG $0xfc518d48 // lea rdx, [rcx - 4] + WORD $0x8948; BYTE $0xd0 // mov rax, rdx + LONG $0x02e8c148 // shr rax, 2 + LONG $0x01c08348 // add rax, 1 + WORD $0x8941; BYTE $0xc0 // mov r8d, eax + LONG $0x01e08341 // and r8d, 1 + WORD $0x8548; BYTE $0xd2 // test rdx, rdx + JE LBB2_6 + LONG $0x000001ba; BYTE $0x00 // mov edx, 1 + WORD $0x2948; BYTE $0xc2 // sub rdx, rax + LONG $0x10048d49 // lea rax, [r8 + rdx] + LONG $0xffc08348 // add rax, -1 + LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI2_0] */ + WORD $0xd231 // xor edx, edx + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + +LBB2_8: + LONG $0x1c6f0ff3; BYTE $0xd7 // movdqu xmm3, oword [rdi + 8*rdx] + LONG $0x646f0ff3; WORD $0x10d7 // movdqu xmm4, oword [rdi + 8*rdx + 16] + LONG $0x6c6f0ff3; WORD $0x20d7 // movdqu xmm5, oword [rdi + 8*rdx + 32] + LONG $0xc16f0f66 // movdqa xmm0, xmm1 + LONG $0x37380f66; BYTE $0xc3 // 
pcmpgtq xmm0, xmm3 + LONG $0x15380f66; BYTE $0xcb // blendvpd xmm1, xmm3, xmm0 + LONG $0x5c6f0ff3; WORD $0x30d7 // movdqu xmm3, oword [rdi + 8*rdx + 48] + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0x37380f66; BYTE $0xc4 // pcmpgtq xmm0, xmm4 + LONG $0x15380f66; BYTE $0xd4 // blendvpd xmm2, xmm4, xmm0 + LONG $0xc1280f66 // movapd xmm0, xmm1 + LONG $0x37380f66; BYTE $0xc5 // pcmpgtq xmm0, xmm5 + LONG $0x15380f66; BYTE $0xcd // blendvpd xmm1, xmm5, xmm0 + LONG $0xc2280f66 // movapd xmm0, xmm2 + LONG $0x37380f66; BYTE $0xc3 // pcmpgtq xmm0, xmm3 + LONG $0x15380f66; BYTE $0xd3 // blendvpd xmm2, xmm3, xmm0 + LONG $0x08c28348 // add rdx, 8 + LONG $0x02c08348 // add rax, 2 + JNE LBB2_8 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JE LBB2_11 + +LBB2_10: + LONG $0x5c6f0ff3; WORD $0x10d7 // movdqu xmm3, oword [rdi + 8*rdx + 16] + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0x37380f66; BYTE $0xc3 // pcmpgtq xmm0, xmm3 + LONG $0x15380f66; BYTE $0xd3 // blendvpd xmm2, xmm3, xmm0 + LONG $0x1c6f0ff3; BYTE $0xd7 // movdqu xmm3, oword [rdi + 8*rdx] + LONG $0xc16f0f66 // movdqa xmm0, xmm1 + LONG $0x37380f66; BYTE $0xc3 // pcmpgtq xmm0, xmm3 + LONG $0x15380f66; BYTE $0xcb // blendvpd xmm1, xmm3, xmm0 + +LBB2_11: + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1 + LONG $0x15380f66; BYTE $0xd1 // blendvpd xmm2, xmm1, xmm0 + LONG $0xca700f66; BYTE $0x4e // pshufd xmm1, xmm2, 78 + LONG $0xc16f0f66 // movdqa xmm0, xmm1 + LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 + LONG $0x15380f66; BYTE $0xca // blendvpd xmm1, xmm2, xmm0 + LONG $0x7e0f4866; BYTE $0xc8 // movq rax, xmm1 + WORD $0x3948; BYTE $0xf1 // cmp rcx, rsi + JE LBB2_13 + +LBB2_12: + LONG $0xcf148b48 // mov rdx, qword [rdi + 8*rcx] + WORD $0x3948; BYTE $0xc2 // cmp rdx, rax + LONG $0xc24e0f48 // cmovle rax, rdx + LONG $0x01c18348 // add rcx, 1 + WORD $0x3948; BYTE $0xce // cmp rsi, rcx + JNE LBB2_12 + +LBB2_13: + MOVQ AX, x+24(FP) + RET + +LBB2_6: + LONG $0x4d6f0f66; BYTE $0x00 
// movdqa xmm1, oword 0[rbp] /* [rip + .LCPI2_0] */ + WORD $0xd231 // xor edx, edx + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + WORD $0x854d; BYTE $0xc0 // test r8, r8 + JNE LBB2_10 + JMP LBB2_11 diff --git a/sample_default.s b/sample_default.s new file mode 100644 index 0000000..a33d8c8 --- /dev/null +++ b/sample_default.s @@ -0,0 +1,15 @@ +// +build !amd64,!gccgo + +#include "textflag.h" + +TEXT ·SampleSum(SB), NOSPLIT, $0-32 + JMP ·sampleSum(SB) + +TEXT ·SampleVariance(SB), NOSPLIT, $0-32 + JMP ·sampleVariance(SB) + +TEXT ·SampleMin(SB), NOSPLIT, $0-32 + JMP ·sampleMin(SB) + +TEXT ·SampleMax(SB), NOSPLIT, $0-32 + JMP ·sampleMax(SB) diff --git a/sample_test.go b/sample_test.go index d60e99c..cbc73a6 100644 --- a/sample_test.go +++ b/sample_test.go @@ -1,12 +1,21 @@ package metrics import ( + "math" "math/rand" "runtime" "testing" "time" ) +func float64NotEqual(a, b float64) bool { + v := math.Abs(a - b) + if b == 0.0 { + return v > 0.00001 + } + return math.Abs(v/b) > 0.00001 +} + // Benchmark{Compute,Copy}{1000,1000000} demonstrate that, even for relatively // expensive computations like Variance, the cost of copying the Sample, as + approximated by a make and copy, is much greater than the cost of the @@ -54,6 +63,83 @@ func BenchmarkCopy1000000(b *testing.B) { } } +func BenchmarkSampleCalc4KSampleSum(b *testing.B) { + s := make([]int64, 4096) + for i := 0; i < len(s); i++ { + s[i] = int64(i) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + SampleSum(s) + } +} + +func BenchmarkSampleCalc4KSampleMax(b *testing.B) { + s := make([]int64, 4096) + for i := 0; i < len(s); i++ { + s[i] = int64(i) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + SampleMax(s) + } +} + +func BenchmarkSampleCalc4KSampleMin(b *testing.B) { + s := make([]int64, 4096) + for i := 0; i < len(s); i++ { + s[i] = int64(i) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + SampleMin(s) + } +} + +func BenchmarkSampleCalc4KSampleMean(b *testing.B) { + s := make([]int64, 4096) + for i
:= 0; i < len(s); i++ { + s[i] = int64(i) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + SampleMean(s) + } +} + +func BenchmarkSampleCalc4KSampleStdDev(b *testing.B) { + s := make([]int64, 4096) + for i := 0; i < len(s); i++ { + s[i] = int64(i) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + SampleStdDev(s) + } +} + +func BenchmarkSampleCalc4KSampleVariance(b *testing.B) { + s := make([]int64, 4096) + for i := 0; i < len(s); i++ { + s[i] = int64(i) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + SampleVariance(s) + } +} + +func BenchmarkSampleCalc4KSamplePercentile(b *testing.B) { + s := make([]int64, 4096) + for i := 0; i < len(s); i++ { + s[i] = int64(i) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + SamplePercentile(s, 0.5) + } +} + func BenchmarkExpDecaySample257(b *testing.B) { benchmarkSample(b, NewExpDecaySample(257, 0.015)) } @@ -285,20 +371,20 @@ func testExpDecaySampleStatistics(t *testing.T, s Sample) { if max := s.Max(); 10000 != max { t.Errorf("s.Max(): 10000 != %v\n", max) } - if mean := s.Mean(); 4965.98 != mean { + if mean := s.Mean(); float64NotEqual(4965.98, mean) { t.Errorf("s.Mean(): 4965.98 != %v\n", mean) } - if stdDev := s.StdDev(); 2959.825156930727 != stdDev { + if stdDev := s.StdDev(); float64NotEqual(2959.825156930727, stdDev) { t.Errorf("s.StdDev(): 2959.825156930727 != %v\n", stdDev) } ps := s.Percentiles([]float64{0.5, 0.75, 0.99}) - if 4615 != ps[0] { + if float64NotEqual(4615, ps[0]) { t.Errorf("median: 4615 != %v\n", ps[0]) } - if 7672 != ps[1] { + if float64NotEqual(7672, ps[1]) { t.Errorf("75th percentile: 7672 != %v\n", ps[1]) } - if 9998.99 != ps[2] { + if float64NotEqual(9998.99, ps[2]) { t.Errorf("99th percentile: 9998.99 != %v\n", ps[2]) } } @@ -313,20 +399,20 @@ func testUniformSampleStatistics(t *testing.T, s Sample) { if max := s.Max(); 9989 != max { t.Errorf("s.Max(): 9989 != %v\n", max) } - if mean := s.Mean(); 4748.14 != mean { + if mean := s.Mean(); float64NotEqual(4748.14, mean) { t.Errorf("s.Mean(): 
4748.14 != %v\n", mean) } - if stdDev := s.StdDev(); 2826.684117548333 != stdDev { + if stdDev := s.StdDev(); float64NotEqual(2826.684117548333, stdDev) { t.Errorf("s.StdDev(): 2826.684117548333 != %v\n", stdDev) } ps := s.Percentiles([]float64{0.5, 0.75, 0.99}) - if 4599 != ps[0] { + if float64NotEqual(4599, ps[0]) { t.Errorf("median: 4599 != %v\n", ps[0]) } - if 7380.5 != ps[1] { + if float64NotEqual(7380.5, ps[1]) { t.Errorf("75th percentile: 7380.5 != %v\n", ps[1]) } - if 9986.429999999998 != ps[2] { + if float64NotEqual(9986.429999999998, ps[2]) { t.Errorf("99th percentile: 9986.429999999998 != %v\n", ps[2]) } } @@ -361,3 +447,60 @@ func TestUniformSampleConcurrentUpdateCount(t *testing.T) { } quit <- struct{}{} } + +func TestSampleFunction(t *testing.T) { + var cases []struct { + slice []int64 + sum int64 + max int64 + min int64 + va float64 + } + for i := 0; i < 4100; i++ { + var slice []int64 + for j := 0; j < i; j++ { + slice = append(slice, rand.Int63()) + } + sum := sampleSum(slice) + max := sampleMax(slice) + min := sampleMin(slice) + va := sampleVariance(slice) + + cases = append(cases, struct { + slice []int64 + sum int64 + max int64 + min int64 + va float64 + }{ + slice, + sum, + max, + min, + va, + }) + } + + f0, f1, f2, empty := x86HasAVX2, x86HasAVX, x86HasSSE42, false + defer func() { + x86HasAVX2, x86HasAVX, x86HasSSE42 = f0, f1, f2 + }() + + for i, flag := range []*bool{&empty, &x86HasAVX2, &x86HasAVX, &x86HasSSE42} { + *flag = false + for j, v := range cases { + if sum := SampleSum(v.slice); sum != v.sum { + t.Fatalf("SampleSum %d:%d test failed, expect %v, got %v", i, j, v.sum, sum) + } + if max := SampleMax(v.slice); max != v.max { + t.Fatalf("SampleMax %d:%d test failed, expect %v, got %v", i, j, v.max, max) + } + if min := SampleMin(v.slice); min != v.min { + t.Fatalf("SampleMin %d:%d test failed, expect %v, got %v", i, j, v.min, min) + } + if va := SampleVariance(v.slice); float64NotEqual(va, v.va) { + t.Fatalf("SampleVariance %d:%d test 
failed, expect %v, got %v %v", i, j, v.va, va, math.Abs(va-v.va)) + } + } + } +} diff --git a/timer_test.go b/timer_test.go index f85c9b8..159e99e 100644 --- a/timer_test.go +++ b/timer_test.go @@ -27,7 +27,7 @@ func TestTimerExtremes(t *testing.T) { tm := NewTimer() tm.Update(math.MaxInt64) tm.Update(0) - if stdDev := tm.StdDev(); 4.611686018427388e+18 != stdDev { + if stdDev := tm.StdDev(); float64NotEqual(4.611686018427388e+18, stdDev) { t.Errorf("tm.StdDev(): 4.611686018427388e+18 != %v\n", stdDev) } } @@ -63,32 +63,32 @@ func TestTimerZero(t *testing.T) { if max := tm.Max(); 0 != max { t.Errorf("tm.Max(): 0 != %v\n", max) } - if mean := tm.Mean(); 0.0 != mean { + if mean := tm.Mean(); float64NotEqual(0.0, mean) { t.Errorf("tm.Mean(): 0.0 != %v\n", mean) } - if stdDev := tm.StdDev(); 0.0 != stdDev { + if stdDev := tm.StdDev(); float64NotEqual(0.0, stdDev) { t.Errorf("tm.StdDev(): 0.0 != %v\n", stdDev) } ps := tm.Percentiles([]float64{0.5, 0.75, 0.99}) - if 0.0 != ps[0] { + if float64NotEqual(0.0, ps[0]) { t.Errorf("median: 0.0 != %v\n", ps[0]) } - if 0.0 != ps[1] { + if float64NotEqual(0.0, ps[1]) { t.Errorf("75th percentile: 0.0 != %v\n", ps[1]) } - if 0.0 != ps[2] { + if float64NotEqual(0.0, ps[2]) { t.Errorf("99th percentile: 0.0 != %v\n", ps[2]) } - if rate1 := tm.Rate1(); 0.0 != rate1 { + if rate1 := tm.Rate1(); float64NotEqual(0.0, rate1) { t.Errorf("tm.Rate1(): 0.0 != %v\n", rate1) } - if rate5 := tm.Rate5(); 0.0 != rate5 { + if rate5 := tm.Rate5(); float64NotEqual(0.0, rate5) { t.Errorf("tm.Rate5(): 0.0 != %v\n", rate5) } - if rate15 := tm.Rate15(); 0.0 != rate15 { + if rate15 := tm.Rate15(); float64NotEqual(0.0, rate15) { t.Errorf("tm.Rate15(): 0.0 != %v\n", rate15) } - if rateMean := tm.RateMean(); 0.0 != rateMean { + if rateMean := tm.RateMean(); float64NotEqual(0.0, rateMean) { t.Errorf("tm.RateMean(): 0.0 != %v\n", rateMean) } }