Skip to content

Commit d5388e2

Browse files
AWSjswinney authored and cherrymui committed
runtime: improve memmove performance on arm64
Replace the memmove implementation for moves of 17 bytes or larger
with an implementation from ARM optimized software. The moves of 16
bytes or fewer are unchanged, but the registers used are updated to
match the rest of the implementation.

This implementation makes use of new optimizations:
 - software pipelined loop for large (>128 byte) moves
 - medium size moves (17..128 bytes) have a new implementation
 - address realignment when src or dst is unaligned
 - preference for aligned src (loads) or dst (stores) depending on CPU

To support preference for aligned loads or aligned stores, a new CPU
flag is added. This flag indicates that the detected micro
architecture performs better with aligned loads. Some tested CPUs did
not exhibit a significant difference and are left with the default
behavior of realigning based on the destination address (stores).

Neoverse N1 (Tested on Graviton 2)
name                        old time/op    new time/op    delta
Memmove/0-4                 1.88ns ± 1%    1.87ns ± 1%    -0.58%  (p=0.020 n=10+10)
Memmove/1-4                 4.40ns ± 0%    4.40ns ± 0%       ~    (all equal)
Memmove/8-4                 3.88ns ± 3%    3.80ns ± 0%    -1.97%  (p=0.001 n=10+9)
Memmove/16-4                3.90ns ± 3%    3.80ns ± 0%    -2.49%  (p=0.000 n=10+9)
Memmove/32-4                4.80ns ± 0%    4.40ns ± 0%    -8.33%  (p=0.000 n=9+8)
Memmove/64-4                5.86ns ± 0%    5.00ns ± 0%   -14.76%  (p=0.000 n=8+8)
Memmove/128-4               8.46ns ± 0%    8.06ns ± 0%    -4.62%  (p=0.000 n=10+10)
Memmove/256-4               12.4ns ± 0%    12.2ns ± 0%    -1.61%  (p=0.000 n=10+10)
Memmove/512-4               19.5ns ± 0%    19.1ns ± 0%    -2.05%  (p=0.000 n=10+10)
Memmove/1024-4              33.7ns ± 0%    33.5ns ± 0%    -0.59%  (p=0.000 n=10+10)
Memmove/2048-4              62.1ns ± 0%    59.0ns ± 0%    -4.99%  (p=0.000 n=10+10)
Memmove/4096-4               117ns ± 1%     110ns ± 0%    -5.66%  (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4    6.41ns ± 0%    5.62ns ± 0%   -12.32%  (p=0.000 n=10+7)
MemmoveUnalignedDst/128-4   9.40ns ± 0%    8.34ns ± 0%   -11.24%  (p=0.000 n=10+10)
MemmoveUnalignedDst/256-4   12.8ns ± 0%    12.8ns ± 0%       ~    (all equal)
MemmoveUnalignedDst/512-4   20.4ns ± 0%    19.7ns ± 0%    -3.43%  (p=0.000 n=9+10)
MemmoveUnalignedDst/1024-4  34.1ns ± 0%    35.1ns ± 0%    +2.93%  (p=0.000 n=9+9)
MemmoveUnalignedDst/2048-4  61.5ns ± 0%    60.4ns ± 0%    -1.77%  (p=0.000 n=10+10)
MemmoveUnalignedDst/4096-4   122ns ± 0%     113ns ± 0%    -7.38%  (p=0.002 n=8+10)
MemmoveUnalignedSrc/64-4    7.25ns ± 1%    6.26ns ± 0%   -13.64%  (p=0.000 n=9+9)
MemmoveUnalignedSrc/128-4   10.5ns ± 0%     9.7ns ± 0%    -7.52%  (p=0.000 n=10+10)
MemmoveUnalignedSrc/256-4   17.1ns ± 0%    17.3ns ± 0%    +1.17%  (p=0.000 n=10+10)
MemmoveUnalignedSrc/512-4   27.0ns ± 0%    27.0ns ± 0%       ~    (all equal)
MemmoveUnalignedSrc/1024-4  46.7ns ± 0%    35.7ns ± 0%   -23.55%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/2048-4  85.2ns ± 0%    61.2ns ± 0%   -28.17%  (p=0.000 n=10+8)
MemmoveUnalignedSrc/4096-4   162ns ± 0%     113ns ± 0%   -30.25%  (p=0.000 n=10+10)

name                        old speed      new speed      delta
Memmove/4096-4              35.2GB/s ± 0%  37.1GB/s ± 0%   +5.56%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/1024-4  21.9GB/s ± 0%  28.7GB/s ± 0%  +30.90%  (p=0.000 n=10+10)
MemmoveUnalignedSrc/2048-4  24.0GB/s ± 0%  33.5GB/s ± 0%  +39.18%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/4096-4  25.3GB/s ± 0%  36.2GB/s ± 0%  +43.50%  (p=0.000 n=10+7)

Cortex-A72 (Graviton 1)
name                        old time/op    new time/op    delta
Memmove/0-4                 3.06ns ± 3%    3.08ns ± 1%       ~    (p=0.958 n=10+9)
Memmove/1-4                 8.72ns ± 0%    7.85ns ± 0%    -9.98%  (p=0.002 n=8+10)
Memmove/8-4                 8.29ns ± 0%    8.29ns ± 0%       ~    (all equal)
Memmove/16-4                8.29ns ± 0%    8.29ns ± 0%       ~    (all equal)
Memmove/32-4                8.19ns ± 2%    8.29ns ± 0%       ~    (p=0.114 n=10+10)
Memmove/64-4                18.3ns ± 4%    10.0ns ± 0%   -45.36%  (p=0.000 n=10+10)
Memmove/128-4               14.8ns ± 0%    17.4ns ± 0%   +17.77%  (p=0.000 n=10+10)
Memmove/256-4               21.8ns ± 0%    23.1ns ± 0%    +5.96%  (p=0.000 n=10+10)
Memmove/512-4               35.8ns ± 0%    37.2ns ± 0%    +3.91%  (p=0.000 n=10+10)
Memmove/1024-4              63.7ns ± 0%    67.2ns ± 0%    +5.49%  (p=0.000 n=10+10)
Memmove/2048-4               126ns ± 0%     123ns ± 0%    -2.38%  (p=0.000 n=10+10)
Memmove/4096-4               238ns ± 1%     243ns ± 1%    +1.93%  (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4    19.3ns ± 1%    12.0ns ± 1%   -37.49%  (p=0.000 n=10+10)
MemmoveUnalignedDst/128-4   17.2ns ± 0%    17.4ns ± 0%    +1.16%  (p=0.000 n=10+10)
MemmoveUnalignedDst/256-4   28.2ns ± 8%    29.2ns ± 0%       ~    (p=0.352 n=10+10)
MemmoveUnalignedDst/512-4   49.8ns ± 3%    48.9ns ± 0%       ~    (p=1.000 n=10+10)
MemmoveUnalignedDst/1024-4  89.5ns ± 0%    80.5ns ± 1%   -10.02%  (p=0.000 n=10+10)
MemmoveUnalignedDst/2048-4   180ns ± 0%     127ns ± 0%   -29.44%  (p=0.000 n=9+10)
MemmoveUnalignedDst/4096-4   347ns ± 0%     244ns ± 0%   -29.59%  (p=0.000 n=10+9)
MemmoveUnalignedSrc/128-4   16.1ns ± 0%    21.8ns ± 0%   +35.40%  (p=0.000 n=10+10)
MemmoveUnalignedSrc/256-4   24.9ns ± 8%    26.6ns ± 0%    +6.70%  (p=0.015 n=10+10)
MemmoveUnalignedSrc/512-4   39.4ns ± 6%    40.6ns ± 0%       ~    (p=0.352 n=10+10)
MemmoveUnalignedSrc/1024-4  72.5ns ± 0%    83.0ns ± 1%   +14.44%  (p=0.000 n=9+10)
MemmoveUnalignedSrc/2048-4   129ns ± 1%     128ns ± 1%       ~    (p=0.179 n=10+10)
MemmoveUnalignedSrc/4096-4   241ns ± 0%     253ns ± 1%    +4.99%  (p=0.000 n=9+9)

Cortex-A53 (Raspberry Pi 3)
name                        old time/op    new time/op    delta
Memmove/0-4                 11.0ns ± 0%    11.0ns ± 1%       ~    (p=0.294 n=8+10)
Memmove/1-4                 29.6ns ± 0%    28.0ns ± 1%    -5.41%  (p=0.000 n=9+10)
Memmove/8-4                 23.5ns ± 0%    22.1ns ± 0%    -6.11%  (p=0.000 n=8+8)
Memmove/16-4                23.7ns ± 1%    22.1ns ± 0%    -6.59%  (p=0.000 n=10+8)
Memmove/32-4                27.9ns ± 0%    27.1ns ± 0%    -3.13%  (p=0.000 n=8+8)
Memmove/64-4                33.8ns ± 0%    31.5ns ± 1%    -6.99%  (p=0.000 n=8+10)
Memmove/128-4               45.6ns ± 0%    44.2ns ± 1%    -3.23%  (p=0.000 n=9+10)
Memmove/256-4               69.3ns ± 0%    69.3ns ± 0%       ~    (p=0.072 n=8+8)
Memmove/512-4                127ns ± 0%     110ns ± 0%   -13.39%  (p=0.000 n=8+8)
Memmove/1024-4               222ns ± 0%     205ns ± 1%    -7.66%  (p=0.000 n=7+10)
Memmove/2048-4               411ns ± 0%     366ns ± 0%   -10.98%  (p=0.000 n=8+9)
Memmove/4096-4               795ns ± 1%     695ns ± 1%   -12.63%  (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4    44.0ns ± 0%    40.5ns ± 0%    -7.93%  (p=0.000 n=8+8)
MemmoveUnalignedDst/128-4   59.6ns ± 0%    54.9ns ± 0%    -7.85%  (p=0.000 n=9+9)
MemmoveUnalignedDst/256-4   98.2ns ±11%    90.0ns ± 1%       ~    (p=0.130 n=10+10)
MemmoveUnalignedDst/512-4    161ns ± 2%     145ns ± 1%    -9.96%  (p=0.000 n=10+10)
MemmoveUnalignedDst/1024-4   281ns ± 0%     265ns ± 0%    -5.65%  (p=0.000 n=9+8)
MemmoveUnalignedDst/2048-4   528ns ± 0%     482ns ± 0%    -8.73%  (p=0.000 n=8+9)
MemmoveUnalignedDst/4096-4  1.02µs ± 1%    0.92µs ± 0%   -10.00%  (p=0.000 n=10+8)
MemmoveUnalignedSrc/64-4    42.4ns ± 1%    40.5ns ± 0%    -4.39%  (p=0.000 n=10+8)
MemmoveUnalignedSrc/128-4   57.4ns ± 0%    57.0ns ± 1%    -0.75%  (p=0.048 n=9+10)
MemmoveUnalignedSrc/256-4   88.1ns ± 1%    89.6ns ± 0%    +1.70%  (p=0.000 n=9+8)
MemmoveUnalignedSrc/512-4    160ns ± 2%     144ns ± 0%    -9.89%  (p=0.000 n=10+8)
MemmoveUnalignedSrc/1024-4   286ns ± 0%     266ns ± 1%    -6.69%  (p=0.000 n=8+10)
MemmoveUnalignedSrc/2048-4   525ns ± 0%     483ns ± 1%    -7.96%  (p=0.000 n=9+10)
MemmoveUnalignedSrc/4096-4  1.01µs ± 0%    0.92µs ± 1%    -9.40%  (p=0.000 n=8+10)

Change-Id: Ia1144e9d4dfafdece6e167c5e576bf80f254c8ab
Reviewed-on: https://go-review.googlesource.com/c/go/+/243357
TryBot-Result: Go Bot <[email protected]>
Reviewed-by: Martin Möhrmann <[email protected]>
Reviewed-by: eric fang <[email protected]>
Reviewed-by: Cherry Zhang <[email protected]>
1 parent 7be8358 commit d5388e2

File tree

6 files changed

+314
-150
lines changed

6 files changed

+314
-150
lines changed

src/internal/cpu/cpu.go

+28-26
Original file line number | Diff line number | Diff line change
@@ -56,32 +56,34 @@ var ARM struct {
5656
// The booleans in ARM64 contain the correspondingly named cpu feature bit.
5757
// The struct is padded to avoid false sharing.
5858
var ARM64 struct {
59-
_ CacheLinePad
60-
HasFP bool
61-
HasASIMD bool
62-
HasEVTSTRM bool
63-
HasAES bool
64-
HasPMULL bool
65-
HasSHA1 bool
66-
HasSHA2 bool
67-
HasCRC32 bool
68-
HasATOMICS bool
69-
HasFPHP bool
70-
HasASIMDHP bool
71-
HasCPUID bool
72-
HasASIMDRDM bool
73-
HasJSCVT bool
74-
HasFCMA bool
75-
HasLRCPC bool
76-
HasDCPOP bool
77-
HasSHA3 bool
78-
HasSM3 bool
79-
HasSM4 bool
80-
HasASIMDDP bool
81-
HasSHA512 bool
82-
HasSVE bool
83-
HasASIMDFHM bool
84-
_ CacheLinePad
59+
_ CacheLinePad
60+
HasFP bool
61+
HasASIMD bool
62+
HasEVTSTRM bool
63+
HasAES bool
64+
HasPMULL bool
65+
HasSHA1 bool
66+
HasSHA2 bool
67+
HasCRC32 bool
68+
HasATOMICS bool
69+
HasFPHP bool
70+
HasASIMDHP bool
71+
HasCPUID bool
72+
HasASIMDRDM bool
73+
HasJSCVT bool
74+
HasFCMA bool
75+
HasLRCPC bool
76+
HasDCPOP bool
77+
HasSHA3 bool
78+
HasSM3 bool
79+
HasSM4 bool
80+
HasASIMDDP bool
81+
HasSHA512 bool
82+
HasSVE bool
83+
HasASIMDFHM bool
84+
IsNeoverseN1 bool
85+
IsZeus bool
86+
_ CacheLinePad
8587
}
8688

8789
var MIPS64X struct {

src/internal/cpu/cpu_arm64.go

+25
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ const (
1818
hwcap_SHA2 = 1 << 6
1919
hwcap_CRC32 = 1 << 7
2020
hwcap_ATOMICS = 1 << 8
21+
hwcap_CPUID = 1 << 11
2122
)
2223

2324
func doinit() {
@@ -28,6 +29,8 @@ func doinit() {
2829
{Name: "sha2", Feature: &ARM64.HasSHA2},
2930
{Name: "crc32", Feature: &ARM64.HasCRC32},
3031
{Name: "atomics", Feature: &ARM64.HasATOMICS},
32+
{Name: "isNeoverseN1", Feature: &ARM64.IsNeoverseN1},
33+
{Name: "isZeus", Feature: &ARM64.IsZeus},
3134
}
3235

3336
switch GOOS {
@@ -40,12 +43,32 @@ func doinit() {
4043
ARM64.HasSHA1 = isSet(HWCap, hwcap_SHA1)
4144
ARM64.HasSHA2 = isSet(HWCap, hwcap_SHA2)
4245
ARM64.HasCRC32 = isSet(HWCap, hwcap_CRC32)
46+
ARM64.HasCPUID = isSet(HWCap, hwcap_CPUID)
4347

4448
// The Samsung S9+ kernel reports support for atomics, but not all cores
4549
// actually support them, resulting in SIGILL. See issue #28431.
4650
// TODO(elias.naur): Only disable the optimization on bad chipsets on android.
4751
ARM64.HasATOMICS = isSet(HWCap, hwcap_ATOMICS) && GOOS != "android"
4852

53+
// Check to see if executing on a NeoverseN1 and in order to do that,
54+
// check the AUXV for the CPUID bit. The getMIDR function executes an
55+
// instruction which would normally be an illegal instruction, but it's
56+
// trapped by the kernel, the value sanitized and then returned. Without
57+
// the CPUID bit the kernel will not trap the instruction and the process
58+
// will be terminated with SIGILL.
59+
if ARM64.HasCPUID {
60+
midr := getMIDR()
61+
part_num := uint16((midr >> 4) & 0xfff)
62+
implementor := byte((midr >> 24) & 0xff)
63+
64+
if implementor == 'A' && part_num == 0xd0c {
65+
ARM64.IsNeoverseN1 = true
66+
}
67+
if implementor == 'A' && part_num == 0xd40 {
68+
ARM64.IsZeus = true
69+
}
70+
}
71+
4972
case "freebsd":
5073
// Retrieve info from system register ID_AA64ISAR0_EL1.
5174
isar0 := getisar0()
@@ -93,3 +116,5 @@ func isSet(hwc uint, value uint) bool {
93116
}
94117

95118
func getisar0() uint64
119+
120+
func getMIDR() uint64

src/internal/cpu/cpu_arm64.s

+6
Original file line number | Diff line number | Diff line change
@@ -10,3 +10,9 @@ TEXT ·getisar0(SB),NOSPLIT,$0
1010
MRS ID_AA64ISAR0_EL1, R0
1111
MOVD R0, ret+0(FP)
1212
RET
13+
14+
// func getMIDR() uint64
15+
TEXT ·getMIDR(SB), NOSPLIT, $0-8
16+
MRS MIDR_EL1, R0
17+
MOVD R0, ret+0(FP)
18+
RET

src/runtime/cpuflags_arm64.go

+17
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,17 @@
1+
// Copyright 2020 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package runtime
6+
7+
import (
8+
"internal/cpu"
9+
)
10+
11+
var arm64UseAlignedLoads bool
12+
13+
func init() {
14+
if cpu.ARM64.IsNeoverseN1 || cpu.ARM64.IsZeus {
15+
arm64UseAlignedLoads = true
16+
}
17+
}

0 commit comments

Comments
 (0)