Skip to content

Commit 918d4d4

Browse files
mengzhuocherrymui
authored andcommitted
runtime: improve MIPS64x memclr
Using MIPS MSA VLD/VST to improve mips64x large memclr. name old time/op new time/op delta Memclr/5 23.2ns ± 0% 21.5ns ± 0% -7.33% (p=0.000 n=9+8) Memclr/16 20.1ns ± 0% 17.1ns ± 0% -14.93% (p=0.000 n=10+10) Memclr/64 27.2ns ± 0% 19.1ns ± 0% -29.70% (p=0.000 n=9+9) Memclr/256 76.8ns ± 0% 24.1ns ± 0% -68.66% (p=0.000 n=10+10) Memclr/4096 1.12µs ± 1% 0.18µs ± 0% -84.32% (p=0.000 n=10+8) Memclr/65536 18.0µs ± 0% 2.8µs ± 0% -84.29% (p=0.000 n=10+10) Memclr/1M 288µs ± 0% 45µs ± 0% -84.20% (p=0.000 n=10+10) Memclr/4M 1.15ms ± 0% 0.18ms ± 0% -84.21% (p=0.000 n=9+10) Memclr/8M 2.34ms ± 0% 1.39ms ± 0% -40.55% (p=0.000 n=10+8) Memclr/16M 4.72ms ± 0% 4.74ms ± 0% +0.52% (p=0.000 n=9+10) Memclr/64M 18.9ms ± 0% 18.9ms ± 0% ~ (p=0.436 n=10+10) GoMemclr/5 13.7ns ± 0% 16.9ns ± 0% +23.36% (p=0.000 n=10+10) GoMemclr/16 14.3ns ± 0% 9.0ns ± 0% -37.27% (p=0.000 n=10+9) GoMemclr/64 26.9ns ± 0% 13.7ns ± 0% -49.07% (p=0.000 n=10+10) GoMemclr/256 77.8ns ± 0% 13.0ns ± 0% -83.24% (p=0.000 n=9+10) name old speed new speed delta Memclr/5 215MB/s ± 0% 232MB/s ± 0% +7.74% (p=0.000 n=9+9) Memclr/16 795MB/s ± 0% 935MB/s ± 0% +17.60% (p=0.000 n=10+10) Memclr/64 2.35GB/s ± 0% 3.35GB/s ± 0% +42.33% (p=0.000 n=10+9) Memclr/256 3.34GB/s ± 0% 10.65GB/s ± 0% +219.16% (p=0.000 n=10+10) Memclr/4096 3.65GB/s ± 1% 23.30GB/s ± 0% +538.36% (p=0.000 n=10+10) Memclr/65536 3.65GB/s ± 0% 23.21GB/s ± 0% +536.59% (p=0.000 n=10+10) Memclr/1M 3.64GB/s ± 0% 23.07GB/s ± 0% +532.96% (p=0.000 n=10+10) Memclr/4M 3.64GB/s ± 0% 23.08GB/s ± 0% +533.36% (p=0.000 n=9+10) Memclr/8M 3.58GB/s ± 0% 6.02GB/s ± 0% +68.20% (p=0.000 n=10+8) Memclr/16M 3.56GB/s ± 0% 3.54GB/s ± 0% -0.51% (p=0.000 n=9+10) Memclr/64M 3.55GB/s ± 0% 3.55GB/s ± 0% ~ (p=0.436 n=10+10) GoMemclr/5 364MB/s ± 0% 296MB/s ± 0% -18.76% (p=0.000 n=9+10) GoMemclr/16 1.12GB/s ± 0% 1.78GB/s ± 0% +58.86% (p=0.000 n=10+10) GoMemclr/64 2.38GB/s ± 0% 4.66GB/s ± 0% +96.27% (p=0.000 n=10+9) GoMemclr/256 3.29GB/s ± 0% 19.62GB/s ± 0% +496.45% (p=0.000 n=10+9) Change-Id: I457858368f2875fd66818a41d2f0c190a850e8f1 Reviewed-on: https://go-review.googlesource.com/c/go/+/218177 Run-TryBot: Cherry Zhang <[email protected]> Reviewed-by: Cherry Zhang <[email protected]>
1 parent d1ecfcc commit 918d4d4

File tree

2 files changed

+57
-0
lines changed

2 files changed

+57
-0
lines changed

src/runtime/cpuflags.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ const (
1717
offsetX86HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
1818

1919
offsetARMHasIDIVA = unsafe.Offsetof(cpu.ARM.HasIDIVA)
20+
21+
offsetMIPS64XHasMSA = unsafe.Offsetof(cpu.MIPS64X.HasMSA)
2022
)
2123

2224
var (

src/runtime/memclr_mips64x.s

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
// +build mips64 mips64le
66

7+
#include "go_asm.h"
78
#include "textflag.h"
89

910
// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
@@ -12,6 +13,60 @@ TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16
1213
MOVV n+8(FP), R2
1314
ADDV R1, R2, R4
1415

16+
// if less than 16 bytes or no MSA, do words check
17+
SGTU $16, R2, R3
18+
BNE R3, no_msa
19+
MOVBU internal∕cpu·MIPS64X+const_offsetMIPS64XHasMSA(SB), R3
20+
BEQ R3, R0, no_msa
21+
22+
VMOVB $0, W0
23+
24+
SGTU $128, R2, R3
25+
BEQ R3, msa_large
26+
27+
AND $15, R2, R5
28+
XOR R2, R5, R6
29+
ADDVU R1, R6
30+
31+
msa_small:
32+
VMOVB W0, (R1)
33+
ADDVU $16, R1
34+
SGTU R6, R1, R3
35+
BNE R3, R0, msa_small
36+
BEQ R5, R0, done
37+
VMOVB W0, -16(R4)
38+
JMP done
39+
40+
msa_large:
41+
AND $127, R2, R5
42+
XOR R2, R5, R6
43+
ADDVU R1, R6
44+
45+
msa_large_loop:
46+
VMOVB W0, (R1)
47+
VMOVB W0, 16(R1)
48+
VMOVB W0, 32(R1)
49+
VMOVB W0, 48(R1)
50+
VMOVB W0, 64(R1)
51+
VMOVB W0, 80(R1)
52+
VMOVB W0, 96(R1)
53+
VMOVB W0, 112(R1)
54+
55+
ADDVU $128, R1
56+
SGTU R6, R1, R3
57+
BNE R3, R0, msa_large_loop
58+
BEQ R5, R0, done
59+
VMOVB W0, -128(R4)
60+
VMOVB W0, -112(R4)
61+
VMOVB W0, -96(R4)
62+
VMOVB W0, -80(R4)
63+
VMOVB W0, -64(R4)
64+
VMOVB W0, -48(R4)
65+
VMOVB W0, -32(R4)
66+
VMOVB W0, -16(R4)
67+
JMP done
68+
69+
no_msa:
1570
// if less than 8 bytes, do one byte at a time
1671
SGTU $8, R2, R3
1772
BNE R3, out

0 commit comments

Comments
 (0)