Skip to content

Commit c62c69d

Browse files
committed
crypto/internal/fips140/subtle: provide riscv64 assembly implementation for xorBytes
Provide a riscv64 assembly implementation of xorBytes, which can process up to 64 bytes per loop and has better handling for unaligned inputs. This provides a considerable performance gain compared to the generic code. On a StarFive VisionFive 2: │ subtle.1 │ subtle.2 │ │ sec/op │ sec/op vs base │ XORBytes/8Bytes-4 59.54n ± 0% 58.15n ± 0% -2.33% (p=0.000 n=10) XORBytes/128Bytes-4 125.60n ± 0% 74.93n ± 0% -40.35% (p=0.000 n=10) XORBytes/2048Bytes-4 1088.5n ± 0% 602.4n ± 0% -44.66% (p=0.000 n=10) XORBytes/8192Bytes-4 4.163µ ± 0% 2.271µ ± 0% -45.45% (p=0.000 n=10) XORBytes/32768Bytes-4 35.47µ ± 0% 28.12µ ± 0% -20.74% (p=0.000 n=10) XORBytesUnaligned/8Bytes0Offset-4 59.80n ± 0% 57.48n ± 0% -3.86% (p=0.000 n=10) XORBytesUnaligned/8Bytes1Offset-4 72.97n ± 0% 57.48n ± 0% -21.23% (p=0.000 n=10) XORBytesUnaligned/8Bytes2Offset-4 72.97n ± 0% 57.50n ± 0% -21.21% (p=0.000 n=10) XORBytesUnaligned/8Bytes3Offset-4 72.99n ± 0% 57.48n ± 0% -21.26% (p=0.000 n=10) XORBytesUnaligned/8Bytes4Offset-4 72.96n ± 0% 57.44n ± 0% -21.28% (p=0.000 n=10) XORBytesUnaligned/8Bytes5Offset-4 72.93n ± 0% 57.48n ± 0% -21.18% (p=0.000 n=10) XORBytesUnaligned/8Bytes6Offset-4 72.97n ± 0% 57.47n ± 0% -21.25% (p=0.000 n=10) XORBytesUnaligned/8Bytes7Offset-4 72.96n ± 0% 57.47n ± 0% -21.24% (p=0.000 n=10) XORBytesUnaligned/128Bytes0Offset-4 125.30n ± 0% 74.18n ± 0% -40.80% (p=0.000 n=10) XORBytesUnaligned/128Bytes1Offset-4 557.4n ± 0% 131.1n ± 0% -76.48% (p=0.000 n=10) XORBytesUnaligned/128Bytes2Offset-4 557.3n ± 0% 132.5n ± 0% -76.22% (p=0.000 n=10) XORBytesUnaligned/128Bytes3Offset-4 557.6n ± 0% 133.7n ± 0% -76.02% (p=0.000 n=10) XORBytesUnaligned/128Bytes4Offset-4 557.4n ± 0% 125.0n ± 0% -77.57% (p=0.000 n=10) XORBytesUnaligned/128Bytes5Offset-4 557.7n ± 0% 125.7n ± 0% -77.46% (p=0.000 n=10) XORBytesUnaligned/128Bytes6Offset-4 557.5n ± 0% 127.0n ± 0% -77.22% (p=0.000 n=10) XORBytesUnaligned/128Bytes7Offset-4 557.7n ± 0% 128.3n ± 0% -76.99% (p=0.000 n=10) XORBytesUnaligned/2048Bytes0Offset-4 1088.5n ± 0% 601.9n ± 0% -44.71% (p=0.000 n=10) XORBytesUnaligned/2048Bytes1Offset-4 8243.0n ± 0% 655.7n ± 0% -92.05% (p=0.000 n=10) XORBytesUnaligned/2048Bytes2Offset-4 8244.0n ± 0% 657.1n ± 0% -92.03% (p=0.000 n=10) XORBytesUnaligned/2048Bytes3Offset-4 8247.5n ± 0% 658.7n ± 0% -92.01% (p=0.000 n=10) XORBytesUnaligned/2048Bytes4Offset-4 8243.0n ± 0% 649.8n ± 0% -92.12% (p=0.000 n=10) XORBytesUnaligned/2048Bytes5Offset-4 8247.0n ± 0% 650.2n ± 0% -92.12% (p=0.000 n=10) XORBytesUnaligned/2048Bytes6Offset-4 8243.0n ± 0% 651.6n ± 0% -92.09% (p=0.000 n=10) XORBytesUnaligned/2048Bytes7Offset-4 8244.0n ± 0% 652.8n ± 0% -92.08% (p=0.000 n=10) geomean 410.1n 147.2n -64.10% │ subtle.1 │ subtle.2 │ │ B/s │ B/s vs base │ XORBytes/8Bytes-4 128.1Mi ± 0% 131.2Mi ± 0% +2.40% (p=0.000 n=10) XORBytes/128Bytes-4 971.6Mi ± 0% 1629.2Mi ± 0% +67.69% (p=0.000 n=10) XORBytes/2048Bytes-4 1.752Gi ± 0% 3.166Gi ± 0% +80.68% (p=0.000 n=10) XORBytes/8192Bytes-4 1.833Gi ± 0% 3.360Gi ± 0% +83.35% (p=0.000 n=10) XORBytes/32768Bytes-4 881.0Mi ± 0% 1111.5Mi ± 0% +26.16% (p=0.000 n=10) XORBytesUnaligned/8Bytes0Offset-4 127.6Mi ± 0% 132.7Mi ± 0% +4.02% (p=0.000 n=10) XORBytesUnaligned/8Bytes1Offset-4 104.5Mi ± 0% 132.7Mi ± 0% +26.95% (p=0.000 n=10) XORBytesUnaligned/8Bytes2Offset-4 104.6Mi ± 0% 132.7Mi ± 0% +26.92% (p=0.000 n=10) XORBytesUnaligned/8Bytes3Offset-4 104.5Mi ± 0% 132.8Mi ± 0% +27.01% (p=0.000 n=10) XORBytesUnaligned/8Bytes4Offset-4 104.6Mi ± 0% 132.8Mi ± 0% +27.02% (p=0.000 n=10) XORBytesUnaligned/8Bytes5Offset-4 104.6Mi ± 0% 132.7Mi ± 0% +26.89% (p=0.000 n=10) XORBytesUnaligned/8Bytes6Offset-4 104.5Mi ± 0% 132.8Mi ± 0% +26.99% (p=0.000 n=10) XORBytesUnaligned/8Bytes7Offset-4 104.6Mi ± 0% 132.8Mi ± 0% +26.97% (p=0.000 n=10) XORBytesUnaligned/128Bytes0Offset-4 974.4Mi ± 0% 1645.7Mi ± 0% +68.90% (p=0.000 n=10) XORBytesUnaligned/128Bytes1Offset-4 219.0Mi ± 0% 931.3Mi ± 0% +325.23% (p=0.000 n=10) XORBytesUnaligned/128Bytes2Offset-4 219.0Mi ± 0% 921.2Mi ± 0% +320.57% (p=0.000 n=10) XORBytesUnaligned/128Bytes3Offset-4 218.9Mi ± 0% 912.9Mi ± 0% +316.97% (p=0.000 n=10) XORBytesUnaligned/128Bytes4Offset-4 219.0Mi ± 0% 976.4Mi ± 0% +345.85% (p=0.000 n=10) XORBytesUnaligned/128Bytes5Offset-4 218.9Mi ± 0% 971.2Mi ± 0% +343.70% (p=0.000 n=10) XORBytesUnaligned/128Bytes6Offset-4 219.0Mi ± 0% 961.1Mi ± 0% +338.86% (p=0.000 n=10) XORBytesUnaligned/128Bytes7Offset-4 218.9Mi ± 0% 951.1Mi ± 0% +334.52% (p=0.000 n=10) XORBytesUnaligned/2048Bytes0Offset-4 1.752Gi ± 0% 3.169Gi ± 0% +80.83% (p=0.000 n=10) XORBytesUnaligned/2048Bytes1Offset-4 236.9Mi ± 0% 2978.6Mi ± 0% +1157.10% (p=0.000 n=10) XORBytesUnaligned/2048Bytes2Offset-4 236.9Mi ± 0% 2972.1Mi ± 0% +1154.48% (p=0.000 n=10) XORBytesUnaligned/2048Bytes3Offset-4 236.8Mi ± 0% 2965.1Mi ± 0% +1152.05% (p=0.000 n=10) XORBytesUnaligned/2048Bytes4Offset-4 236.9Mi ± 0% 3005.9Mi ± 0% +1168.65% (p=0.000 n=10) XORBytesUnaligned/2048Bytes5Offset-4 236.8Mi ± 0% 3004.0Mi ± 0% +1168.42% (p=0.000 n=10) XORBytesUnaligned/2048Bytes6Offset-4 236.9Mi ± 0% 2997.2Mi ± 0% +1164.96% (p=0.000 n=10) XORBytesUnaligned/2048Bytes7Offset-4 236.9Mi ± 0% 2991.9Mi ± 0% +1162.93% (p=0.000 n=10) geomean 260.4Mi 806.7Mi +209.73% Change-Id: I9bec9c8f48df7284f8414ac745615c2a093e9ae9 Reviewed-on: https://go-review.googlesource.com/c/go/+/639858 Reviewed-by: Mark Ryan <[email protected]> Reviewed-by: Cherry Mui <[email protected]> Reviewed-by: Meng Zhuo <[email protected]> TryBot-Bypass: Joel Sing <[email protected]> Reviewed-by: Dmitri Shuralyov <[email protected]>
1 parent 63ae416 commit c62c69d

File tree

3 files changed

+171
-2
lines changed

3 files changed

+171
-2
lines changed

src/crypto/internal/fips140/subtle/xor_asm.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build (amd64 || arm64 || loong64 || ppc64 || ppc64le) && !purego
5+
//go:build (amd64 || arm64 || loong64 || ppc64 || ppc64le || riscv64) && !purego
66

77
package subtle
88

src/crypto/internal/fips140/subtle/xor_generic.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build (!amd64 && !arm64 && !loong64 && !ppc64 && !ppc64le) || purego
5+
//go:build (!amd64 && !arm64 && !loong64 && !ppc64 && !ppc64le && !riscv64) || purego
66

77
package subtle
88

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build !purego
6+
7+
#include "textflag.h"
8+
9+
// func xorBytes(dst, a, b *byte, n int)
10+
TEXT ·xorBytes(SB), NOSPLIT|NOFRAME, $0
11+
MOV dst+0(FP), X10
12+
MOV a+8(FP), X11
13+
MOV b+16(FP), X12
14+
MOV n+24(FP), X13
15+
16+
MOV $32, X15
17+
BLT X13, X15, loop4_check
18+
19+
// Check alignment - if alignment differs we have to do one byte at a time.
20+
AND $7, X10, X5
21+
AND $7, X11, X6
22+
AND $7, X12, X7
23+
BNE X5, X6, loop4_check
24+
BNE X5, X7, loop4_check
25+
BEQZ X5, loop64_check
26+
27+
// Check one byte at a time until we reach 8 byte alignment.
28+
MOV $8, X8
29+
SUB X5, X8
30+
SUB X8, X13
31+
align:
32+
MOVBU 0(X11), X16
33+
MOVBU 0(X12), X17
34+
XOR X16, X17
35+
MOVB X17, 0(X10)
36+
ADD $1, X10
37+
ADD $1, X11
38+
ADD $1, X12
39+
SUB $1, X8
40+
BNEZ X8, align
41+
42+
loop64_check:
43+
MOV $64, X15
44+
BLT X13, X15, tail32_check
45+
PCALIGN $16
46+
loop64:
47+
MOV 0(X11), X16
48+
MOV 0(X12), X17
49+
MOV 8(X11), X18
50+
MOV 8(X12), X19
51+
XOR X16, X17
52+
XOR X18, X19
53+
MOV X17, 0(X10)
54+
MOV X19, 8(X10)
55+
MOV 16(X11), X20
56+
MOV 16(X12), X21
57+
MOV 24(X11), X22
58+
MOV 24(X12), X23
59+
XOR X20, X21
60+
XOR X22, X23
61+
MOV X21, 16(X10)
62+
MOV X23, 24(X10)
63+
MOV 32(X11), X16
64+
MOV 32(X12), X17
65+
MOV 40(X11), X18
66+
MOV 40(X12), X19
67+
XOR X16, X17
68+
XOR X18, X19
69+
MOV X17, 32(X10)
70+
MOV X19, 40(X10)
71+
MOV 48(X11), X20
72+
MOV 48(X12), X21
73+
MOV 56(X11), X22
74+
MOV 56(X12), X23
75+
XOR X20, X21
76+
XOR X22, X23
77+
MOV X21, 48(X10)
78+
MOV X23, 56(X10)
79+
ADD $64, X10
80+
ADD $64, X11
81+
ADD $64, X12
82+
SUB $64, X13
83+
BGE X13, X15, loop64
84+
BEQZ X13, done
85+
86+
tail32_check:
87+
MOV $32, X15
88+
BLT X13, X15, tail16_check
89+
MOV 0(X11), X16
90+
MOV 0(X12), X17
91+
MOV 8(X11), X18
92+
MOV 8(X12), X19
93+
XOR X16, X17
94+
XOR X18, X19
95+
MOV X17, 0(X10)
96+
MOV X19, 8(X10)
97+
MOV 16(X11), X20
98+
MOV 16(X12), X21
99+
MOV 24(X11), X22
100+
MOV 24(X12), X23
101+
XOR X20, X21
102+
XOR X22, X23
103+
MOV X21, 16(X10)
104+
MOV X23, 24(X10)
105+
ADD $32, X10
106+
ADD $32, X11
107+
ADD $32, X12
108+
SUB $32, X13
109+
BEQZ X13, done
110+
111+
tail16_check:
112+
MOV $16, X15
113+
BLT X13, X15, loop4_check
114+
MOV 0(X11), X16
115+
MOV 0(X12), X17
116+
MOV 8(X11), X18
117+
MOV 8(X12), X19
118+
XOR X16, X17
119+
XOR X18, X19
120+
MOV X17, 0(X10)
121+
MOV X19, 8(X10)
122+
ADD $16, X10
123+
ADD $16, X11
124+
ADD $16, X12
125+
SUB $16, X13
126+
BEQZ X13, done
127+
128+
loop4_check:
129+
MOV $4, X15
130+
BLT X13, X15, loop1
131+
PCALIGN $16
132+
loop4:
133+
MOVBU 0(X11), X16
134+
MOVBU 0(X12), X17
135+
MOVBU 1(X11), X18
136+
MOVBU 1(X12), X19
137+
XOR X16, X17
138+
XOR X18, X19
139+
MOVB X17, 0(X10)
140+
MOVB X19, 1(X10)
141+
MOVBU 2(X11), X20
142+
MOVBU 2(X12), X21
143+
MOVBU 3(X11), X22
144+
MOVBU 3(X12), X23
145+
XOR X20, X21
146+
XOR X22, X23
147+
MOVB X21, 2(X10)
148+
MOVB X23, 3(X10)
149+
ADD $4, X10
150+
ADD $4, X11
151+
ADD $4, X12
152+
SUB $4, X13
153+
BGE X13, X15, loop4
154+
155+
PCALIGN $16
156+
loop1:
157+
BEQZ X13, done
158+
MOVBU 0(X11), X16
159+
MOVBU 0(X12), X17
160+
XOR X16, X17
161+
MOVB X17, 0(X10)
162+
ADD $1, X10
163+
ADD $1, X11
164+
ADD $1, X12
165+
SUB $1, X13
166+
JMP loop1
167+
168+
done:
169+
RET

0 commit comments

Comments
 (0)