Skip to content

Commit 2de561a

Browse files
committed
x/crypto/chacha20, x/crypto/poly1305: Add MIPSLE assembly version
Add assembly optimized versions for ChaCha20 and Poly1305 crypto algorithms for MIPSLE. The algorithms have been ported from other ASM implementations, both of which are dual licensed under “GPL-2.0 OR MIT” - https://github.com/torvalds/linux/blob/1b294a1f35616977caddaddf3e9d28e576a1adbc/arch/mips/crypto/chacha-core.S - https://github.com/WireGuard/wireguard-monolithic-historical/blob/edad0d6e99e5133b1e8e865d727a25fff6399cb4/src/crypto/zinc/poly1305/poly1305-mips.S The following are benchmarks done on a MT7688. It compares the base go implementation with the assembly version, once with a MIPS32r1 IS and once with MIPS32r2 IS. goos: linux goarch: mipsle pkg: golang.org/x/crypto/chacha20 │ old.txt │ asm.txt │ asm-mips32r2.txt │ │ B/s │ B/s vs base │ B/s vs base │ ChaCha20/64 4.015Mi ± 1% 10.376Mi ± 1% +158.43% (p=0.000 n=10) 13.485Mi ± 2% +235.87% (p=0.000 n=10) ChaCha20/256 4.473Mi ± 1% 12.846Mi ± 1% +187.21% (p=0.000 n=10) 18.859Mi ± 3% +321.64% (p=0.000 n=10) ChaCha20/10x25 3.119Mi ± 1% 6.104Mi ± 2% +95.72% (p=0.000 n=10) 7.181Mi ± 3% +130.28% (p=0.000 n=10) ChaCha20/4096 4.659Mi ± 4% 13.609Mi ± 4% +192.12% (p=0.000 n=10) 20.270Mi ± 5% +335.11% (p=0.000 n=10) ChaCha20/100x40 4.020Mi ± 2% 9.918Mi ± 3% +146.74% (p=0.000 n=10) 13.433Mi ± 5% +234.16% (p=0.000 n=10) ChaCha20/65536 4.301Mi ± 1% 9.727Mi ± 1% +126.16% (p=0.000 n=10) 12.393Mi ± 0% +188.14% (p=0.000 n=10) ChaCha20/1000x65 4.187Mi ± 1% 10.076Mi ± 2% +140.66% (p=0.000 n=10) 13.032Mi ± 2% +211.28% (p=0.000 n=10) geomean 4.082Mi 10.11Mi +147.56% 13.47Mi +229.90% pkg: golang.org/x/crypto/internal/poly1305 │ old.txt │ asm.txt │ asm-mips32r2.txt │ │ B/s │ B/s vs base │ B/s vs base │ 64 5.307Mi ± 0% 21.009Mi ± 0% +295.87% (p=0.000 n=10) 20.938Mi ± 0% +294.52% (p=0.000 n=10) 1K 6.566Mi ± 1% 66.676Mi ± 0% +915.47% (p=0.000 n=10) 66.042Mi ± 0% +905.81% (p=0.000 n=10) 2M 5.140Mi ± 1% 47.135Mi ± 0% +816.98% (p=0.000 n=10) 47.016Mi ± 0% +814.66% (p=0.000 n=10) 64Unaligned 5.322Mi ± 1% 21.024Mi ± 0% +295.07% (p=0.000 
n=10) 20.871Mi ± 1% +292.20% (p=0.000 n=10) 1KUnaligned 6.561Mi ± 0% 66.614Mi ± 0% +915.26% (p=0.000 n=10) 66.333Mi ± 0% +910.97% (p=0.000 n=10) 2MUnaligned 5.140Mi ± 1% 47.197Mi ± 1% +818.18% (p=0.000 n=10) 47.126Mi ± 0% +816.79% (p=0.000 n=10) Write64 6.599Mi ± 0% 57.268Mi ± 0% +767.77% (p=0.000 n=10) 57.368Mi ± 0% +769.29% (p=0.000 n=10) Write1K 6.819Mi ± 0% 79.408Mi ± 0% +1064.55% (p=0.000 n=10) 79.246Mi ± 0% +1062.17% (p=0.000 n=10) Write2M 5.140Mi ± 0% 47.169Mi ± 0% +817.63% (p=0.000 n=10) 47.116Mi ± 0% +816.60% (p=0.000 n=10) Write64Unaligned 6.428Mi ± 3% 56.992Mi ± 1% +786.65% (p=0.000 n=10) 56.424Mi ± 1% +777.82% (p=0.000 n=10) Write1KUnaligned 6.814Mi ± 2% 79.293Mi ± 0% +1063.68% (p=0.000 n=10) 79.513Mi ± 0% +1066.90% (p=0.000 n=10) Write2MUnaligned 5.016Mi ± 2% 47.183Mi ± 1% +840.59% (p=0.000 n=10) 47.183Mi ± 0% +840.59% (p=0.000 n=10) geomean 5.858Mi 49.17Mi +739.29% 49.02Mi +736.70% pkg: golang.org/x/crypto/chacha20poly1305 │ old.txt │ asm.txt │ asm-mips32r2.txt │ │ B/s │ B/s vs base │ B/s vs base │ Chacha20Poly1305/Open-64 1.230Mi ± 4% 3.042Mi ± 1% +147.29% (p=0.000 n=10) 3.548Mi ± 2% +188.37% (p=0.000 n=10) Chacha20Poly1305/Seal-64 1.144Mi ± 1% 3.462Mi ± 1% +202.50% (p=0.000 n=10) 3.810Mi ± 1% +232.92% (p=0.000 n=10) Chacha20Poly1305/Open-64-X 908.2Ki ± 1% 1718.8Ki ± 2% +89.25% (p=0.000 n=10) 1840.8Ki ± 2% +102.69% (p=0.000 n=10) Chacha20Poly1305/Seal-64-X 839.8Ki ± 1% 1894.5Ki ± 2% +125.58% (p=0.000 n=10) 2006.8Ki ± 2% +138.95% (p=0.000 n=10) Chacha20Poly1305/Open-1024 2.594Mi ± 3% 9.975Mi ± 1% +284.56% (p=0.000 n=10) 13.208Mi ± 3% +409.19% (p=0.000 n=10) Chacha20Poly1305/Seal-1024 2.551Mi ± 1% 10.600Mi ± 2% +315.51% (p=0.000 n=10) 14.353Mi ± 3% +462.62% (p=0.000 n=10) Chacha20Poly1305/Open-1024-X 2.470Mi ± 0% 8.569Mi ± 0% +246.91% (p=0.000 n=10) 10.705Mi ± 2% +333.40% (p=0.000 n=10) Chacha20Poly1305/Seal-1024-X 2.413Mi ± 1% 9.036Mi ± 1% +274.51% (p=0.000 n=10) 11.330Mi ± 1% +369.57% (p=0.000 n=10) Chacha20Poly1305/Open-1350 2.594Mi ± 3% 9.899Mi ± 
2% +281.62% (p=0.000 n=10) 13.237Mi ± 2% +410.29% (p=0.000 n=10) Chacha20Poly1305/Seal-1350 2.556Mi ± 1% 10.471Mi ± 1% +309.70% (p=0.000 n=10) 13.452Mi ± 1% +426.31% (p=0.000 n=10) Chacha20Poly1305/Open-1350-X 2.503Mi ± 2% 8.817Mi ± 1% +252.19% (p=0.000 n=10) 11.382Mi ± 1% +354.67% (p=0.000 n=10) Chacha20Poly1305/Seal-1350-X 2.460Mi ± 0% 9.093Mi ± 1% +269.57% (p=0.000 n=10) 11.873Mi ± 2% +382.56% (p=0.000 n=10) Chacha20Poly1305/Open-2048 2.694Mi ± 2% 11.024Mi ± 2% +309.20% (p=0.000 n=10) 14.963Mi ± 1% +455.40% (p=0.000 n=10) Chacha20Poly1305/Seal-2048 2.699Mi ± 0% 11.477Mi ± 2% +325.27% (p=0.000 n=10) 15.240Mi ± 1% +464.66% (p=0.000 n=10) Chacha20Poly1305/Open-2048-X 2.637Mi ± 1% 10.056Mi ± 1% +281.37% (p=0.000 n=10) 13.375Mi ± 1% +407.23% (p=0.000 n=10) Chacha20Poly1305/Seal-2048-X 2.627Mi ± 1% 10.328Mi ± 2% +293.10% (p=0.000 n=10) 13.819Mi ± 2% +425.95% (p=0.000 n=10) Chacha20Poly1305/Open-4096 2.732Mi ± 5% 11.225Mi ± 4% +310.82% (p=0.000 n=10) 16.041Mi ± 4% +487.09% (p=0.000 n=10) Chacha20Poly1305/Seal-4096 2.704Mi ± 2% 10.839Mi ± 7% +300.88% (p=0.000 n=10) 15.693Mi ± 7% +480.42% (p=0.000 n=10) Chacha20Poly1305/Open-4096-X 2.670Mi ± 1% 10.381Mi ± 4% +288.75% (p=0.000 n=10) 15.035Mi ± 4% +463.04% (p=0.000 n=10) Chacha20Poly1305/Seal-4096-X 2.680Mi ± 1% 10.867Mi ± 5% +305.52% (p=0.000 n=10) 15.421Mi ± 7% +475.44% (p=0.000 n=10) Chacha20Poly1305/Open-8192 2.708Mi ± 2% 11.053Mi ± 3% +308.10% (p=0.000 n=10) 15.926Mi ± 5% +488.03% (p=0.000 n=10) Chacha20Poly1305/Seal-8192 2.632Mi ± 4% 10.896Mi ± 6% +313.95% (p=0.000 n=10) 16.031Mi ± 5% +509.06% (p=0.000 n=10) Chacha20Poly1305/Open-8192-X 2.666Mi ± 4% 10.948Mi ± 4% +310.73% (p=0.000 n=10) 15.855Mi ± 3% +494.81% (p=0.000 n=10) Chacha20Poly1305/Seal-8192-X 2.637Mi ± 2% 10.805Mi ± 2% +309.76% (p=0.000 n=10) 14.725Mi ± 6% +458.41% (p=0.000 n=10) Chacha20Poly1305/Open-16384 2.499Mi ± 4% 10.405Mi ± 13% +316.41% (p=0.000 n=10) 13.628Mi ± 7% +445.42% (p=0.000 n=10) Chacha20Poly1305/Seal-16384 2.484Mi ± 4% 9.069Mi ± 4% +265.07% 
(p=0.000 n=10) 12.131Mi ± 3% +388.29% (p=0.000 n=10) Chacha20Poly1305/Open-16384-X 2.389Mi ± 7% 10.028Mi ± 5% +319.76% (p=0.000 n=10) 14.472Mi ± 3% +505.79% (p=0.000 n=10) Chacha20Poly1305/Seal-16384-X 2.475Mi ± 4% 9.084Mi ± 2% +267.05% (p=0.000 n=10) 12.212Mi ± 6% +393.45% (p=0.000 n=10) geomean 2.259Mi 8.271Mi +266.21% 10.90Mi +382.79% Fixes golang/go#39139
1 parent 67b1361 commit 2de561a

File tree

7 files changed

+487
-3
lines changed

7 files changed

+487
-3
lines changed

chacha20/chacha_mipsle.go

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build gc && !purego
6+
7+
package chacha20
8+
9+
// bufSize is set to blockSize for this implementation.
// NOTE(review): presumably this is the granularity in which the shared cipher
// code hands data to xorKeyStreamBlocks — confirm against the generic file.
const bufSize = blockSize
10+
11+
// xorKeyStream is implemented in chacha_mipsle.s. It XORs src with the
// ChaCha20 key stream derived from key, nonce and *counter, writes the result
// to dst, and stores the advanced block counter back through counter.
//
// The assembly works on whole 64-byte blocks and loops until the remaining
// length is exactly zero, so len(src) must be a non-zero multiple of 64. It
// never reads len(dst); dst must be at least as long as src.
//
//go:noescape
12+
func xorKeyStream(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
13+
14+
// xorKeyStreamBlocks XORs src into dst by delegating to the assembly routine
// xorKeyStream. It passes pointers to the cipher's key, nonce and counter, so
// the routine updates s.counter in place.
func (s *Cipher) xorKeyStreamBlocks(dst, src []byte) {
15+
xorKeyStream(dst, src, &s.key, &s.nonce, &s.counter)
16+
}

chacha20/chacha_mipsle.s

+185
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// Ported from https://github.com/torvalds/linux/blob/1b294a1f35616977caddaddf3e9d28e576a1adbc/arch/mips/crypto/chacha-core.S
6+
// which is licensed under:
7+
// # ====================================================================
8+
// # SPDX-License-Identifier: GPL-2.0 OR MIT
9+
// #
10+
// # Copyright (C) 2016-2018 René van Dorst <[email protected]>. All Rights Reserved.
11+
// # Copyright (C) 2015-2019 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
12+
// # ====================================================================
13+
14+
//go:build gc && !purego
15+
16+
#include "textflag.h"
17+
18+
#define X0 R1
19+
#define X1 R2
20+
#define X2 R3
21+
#define X3 R4
22+
#define X4 R5
23+
#define X5 R6
24+
#define X6 R7
25+
#define X7 R8
26+
#define X8 R9
27+
#define X9 R10
28+
#define X10 R11
29+
#define X11 R12
30+
#define X12 R13
31+
#define X13 R14
32+
#define X14 R15
33+
#define X15 R16
34+
35+
#define DST R17
36+
#define SRC R18
37+
#define SRC_LEN R19
38+
#define KEY R20
39+
#define NONCE R21
40+
#define CTR R22
41+
42+
#define LOOP_I R24
43+
#define TMP R25
44+
45+
#ifdef GOMIPS_r2
46+
#define hasROTR
47+
#endif
48+
#ifdef GOMIPS_r5
49+
#define hasROTR
50+
#endif
51+
52+
#ifdef hasROTR
53+
#define ROTL(S, R) \
54+
ROTR $(32-S), R
55+
#else
56+
#define ROTL(S, R) \
57+
SLL $(S), R, TMP \
58+
SRL $(32-S), R \
59+
OR TMP, R
60+
#endif
61+
62+
#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
63+
ADDU K, A \
64+
ADDU L, B \
65+
ADDU M, C \
66+
ADDU N, D \
67+
XOR A, V \
68+
XOR B, W \
69+
XOR C, Y \
70+
XOR D, Z \
71+
ROTL (S, V) \
72+
ROTL (S, W) \
73+
ROTL (S, Y) \
74+
ROTL (S, Z)
75+
76+
#define FOR_STATE(OP, OP_MEM) \
77+
OP ( $0x61707865, X0 ) \ // expa
78+
OP ( $0x3320646e, X1 ) \ // nd 3
79+
OP ( $0x79622d32, X2 ) \ // 2-by
80+
OP ( $0x6b206574, X3 ) \ // te k
81+
OP_MEM ( 0(KEY), X4 ) \
82+
OP_MEM ( 4(KEY), X5 ) \
83+
OP_MEM ( 8(KEY), X6 ) \
84+
OP_MEM ( 12(KEY), X7 ) \
85+
OP_MEM ( 16(KEY), X8 ) \
86+
OP_MEM ( 20(KEY), X9 ) \
87+
OP_MEM ( 24(KEY), X10 ) \
88+
OP_MEM ( 28(KEY), X11 ) \
89+
OP ( CTR, X12 ) \
90+
OP_MEM ( 0(NONCE), X13 ) \
91+
OP_MEM ( 4(NONCE), X14 ) \
92+
OP_MEM ( 8(NONCE), X15 )
93+
94+
#define movw(x, y) \
95+
MOVW x, y
96+
97+
#define ADD(V, REG) \
98+
ADDU V, REG
99+
100+
#define ADD_MEM(ADDR, REG) \
101+
MOVW ADDR, TMP \
102+
ADDU TMP, REG
103+
104+
// XOR_STREAM_WORD works with unaligned memory; this is important because the streams might not be aligned.
105+
// Especially during the use in TLS the memory is often unaligned.
106+
#define XOR_STREAM_WORD( OFF, REG) \
107+
MOVWL (4*OFF + 3)(SRC), TMP \
108+
MOVWR (4*OFF)(SRC), TMP \
109+
XOR REG, TMP \
110+
MOVWL TMP, (4*OFF + 3)(DST) \
111+
MOVWR TMP, (4*OFF)(DST)
112+
113+
// func xorKeyStream(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// For every 64-byte block: load the 16-word state (4 constants, 8 key words,
// the block counter, 3 nonce words) into X0..X15, run 10 double rounds, add
// the initial state back, XOR the result with src into dst, then advance the
// pointers and the counter. The final counter is written back through the
// counter pointer.
//
// The block loop tests the remaining length only after a block has been
// processed and only for equality with zero, so the caller must pass a
// non-zero length that is a multiple of 64. len(dst) is never read.
114+
TEXT ·xorKeyStream(SB), NOSPLIT|NOFRAME, $0
115+
// Arguments are 4-byte words: each slice occupies 12 bytes (base, len, cap).
MOVW dst+0(FP), DST
116+
MOVW src+12(FP), SRC
117+
MOVW src_len+16(FP), SRC_LEN
118+
MOVW key+24(FP), KEY
119+
MOVW nonce+28(FP), NONCE
120+
MOVW counter+32(FP), CTR
121+
122+
// load counter
// (CTR held the pointer; from here on it holds the counter value itself)
123+
MOVW (CTR), CTR
124+
125+
chacha:
126+
127+
// load initial State into X*
128+
FOR_STATE ( movw, movw )
129+
130+
// set number of rounds
// (each pass through the loop below performs two rounds and subtracts 2)
131+
MOVW $20, LOOP_I
132+
133+
loop:
134+
// Column round: quarter rounds on (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15),
// interleaved four at a time.
AXR( X0,X1,X2,X3, X4,X5,X6,X7, X12,X13,X14,X15, 16)
135+
AXR( X8,X9,X10,X11, X12,X13,X14,X15, X4,X5,X6,X7, 12)
136+
AXR( X0,X1,X2,X3, X4,X5,X6,X7, X12,X13,X14,X15, 8)
137+
AXR( X8,X9,X10,X11, X12,X13,X14,X15, X4,X5,X6,X7, 7)
138+
// Diagonal round: quarter rounds on (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14).
AXR( X0,X1,X2,X3, X5,X6,X7,X4, X15,X12,X13,X14, 16)
139+
AXR( X10,X11,X8,X9, X15,X12,X13,X14, X5,X6,X7,X4, 12)
140+
AXR( X0,X1,X2,X3, X5,X6,X7,X4, X15,X12,X13,X14, 8)
141+
AXR( X10,X11,X8,X9, X15,X12,X13,X14, X5,X6,X7,X4, 7)
142+
143+
ADDU $-2, LOOP_I
144+
BNE LOOP_I, loop
145+
146+
// add back the initial state to generate the key stream
147+
FOR_STATE ( ADD, ADD_MEM )
148+
149+
// xor the key stream with the source and write out the result
150+
XOR_STREAM_WORD (0, X0)
151+
XOR_STREAM_WORD (1, X1)
152+
XOR_STREAM_WORD (2, X2)
153+
XOR_STREAM_WORD (3, X3)
154+
XOR_STREAM_WORD (4, X4)
155+
XOR_STREAM_WORD (5, X5)
156+
XOR_STREAM_WORD (6, X6)
157+
XOR_STREAM_WORD (7, X7)
158+
XOR_STREAM_WORD (8, X8)
159+
XOR_STREAM_WORD (9, X9)
160+
XOR_STREAM_WORD (10, X10)
161+
XOR_STREAM_WORD (11, X11)
162+
XOR_STREAM_WORD (12, X12)
163+
XOR_STREAM_WORD (13, X13)
164+
XOR_STREAM_WORD (14, X14)
165+
XOR_STREAM_WORD (15, X15)
166+
167+
// decrement length
168+
ADDU $-64, SRC_LEN, SRC_LEN
169+
170+
// increment pointers
171+
MOVW $64(DST), DST
172+
MOVW $64(SRC), SRC
173+
174+
// increment counter
// (plain 32-bit add; no overflow check is performed here)
175+
ADDU $1, CTR
176+
177+
// loop if there's still data
178+
BNE SRC_LEN, chacha
179+
180+
// store Counter
// (the pointer is reloaded from the frame because CTR now holds the value)
181+
MOVW counter+32(FP), TMP
182+
MOVW CTR, (TMP)
183+
184+
RET
185+

chacha20/chacha_noasm.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build (!arm64 && !s390x && !ppc64le) || !gc || purego
5+
//go:build (!arm64 && !s390x && !ppc64le && !mipsle) || !gc || purego
66

77
package chacha20
88

chacha20poly1305/chacha20poly1305_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ func benchamarkChaCha20Poly1305Open(b *testing.B, buf []byte, nonceSize int) {
202202
}
203203

204204
func BenchmarkChacha20Poly1305(b *testing.B) {
205-
for _, length := range []int{64, 1350, 8 * 1024} {
205+
for _, length := range []int{64, 1024, 1350, 2 * 1024, 4 * 1024, 8 * 1024, 16 * 1024} {
206206
b.Run("Open-"+strconv.Itoa(length), func(b *testing.B) {
207207
benchamarkChaCha20Poly1305Open(b, make([]byte, length), NonceSize)
208208
})

internal/poly1305/mac_noasm.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego
5+
//go:build (!amd64 && !ppc64le && !s390x && !mipsle) || !gc || purego
66

77
package poly1305
88

internal/poly1305/sum_mipsle.go

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build gc && !purego
6+
7+
package poly1305
8+
9+
// mac is a wrapper for macGeneric that redirects calls that would have gone to
10+
// updateGeneric to update.
11+
//
12+
// Its Write and Sum methods are otherwise identical to the macGeneric ones, but
13+
// using function pointers would carry a major performance cost.
//
// The embedded macGeneric supplies the offset, buffer and macState members
// that Write and Sum operate on.
14+
type mac struct{ macGeneric }
15+
16+
// Write absorbs p into the MAC state. Bytes that do not yet make up a whole
// TagSize-byte block are kept in h.buffer, with h.offset counting them.
// It always returns len(p) and a nil error.
func (h *mac) Write(p []byte) (int, error) {
17+
nn := len(p)
18+
// A partial block is pending from an earlier call: top it up from p first.
if h.offset > 0 {
19+
n := copy(h.buffer[h.offset:], p)
20+
// Still fewer than TagSize bytes buffered: remember them and stop.
if h.offset+n < TagSize {
21+
h.offset += n
22+
return nn, nil
23+
}
24+
p = p[n:]
25+
h.offset = 0
26+
// The pending block is complete; process it with padbit 1.
// NOTE(review): presumably h.buffer is exactly TagSize bytes — confirm.
update(&h.macState, h.buffer[:], 1)
27+
}
28+
// Process the longest prefix of p that is a multiple of TagSize directly
// from p, without copying through the buffer.
if n := len(p) - (len(p) % TagSize); n > 0 {
29+
update(&h.macState, p[:n], 1)
30+
p = p[n:]
31+
}
32+
// Keep the remaining tail (shorter than TagSize) for a later Write or Sum.
if len(p) > 0 {
33+
h.offset += copy(h.buffer[h.offset:], p)
34+
}
35+
return nn, nil
36+
}
37+
38+
// Sum finishes the computation on a copy of h.macState and passes the result
// to finalize together with out, so the state stored in h is not advanced.
// If bytes are still buffered, h.buffer itself is padded in place (a 1 byte
// followed by zeros up to TagSize) and that block is processed with padbit 0.
func (h *mac) Sum(out *[16]byte) {
39+
// Value copy: the update below modifies state, not h.macState.
state := h.macState
40+
if n := h.offset; n > 0 {
41+
// Mark the end of the message inside the block itself.
h.buffer[n] = 1
42+
n++
43+
// Zero the rest of the block so stale bytes from earlier writes are not used.
for ; n < TagSize; n++ {
44+
h.buffer[n] = 0
45+
}
46+
47+
// padbit is 0 here, unlike the full-block calls in Write, which pass 1.
update(&state, h.buffer[:], 0)
48+
}
49+
finalize(out, &state.h, &state.s)
50+
}
51+
52+
// update has no Go body; its implementation is provided outside this file
// (presumably in an accompanying mipsle assembly file — confirm).
// Callers in this file pass msg whose length is a multiple of TagSize, with
// padbit 1 for full message blocks and padbit 0 for the final block that Sum
// has already padded.
//
//go:noescape
53+
func update(state *macState, msg []byte, padbit uint32)

0 commit comments

Comments
 (0)