Skip to content

Commit dbd6c38

Browse files
authored
s2: Don't use stack for index tables (#1014)
* s2: Don't use stack for index tables. Provide a pooled array pointer for the tables instead of using the stack. Go still seems to be unstable with large stacks, so use an alternative method.
1 parent f73ab1e commit dbd6c38

File tree

8 files changed

+11284
-11011
lines changed

8 files changed

+11284
-11011
lines changed

s2/_generate/gen.go

Lines changed: 38 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ func main() {
8181
o.maxSkip = 100
8282
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm", 17, 14, 7, 7, limit14B)
8383
o.maxSkip = 0
84-
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm64K", 16, 14, 7, 7, 64<<10-1)
84+
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm64K", 16, 13, 7, 7, 64<<10-1)
8585
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm12B", 14, 12, 6, 6, limit12B)
8686
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm10B", 12, 10, 5, 6, limit10B)
8787
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm8B", 10, 8, 4, 6, limit8B)
@@ -146,6 +146,15 @@ func assert(fn func(ok LabelRef)) {
146146
}
147147
}
148148

149+
type regTable struct {
150+
r reg.Register
151+
disp int
152+
}
153+
154+
func (r regTable) Idx(idx reg.GPVirtual, scale uint8) Mem {
155+
return Mem{Base: r.r, Index: idx, Scale: scale, Disp: r.disp}
156+
}
157+
149158
type options struct {
150159
snappy bool
151160
bmi1 bool
@@ -163,7 +172,15 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
163172
if o.skipOutput {
164173
dstTxt = ""
165174
}
166-
TEXT(name, 0, "func("+dstTxt+"src []byte) int")
175+
176+
var tableSize = 4 * (1 << tableBits)
177+
// Memzero needs at least 128 bytes.
178+
if tableSize < 128 {
179+
panic("tableSize must be at least 128 bytes")
180+
}
181+
182+
arrPtr := fmt.Sprintf(",tmp *[%d]byte", tableSize)
183+
TEXT(name, 0, "func("+dstTxt+"src []byte"+arrPtr+") int")
167184
Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
168185
fmt.Sprintf("Maximum input %d bytes.", maxLen),
169186
"It assumes that the varint-encoded length of the decompressed bytes has already been written.", "")
@@ -173,7 +190,6 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
173190
o.maxOffset = maxLen - 1
174191
var literalMaxOverhead = maxLitOverheadFor(maxLen)
175192

176-
var tableSize = 4 * (1 << tableBits)
177193
// Memzero needs at least 128 bytes.
178194
if tableSize < 128 {
179195
panic("tableSize must be at least 128 bytes")
@@ -209,8 +225,8 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
209225
// nextSTempL keeps nextS while other functions are being called.
210226
nextSTempL := AllocLocal(4)
211227

212-
// Alloc table last
213-
table := AllocLocal(tableSize)
228+
// Load pointer to temp table
229+
table := regTable{r: Load(Param("tmp"), GP64())}
214230

215231
dst := GP64()
216232
if !o.skipOutput {
@@ -236,7 +252,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
236252
iReg := GP64()
237253
MOVQ(U32(tableSize/8/16), iReg)
238254
tablePtr := GP64()
239-
LEAQ(table, tablePtr)
255+
MOVQ(table.r, tablePtr)
240256
zeroXmm := XMM()
241257
PXOR(zeroXmm, zeroXmm)
242258

@@ -855,7 +871,17 @@ func maxLitOverheadFor(n int) int {
855871
}
856872

857873
func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, skipLog, lHashBytes, maxLen int) {
858-
TEXT(name, 0, "func(dst, src []byte) int")
874+
var lTableSize = 4 * (1 << lTableBits)
875+
var sTableSize = 4 * (1 << sTableBits)
876+
tableSize := lTableSize + sTableSize
877+
878+
// Memzero needs at least 128 bytes.
879+
if tableSize < 128 {
880+
panic("tableSize must be at least 128 bytes")
881+
}
882+
arrPtr := fmt.Sprintf(", tmp *[%d]byte", tableSize)
883+
884+
TEXT(name, 0, "func(dst, src []byte"+arrPtr+") int")
859885
Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
860886
fmt.Sprintf("Maximum input %d bytes.", maxLen),
861887
"It assumes that the varint-encoded length of the decompressed bytes has already been written.", "")
@@ -870,9 +896,6 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
870896
o.maxLen = maxLen
871897
o.maxOffset = maxLen - 1
872898

873-
var lTableSize = 4 * (1 << lTableBits)
874-
var sTableSize = 4 * (1 << sTableBits)
875-
876899
// Memzero needs at least 128 bytes.
877900
if (lTableSize + sTableSize) < 128 {
878901
panic("tableSize must be at least 128 bytes")
@@ -905,9 +928,9 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
905928
// nextSTempL keeps nextS while other functions are being called.
906929
nextSTempL := AllocLocal(4)
907930

908-
// Alloc table last, lTab must be before sTab.
909-
lTab := AllocLocal(lTableSize)
910-
sTab := AllocLocal(sTableSize)
931+
table := Load(Param("tmp"), GP64())
932+
lTab := regTable{r: table}
933+
sTab := regTable{r: table, disp: lTableSize}
911934

912935
dst := GP64()
913936
{
@@ -930,7 +953,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
930953
iReg := GP64()
931954
MOVQ(U32((sTableSize+lTableSize)/8/16), iReg)
932955
tablePtr := GP64()
933-
LEAQ(lTab, tablePtr)
956+
MOVQ(table, tablePtr)
934957
zeroXmm := XMM()
935958
PXOR(zeroXmm, zeroXmm)
936959

@@ -2916,7 +2939,7 @@ func (o options) cvtLZ4BlockAsm(lz4s bool) {
29162939
TEXT("cvt"+srcAlgo+"Block"+snap, NOSPLIT, "func(dst, src []byte) (uncompressed int, dstUsed int)")
29172940
Doc("cvt"+srcAlgo+"Block converts an "+srcAlgo+" block to "+dstAlgo, "")
29182941
Pragma("noescape")
2919-
o.outputMargin = 10
2942+
o.outputMargin = 8
29202943
o.maxOffset = math.MaxUint16
29212944

29222945
const (

s2/encode.go

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ import (
99
"encoding/binary"
1010
"math"
1111
"math/bits"
12+
"sync"
13+
14+
"github.com/klauspost/compress/internal/race"
1215
)
1316

1417
// Encode returns the encoded form of src. The returned slice may be a sub-
@@ -52,6 +55,8 @@ func Encode(dst, src []byte) []byte {
5255
return dst[:d]
5356
}
5457

58+
var estblockPool [2]sync.Pool
59+
5560
// EstimateBlockSize will perform a very fast compression
5661
// without outputting the result and return the compressed output size.
5762
// The function returns -1 if no improvement could be achieved.
@@ -61,9 +66,25 @@ func EstimateBlockSize(src []byte) (d int) {
6166
return -1
6267
}
6368
if len(src) <= 1024 {
64-
d = calcBlockSizeSmall(src)
69+
const sz, pool = 2048, 0
70+
tmp, ok := estblockPool[pool].Get().(*[sz]byte)
71+
if !ok {
72+
tmp = &[sz]byte{}
73+
}
74+
race.WriteSlice(tmp[:])
75+
defer estblockPool[pool].Put(tmp)
76+
77+
d = calcBlockSizeSmall(src, tmp)
6578
} else {
66-
d = calcBlockSize(src)
79+
const sz, pool = 32768, 1
80+
tmp, ok := estblockPool[pool].Get().(*[sz]byte)
81+
if !ok {
82+
tmp = &[sz]byte{}
83+
}
84+
race.WriteSlice(tmp[:])
85+
defer estblockPool[pool].Put(tmp)
86+
87+
d = calcBlockSize(src, tmp)
6788
}
6889

6990
if d == 0 {

0 commit comments

Comments
 (0)