Skip to content

Commit 9c46265

Browse files
author
Lars T Hansen
committed
Bug 1656226 - Implement the experimental opcodes. r=jseward
Implement some of the experimental SIMD opcodes that are supported by all of V8, LLVM, and Binaryen, for maximum compatibility with test content we might be exposed to. Most/all of these will probably make it into the spec, as they lead to substantial speedups in some programs, and they are deterministic. For spec and CPU mapping details, see: WebAssembly/simd#122 (pmax/pmin), WebAssembly/simd#232 (rounding), WebAssembly/simd#127 (dot product), WebAssembly/simd#237 (load zero). The wasm bytecode values used here come from the binaryen changes that are linked from those tickets; that is the best documentation right now. Current binaryen opcode mappings are here: https://github.com/WebAssembly/binaryen/blob/master/src/wasm-binary.h Also: Drive-by fix for signatures of vroundss and vroundsd; these are unary operations and should follow the conventions for these with src/dest arguments, not src0/src1/dest. Also: Drive-by fix to add variants of vmovss and vmovsd on x64 that take Operand source and FloatRegister destination. Differential Revision: https://phabricator.services.mozilla.com/D85982
1 parent 5b91ef7 commit 9c46265

21 files changed

+766
-45
lines changed

js/src/jit-test/lib/wasm-binary.js

+32-2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ const I32Code = 0x7f;
3939
const I64Code = 0x7e;
4040
const F32Code = 0x7d;
4141
const F64Code = 0x7c;
42+
const V128Code = 0x7b;
4243
const AnyFuncCode = 0x70;
4344
const AnyrefCode = 0x6f;
4445
const OptRefCode = 0x6c;
@@ -53,6 +54,7 @@ const CallCode = 0x10;
5354
const CallIndirectCode = 0x11;
5455
const DropCode = 0x1a;
5556
const SelectCode = 0x1b;
57+
const LocalGetCode = 0x20;
5658
const I32Load = 0x28;
5759
const I64Load = 0x29;
5860
const F32Load = 0x2a;
@@ -102,6 +104,27 @@ const RefNullCode = 0xd0;
102104
const RefIsNullCode = 0xd1;
103105
const RefFuncCode = 0xd2;
104106

107+
// SIMD opcodes
108+
const V128LoadCode = 0x00;
109+
const V128StoreCode = 0x0b;
110+
111+
// Experimental SIMD opcodes as of August, 2020.
112+
const I32x4DotSI16x8Code = 0xba;
113+
const F32x4CeilCode = 0xd8;
114+
const F32x4FloorCode = 0xd9;
115+
const F32x4TruncCode = 0xda;
116+
const F32x4NearestCode = 0xdb;
117+
const F64x2CeilCode = 0xdc;
118+
const F64x2FloorCode = 0xdd;
119+
const F64x2TruncCode = 0xde;
120+
const F64x2NearestCode = 0xdf;
121+
const F32x4PMinCode = 0xea;
122+
const F32x4PMaxCode = 0xeb;
123+
const F64x2PMinCode = 0xf6;
124+
const F64x2PMaxCode = 0xf7;
125+
const V128Load32ZeroCode = 0xfc;
126+
const V128Load64ZeroCode = 0xfd;
127+
105128
const FirstInvalidOpcode = 0xc5;
106129
const LastInvalidOpcode = 0xfa;
107130
const GcPrefix = 0xfb;
@@ -300,8 +323,15 @@ function exportSection(exports) {
300323
body.push(...varU32(exports.length));
301324
for (let exp of exports) {
302325
body.push(...string(exp.name));
303-
body.push(...varU32(FunctionCode));
304-
body.push(...varU32(exp.funcIndex));
326+
if (exp.hasOwnProperty("funcIndex")) {
327+
body.push(...varU32(FunctionCode));
328+
body.push(...varU32(exp.funcIndex));
329+
} else if (exp.hasOwnProperty("memIndex")) {
330+
body.push(...varU32(MemoryCode));
331+
body.push(...varU32(exp.memIndex));
332+
} else {
333+
throw "Bad export " + exp;
334+
}
305335
}
306336
return { name: exportId, body };
307337
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
// Experimental opcodes. We have no text parsing support for these yet. The
2+
// tests will be cleaned up and moved into ad-hack.js if the opcodes are
3+
// adopted.
4+
5+
// When simd is enabled by default in release builds we will flip the value of
6+
// SimdExperimentalEnabled to false in RELEASE_OR_BETA builds. At that point,
7+
// these tests will start failing in release or beta builds, and a guard
8+
// asserting !RELEASE_OR_BETA will have to be added above. That is how it
9+
// should be.
10+
11+
load(libdir + "wasm-binary.js");
12+
13+
function wasmEval(bytes, imports) {
14+
return new WebAssembly.Instance(new WebAssembly.Module(bytes), imports);
15+
}
16+
17+
function get(arr, loc, len) {
18+
let res = [];
19+
for ( let i=0; i < len; i++ ) {
20+
res.push(arr[loc+i]);
21+
}
22+
return res;
23+
}
24+
25+
function set(arr, loc, vals) {
26+
for ( let i=0; i < vals.length; i++ ) {
27+
if (arr instanceof BigInt64Array) {
28+
arr[loc+i] = BigInt(vals[i]);
29+
} else {
30+
arr[loc+i] = vals[i];
31+
}
32+
}
33+
}
34+
35+
function assertSame(got, expected) {
36+
assertEq(got.length, expected.length);
37+
for ( let i=0; i < got.length; i++ ) {
38+
let g = got[i];
39+
let e = expected[i];
40+
if (typeof g != typeof e) {
41+
if (typeof g == "bigint")
42+
e = BigInt(e);
43+
else if (typeof e == "bigint")
44+
g = BigInt(g);
45+
}
46+
assertEq(g, e);
47+
}
48+
}
49+
50+
function iota(len) {
51+
let xs = [];
52+
for ( let i=0 ; i < len ; i++ )
53+
xs.push(i);
54+
return xs;
55+
}
56+
57+
// Pseudo-minimum reference used for the PMin opcodes: yields y only when
// `y < x` compares true, otherwise x (so x is returned if either is NaN).
function pmin(x, y) {
    if (y < x)
        return y;
    return x;
}
58+
// Pseudo-maximum reference used for the PMax opcodes: yields y only when
// `x < y` compares true, otherwise x (so x is returned if either is NaN).
function pmax(x, y) {
    if (x < y)
        return y;
    return x;
}
59+
60+
// Reference result for F32x4FloorCode: floor of x, rounded to float32.
function ffloor(x) {
    const lowered = Math.floor(x);
    return Math.fround(lowered);
}
61+
// Reference result for F32x4CeilCode: ceiling of x, rounded to float32.
function fceil(x) {
    const raised = Math.ceil(x);
    return Math.fround(raised);
}
62+
// Reference result for F32x4TruncCode: x with its fractional part dropped
// (rounded toward zero), then rounded to float32.
function ftrunc(x) {
    const whole = Math.trunc(x);
    return Math.fround(whole);
}
63+
// Reference result for F32x4NearestCode.  Wasm `nearest` rounds exact ties to
// the even integer, whereas Math.round rounds ties toward +Infinity, so when
// Math.round landed on an odd integer from an exact tie we step back down to
// the even neighbor.  The result is then rounded to float32.
function fnearest(x) {
    let r = Math.round(x);
    // r - x is exactly 0.5 only for ties (Math.round always rounds them up).
    if (r - x == 0.5 && r % 2 != 0)
        r -= 1;
    return Math.fround(r);
}
64+
65+
// Reference result for F64x2FloorCode: floor of x as a double.
function dfloor(x) {
    const lowered = Math.floor(x);
    return lowered;
}
66+
// Reference result for F64x2CeilCode: ceiling of x as a double.
function dceil(x) {
    const raised = Math.ceil(x);
    return raised;
}
67+
// Reference result for F64x2TruncCode: x with its fractional part dropped
// (rounded toward zero), as a double.
function dtrunc(x) {
    const whole = Math.trunc(x);
    return whole;
}
68+
// Reference result for F64x2NearestCode.  Wasm `nearest` rounds exact ties to
// the even integer, whereas Math.round rounds ties toward +Infinity, so when
// Math.round landed on an odd integer from an exact tie we step back down to
// the even neighbor.
function dnearest(x) {
    let r = Math.round(x);
    // r - x is exactly 0.5 only for ties (Math.round always rounds them up).
    if (r - x == 0.5 && r % 2 != 0)
        r -= 1;
    return r;
}
69+
70+
const v2vSig = {args:[], ret:VoidCode};
71+
72+
function V128Load(addr) {
73+
return [I32ConstCode, varS32(addr),
74+
SimdPrefix, V128LoadCode, 4, varU32(0)]
75+
}
76+
77+
function V128StoreExpr(addr, v) {
78+
return [I32ConstCode, varS32(addr),
79+
...v,
80+
SimdPrefix, V128StoreCode, 4, varU32(0)];
81+
}
82+
83+
// Pseudo-min/max, https://github.com/WebAssembly/simd/pull/122
84+
var fxs = [5, 1, -4, 2];
85+
var fys = [6, 0, -7, 3];
86+
var dxs = [5, 1];
87+
var dys = [6, 0];
88+
89+
for ( let [opcode, xs, ys, operator] of [[F32x4PMinCode, fxs, fys, pmin],
90+
[F32x4PMaxCode, fxs, fys, pmax],
91+
[F64x2PMinCode, dxs, dys, pmin],
92+
[F64x2PMaxCode, dxs, dys, pmax]] ) {
93+
var k = xs.length;
94+
var ans = iota(k).map((i) => operator(xs[i], ys[i]))
95+
96+
var ins = wasmEval(moduleWithSections([
97+
sigSection([v2vSig]),
98+
declSection([0]),
99+
memorySection(1),
100+
exportSection([{funcIndex: 0, name: "run"},
101+
{memIndex: 0, name: "mem"}]),
102+
bodySection([
103+
funcBody({locals:[],
104+
body: [...V128StoreExpr(0, [...V128Load(16),
105+
...V128Load(32),
106+
SimdPrefix, varU32(opcode)])]})])]));
107+
108+
var mem = new (k == 4 ? Float32Array : Float64Array)(ins.exports.mem.buffer);
109+
set(mem, k, xs);
110+
set(mem, 2*k, ys);
111+
ins.exports.run();
112+
var result = get(mem, 0, k);
113+
assertSame(result, ans);
114+
}
115+
116+
// Widening integer dot product, https://github.com/WebAssembly/simd/pull/127
117+
118+
var ins = wasmEval(moduleWithSections([
119+
sigSection([v2vSig]),
120+
declSection([0]),
121+
memorySection(1),
122+
exportSection([{funcIndex: 0, name: "run"},
123+
{memIndex: 0, name: "mem"}]),
124+
bodySection([
125+
funcBody({locals:[],
126+
body: [...V128StoreExpr(0, [...V128Load(16),
127+
...V128Load(32),
128+
SimdPrefix, varU32(I32x4DotSI16x8Code)])]})])]));
129+
130+
var xs = [5, 1, -4, 2, 20, -15, 12, 3];
131+
var ys = [6, 0, -7, 3, 8, -1, -3, 7];
132+
var ans = [xs[0]*ys[0] + xs[1]*ys[1],
133+
xs[2]*ys[2] + xs[3]*ys[3],
134+
xs[4]*ys[4] + xs[5]*ys[5],
135+
xs[6]*ys[6] + xs[7]*ys[7]];
136+
137+
var mem16 = new Int16Array(ins.exports.mem.buffer);
138+
var mem32 = new Int32Array(ins.exports.mem.buffer);
139+
set(mem16, 8, xs);
140+
set(mem16, 16, ys);
141+
ins.exports.run();
142+
var result = get(mem32, 0, 4);
143+
assertSame(result, ans);
144+
145+
// Rounding, https://github.com/WebAssembly/simd/pull/232
146+
147+
var fxs = [5.1, -1.1, -4.3, 0];
148+
var dxs = [5.1, -1.1];
149+
150+
for ( let [opcode, xs, operator] of [[F32x4CeilCode, fxs, fceil],
151+
[F32x4FloorCode, fxs, ffloor],
152+
[F32x4TruncCode, fxs, ftrunc],
153+
[F32x4NearestCode, fxs, fnearest],
154+
[F64x2CeilCode, dxs, dceil],
155+
[F64x2FloorCode, dxs, dfloor],
156+
[F64x2TruncCode, dxs, dtrunc],
157+
[F64x2NearestCode, dxs, dnearest]] ) {
158+
var k = xs.length;
159+
var ans = xs.map(operator);
160+
161+
var ins = wasmEval(moduleWithSections([
162+
sigSection([v2vSig]),
163+
declSection([0]),
164+
memorySection(1),
165+
exportSection([{funcIndex: 0, name: "run"},
166+
{memIndex: 0, name: "mem"}]),
167+
bodySection([
168+
funcBody({locals:[],
169+
body: [...V128StoreExpr(0, [...V128Load(16),
170+
SimdPrefix, varU32(opcode)])]})])]));
171+
172+
var mem = new (k == 4 ? Float32Array : Float64Array)(ins.exports.mem.buffer);
173+
set(mem, k, xs);
174+
ins.exports.run();
175+
var result = get(mem, 0, k);
176+
assertSame(result, ans);
177+
}
178+
179+
// Zero-extending SIMD load, https://github.com/WebAssembly/simd/pull/237
180+
181+
for ( let [opcode, k, log2align, cons, cast] of [[V128Load32ZeroCode, 4, 2, Int32Array, Number],
182+
[V128Load64ZeroCode, 2, 3, BigInt64Array, BigInt]] ) {
183+
var ins = wasmEval(moduleWithSections([
184+
sigSection([v2vSig]),
185+
declSection([0]),
186+
memorySection(1),
187+
exportSection([{funcIndex: 0, name: "run"},
188+
{memIndex: 0, name: "mem"}]),
189+
bodySection([
190+
funcBody({locals:[],
191+
body: [...V128StoreExpr(0, [I32ConstCode, varU32(16),
192+
SimdPrefix, varU32(opcode), log2align, varU32(0)])]})])]));
193+
194+
var mem = new cons(ins.exports.mem.buffer);
195+
mem[k] = cast(37);
196+
ins.exports.run();
197+
var result = get(mem, 0, k);
198+
assertSame(result, iota(k).map((v) => v == 0 ? 37 : 0));
199+
}
200+

js/src/jit/MacroAssembler.h

+48
Original file line numberDiff line numberDiff line change
@@ -2524,6 +2524,54 @@ class MacroAssembler : public MacroAssemblerSpecific {
25242524
inline void unsignedWidenLowInt32x4(FloatRegister src, FloatRegister dest)
25252525
DEFINED_ON(x86_shared);
25262526

2527+
// Compare-based minimum/maximum (experimental as of August, 2020)
2528+
// https://github.com/WebAssembly/simd/pull/122
2529+
2530+
inline void pseudoMinFloat32x4(FloatRegister rhs, FloatRegister lhsDest)
2531+
DEFINED_ON(x86_shared);
2532+
2533+
inline void pseudoMinFloat64x2(FloatRegister rhs, FloatRegister lhsDest)
2534+
DEFINED_ON(x86_shared);
2535+
2536+
inline void pseudoMaxFloat32x4(FloatRegister rhs, FloatRegister lhsDest)
2537+
DEFINED_ON(x86_shared);
2538+
2539+
inline void pseudoMaxFloat64x2(FloatRegister rhs, FloatRegister lhsDest)
2540+
DEFINED_ON(x86_shared);
2541+
2542+
// Widening/pairwise integer dot product (experimental as of August, 2020)
2543+
// https://github.com/WebAssembly/simd/pull/127
2544+
2545+
inline void widenDotInt16x8(FloatRegister rhs, FloatRegister lhsDest)
2546+
DEFINED_ON(x86_shared);
2547+
2548+
// Floating point rounding (experimental as of August, 2020)
2549+
// https://github.com/WebAssembly/simd/pull/232
2550+
2551+
inline void ceilFloat32x4(FloatRegister src, FloatRegister dest)
2552+
DEFINED_ON(x86_shared);
2553+
2554+
inline void ceilFloat64x2(FloatRegister src, FloatRegister dest)
2555+
DEFINED_ON(x86_shared);
2556+
2557+
inline void floorFloat32x4(FloatRegister src, FloatRegister dest)
2558+
DEFINED_ON(x86_shared);
2559+
2560+
inline void floorFloat64x2(FloatRegister src, FloatRegister dest)
2561+
DEFINED_ON(x86_shared);
2562+
2563+
inline void truncFloat32x4(FloatRegister src, FloatRegister dest)
2564+
DEFINED_ON(x86_shared);
2565+
2566+
inline void truncFloat64x2(FloatRegister src, FloatRegister dest)
2567+
DEFINED_ON(x86_shared);
2568+
2569+
inline void nearestFloat32x4(FloatRegister src, FloatRegister dest)
2570+
DEFINED_ON(x86_shared);
2571+
2572+
inline void nearestFloat64x2(FloatRegister src, FloatRegister dest)
2573+
DEFINED_ON(x86_shared);
2574+
25272575
public:
25282576
// ========================================================================
25292577
// Truncate floating point.

js/src/jit/arm/MacroAssembler-arm.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -6028,6 +6028,7 @@ void MacroAssemblerARM::wasmLoadImpl(const wasm::MemoryAccessDesc& access,
60286028
}
60296029
}
60306030
} else {
6031+
MOZ_ASSERT(!access.isZeroExtendSimd128Load());
60316032
bool isFloat = output.isFloat();
60326033
if (isFloat) {
60336034
MOZ_ASSERT((byteSize == 4) == output.fpu().isSingle());

js/src/jit/arm64/MacroAssembler-arm64.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -360,9 +360,11 @@ void MacroAssemblerCompat::wasmLoadImpl(const wasm::MemoryAccessDesc& access,
360360
Ldr(SelectGPReg(outany, out64), srcAddr);
361361
break;
362362
case Scalar::Float32:
363+
MOZ_ASSERT(!access.isZeroExtendSimd128Load());
363364
Ldr(SelectFPReg(outany, out64, 32), srcAddr);
364365
break;
365366
case Scalar::Float64:
367+
MOZ_ASSERT(!access.isZeroExtendSimd128Load());
366368
Ldr(SelectFPReg(outany, out64, 64), srcAddr);
367369
break;
368370
case Scalar::Uint8Clamped:

js/src/jit/mips-shared/MacroAssembler-mips-shared.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -2126,9 +2126,11 @@ void MacroAssemblerMIPSShared::wasmLoadImpl(
21262126
isSigned = false;
21272127
break;
21282128
case Scalar::Float64:
2129+
MOZ_ASSERT(!access.isZeroExtendSimd128Load());
21292130
isFloat = true;
21302131
break;
21312132
case Scalar::Float32:
2133+
MOZ_ASSERT(!access.isZeroExtendSimd128Load());
21322134
isFloat = true;
21332135
break;
21342136
default:

js/src/jit/shared/Assembler-shared.h

+10-1
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,7 @@ class MemoryAccessDesc {
492492
Scalar::Type type_;
493493
jit::Synchronization sync_;
494494
wasm::BytecodeOffset trapOffset_;
495+
bool zeroExtendSimd128Load_;
495496

496497
public:
497498
explicit MemoryAccessDesc(
@@ -502,7 +503,8 @@ class MemoryAccessDesc {
502503
align_(align),
503504
type_(type),
504505
sync_(sync),
505-
trapOffset_(trapOffset) {
506+
trapOffset_(trapOffset),
507+
zeroExtendSimd128Load_(false) {
506508
MOZ_ASSERT(mozilla::IsPowerOfTwo(align));
507509
}
508510

@@ -513,6 +515,13 @@ class MemoryAccessDesc {
513515
const jit::Synchronization& sync() const { return sync_; }
514516
BytecodeOffset trapOffset() const { return trapOffset_; }
515517
bool isAtomic() const { return !sync_.isNone(); }
518+
bool isZeroExtendSimd128Load() const { return zeroExtendSimd128Load_; }
519+
520+
void setZeroExtendSimd128Load() {
521+
MOZ_ASSERT(type() == Scalar::Float32 || type() == Scalar::Float64);
522+
MOZ_ASSERT(!isAtomic());
523+
zeroExtendSimd128Load_ = true;
524+
}
516525

517526
void clearOffset() { offset_ = 0; }
518527
void setOffset(uint32_t offset) { offset_ = offset; }

js/src/jit/x64/MacroAssembler-x64.cpp

+4-2
Original file line numberDiff line numberDiff line change
@@ -596,10 +596,12 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
596596
movl(srcAddr, out.gpr());
597597
break;
598598
case Scalar::Float32:
599-
loadFloat32(srcAddr, out.fpu());
599+
// vmovss does the right thing also for access.isZeroExtendSimd128Load()
600+
vmovss(srcAddr, out.fpu());
600601
break;
601602
case Scalar::Float64:
602-
loadDouble(srcAddr, out.fpu());
603+
// vmovsd does the right thing also for access.isZeroExtendSimd128Load()
604+
vmovsd(srcAddr, out.fpu());
603605
break;
604606
case Scalar::Simd128:
605607
MacroAssemblerX64::loadUnalignedSimd128(srcAddr, out.fpu());

0 commit comments

Comments
 (0)