Skip to content

Commit 93b28c2

Browse files
tlively, Hans Winderix
authored and
Hans Winderix
committed
[WebAssembly] Implement prototype v128.load{32,64}_zero instructions
Specified in WebAssembly/simd#237, these instructions load the first vector lane from memory and zero the other lanes. Since these instructions are not officially part of the SIMD proposal, they are only available on an opt-in basis via LLVM intrinsics and clang builtin functions. If these instructions are merged to the proposal, this implementation will change so that the instructions will be generated from normal IR. At that point the intrinsics and builtin functions would be removed. This PR also changes the opcodes for the experimental f32x4.qfm{a,s} instructions because their opcodes conflicted with those of the v128.load{32,64}_zero instructions. The new opcodes were chosen to match those used in V8. Differential Revision: https://reviews.llvm.org/D84820
1 parent b3ce49b commit 93b28c2

File tree

10 files changed

+334
-12
lines changed

10 files changed

+334
-12
lines changed

clang/include/clang/Basic/BuiltinsWebAssembly.def

+3
Original file line numberDiff line numberDiff line change
@@ -169,5 +169,8 @@ TARGET_BUILTIN(__builtin_wasm_narrow_u_i8x16_i16x8, "V16cV8sV8s", "nc", "simd128
169169
TARGET_BUILTIN(__builtin_wasm_narrow_s_i16x8_i32x4, "V8sV4iV4i", "nc", "simd128")
170170
TARGET_BUILTIN(__builtin_wasm_narrow_u_i16x8_i32x4, "V8sV4iV4i", "nc", "simd128")
171171

172+
TARGET_BUILTIN(__builtin_wasm_load32_zero, "V4ii*", "nU", "simd128")
173+
TARGET_BUILTIN(__builtin_wasm_load64_zero, "V2LLiLLi*", "nU", "simd128")
174+
172175
#undef BUILTIN
173176
#undef TARGET_BUILTIN

clang/lib/CodeGen/CGBuiltin.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -16497,6 +16497,16 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
1649716497
CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Low->getType()});
1649816498
return Builder.CreateCall(Callee, {Low, High});
1649916499
}
16500+
case WebAssembly::BI__builtin_wasm_load32_zero: {
16501+
Value *Ptr = EmitScalarExpr(E->getArg(0));
16502+
Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_load32_zero);
16503+
return Builder.CreateCall(Callee, {Ptr});
16504+
}
16505+
case WebAssembly::BI__builtin_wasm_load64_zero: {
16506+
Value *Ptr = EmitScalarExpr(E->getArg(0));
16507+
Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_load64_zero);
16508+
return Builder.CreateCall(Callee, {Ptr});
16509+
}
1650016510
case WebAssembly::BI__builtin_wasm_shuffle_v8x16: {
1650116511
Value *Ops[18];
1650216512
size_t OpIdx = 0;

clang/test/CodeGen/builtins-wasm.c

+12
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,18 @@ i16x8 narrow_u_i16x8_i32x4(i32x4 low, i32x4 high) {
737737
// WEBASSEMBLY: ret
738738
}
739739

740+
i32x4 load32_zero(int *p) {
741+
return __builtin_wasm_load32_zero(p);
742+
// WEBASSEMBLY: call <4 x i32> @llvm.wasm.load32.zero(i32* %p)
743+
// WEBASSEMBLY: ret
744+
}
745+
746+
i64x2 load64_zero(long long *p) {
747+
return __builtin_wasm_load64_zero(p);
748+
// WEBASSEMBLY: call <2 x i64> @llvm.wasm.load64.zero(i64* %p)
749+
// WEBASSEMBLY: ret
750+
}
751+
740752
i8x16 swizzle_v8x16(i8x16 x, i8x16 y) {
741753
return __builtin_wasm_swizzle_v8x16(x, y);
742754
// WEBASSEMBLY: call <16 x i8> @llvm.wasm.swizzle(<16 x i8> %x, <16 x i8> %y)

llvm/include/llvm/IR/IntrinsicsWebAssembly.td

+14
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,20 @@ def int_wasm_nearest :
190190
[LLVMMatchType<0>],
191191
[IntrNoMem, IntrSpeculatable]>;
192192

193+
// TODO: Replace these intrinsics with normal ISel patterns once the
194+
// load_zero instructions are merged to the proposal.
195+
def int_wasm_load32_zero :
196+
Intrinsic<[llvm_v4i32_ty],
197+
[LLVMPointerType<llvm_i32_ty>],
198+
[IntrReadMem, IntrArgMemOnly],
199+
"", [SDNPMemOperand]>;
200+
201+
def int_wasm_load64_zero :
202+
Intrinsic<[llvm_v2i64_ty],
203+
[LLVMPointerType<llvm_i64_ty>],
204+
[IntrReadMem, IntrArgMemOnly],
205+
"", [SDNPMemOperand]>;
206+
193207
//===----------------------------------------------------------------------===//
194208
// Thread-local storage intrinsics
195209
//===----------------------------------------------------------------------===//

llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h

+2
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
232232
WASM_LOAD_STORE(ATOMIC_NOTIFY)
233233
WASM_LOAD_STORE(ATOMIC_WAIT_I32)
234234
WASM_LOAD_STORE(LOAD_SPLAT_v32x4)
235+
WASM_LOAD_STORE(LOAD_ZERO_v4i32)
235236
return 2;
236237
WASM_LOAD_STORE(LOAD_I64)
237238
WASM_LOAD_STORE(LOAD_F64)
@@ -254,6 +255,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
254255
WASM_LOAD_STORE(LOAD_EXTEND_U_v4i32)
255256
WASM_LOAD_STORE(LOAD_EXTEND_S_v2i64)
256257
WASM_LOAD_STORE(LOAD_EXTEND_U_v2i64)
258+
WASM_LOAD_STORE(LOAD_ZERO_v2i64)
257259
return 3;
258260
WASM_LOAD_STORE(LOAD_V128)
259261
WASM_LOAD_STORE(STORE_V128)

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

+9
Original file line numberDiff line numberDiff line change
@@ -675,6 +675,15 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
675675
Info.align = Align(8);
676676
Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad;
677677
return true;
678+
case Intrinsic::wasm_load32_zero:
679+
case Intrinsic::wasm_load64_zero:
680+
Info.opc = ISD::INTRINSIC_W_CHAIN;
681+
Info.memVT = Intrinsic == Intrinsic::wasm_load32_zero ? MVT::i32 : MVT::i64;
682+
Info.ptrVal = I.getArgOperand(0);
683+
Info.offset = 0;
684+
Info.align = Info.memVT == MVT::i32 ? Align(4) : Align(8);
685+
Info.flags = MachineMemOperand::MOLoad;
686+
return true;
678687
default:
679688
return false;
680689
}

llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ defm LOAD_F64 : WebAssemblyLoad<F64, "f64.load", 0x2b, []>;
7070
multiclass LoadPatNoOffset<ValueType ty, PatFrag kind, string inst> {
7171
def : Pat<(ty (kind I32:$addr)), (!cast<NI>(inst # "_A32") 0, 0, I32:$addr)>,
7272
Requires<[HasAddr32]>;
73-
def : Pat<(ty (kind I64:$addr)), (!cast<NI>(inst # "_A64") 0, 0, I64:$addr)>,
73+
def : Pat<(ty (kind (i64 I64:$addr))), (!cast<NI>(inst # "_A64") 0, 0, I64:$addr)>,
7474
Requires<[HasAddr64]>;
7575
}
7676

llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td

+44-6
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,43 @@ defm : LoadPatGlobalAddrOffOnly<types[0], !cast<PatFrag>(exts[0]#types[1]),
163163
"LOAD_EXTEND"#exts[1]#"_"#types[0]>;
164164
}
165165

166+
// Load lane into zero vector
167+
multiclass SIMDLoadZero<ValueType vec_t, string name, bits<32> simdop> {
168+
let mayLoad = 1, UseNamedOperandTable = 1 in {
169+
defm LOAD_ZERO_#vec_t#_A32 :
170+
SIMD_I<(outs V128:$dst),
171+
(ins P2Align:$p2align, offset32_op:$off, I32:$addr),
172+
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
173+
name#"\t$dst, ${off}(${addr})$p2align",
174+
name#"\t$off$p2align", simdop>;
175+
defm LOAD_ZERO_#vec_t#_A64 :
176+
SIMD_I<(outs V128:$dst),
177+
(ins P2Align:$p2align, offset64_op:$off, I64:$addr),
178+
(outs), (ins P2Align:$p2align, offset64_op:$off), [],
179+
name#"\t$dst, ${off}(${addr})$p2align",
180+
name#"\t$off$p2align", simdop>;
181+
} // mayLoad = 1, UseNamedOperandTable = 1
182+
}
183+
184+
// TODO: Also support v4f32 and v2f64 once the instructions are merged
185+
// to the proposal
186+
defm "" : SIMDLoadZero<v4i32, "v128.load32_zero", 252>;
187+
defm "" : SIMDLoadZero<v2i64, "v128.load64_zero", 253>;
188+
189+
defm : LoadPatNoOffset<v4i32, int_wasm_load32_zero, "LOAD_ZERO_v4i32">;
190+
defm : LoadPatNoOffset<v2i64, int_wasm_load64_zero, "LOAD_ZERO_v2i64">;
191+
192+
defm : LoadPatImmOff<v4i32, int_wasm_load32_zero, regPlusImm, "LOAD_ZERO_v4i32">;
193+
defm : LoadPatImmOff<v2i64, int_wasm_load64_zero, regPlusImm, "LOAD_ZERO_v2i64">;
194+
195+
defm : LoadPatImmOff<v4i32, int_wasm_load32_zero, or_is_add, "LOAD_ZERO_v4i32">;
196+
defm : LoadPatImmOff<v2i64, int_wasm_load64_zero, or_is_add, "LOAD_ZERO_v2i64">;
197+
198+
defm : LoadPatOffsetOnly<v4i32, int_wasm_load32_zero, "LOAD_ZERO_v4i32">;
199+
defm : LoadPatOffsetOnly<v2i64, int_wasm_load64_zero, "LOAD_ZERO_v2i64">;
200+
201+
defm : LoadPatGlobalAddrOffOnly<v4i32, int_wasm_load32_zero, "LOAD_ZERO_v4i32">;
202+
defm : LoadPatGlobalAddrOffOnly<v2i64, int_wasm_load64_zero, "LOAD_ZERO_v2i64">;
166203

167204
// Store: v128.store
168205
let mayStore = 1, UseNamedOperandTable = 1 in {
@@ -800,7 +837,7 @@ let isCommutable = 1 in
800837
defm DOT : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins),
801838
[(set V128:$dst, (int_wasm_dot V128:$lhs, V128:$rhs))],
802839
"i32x4.dot_i16x8_s\t$dst, $lhs, $rhs", "i32x4.dot_i16x8_s",
803-
180>;
840+
186>;
804841

805842
//===----------------------------------------------------------------------===//
806843
// Floating-point unary arithmetic
@@ -1038,20 +1075,21 @@ def : Pat<(t1 (bitconvert (t2 V128:$v))), (t1 V128:$v)>;
10381075
// Quasi-Fused Multiply-Add and Multiply-Subtract (QFMA/QFMS)
10391076
//===----------------------------------------------------------------------===//
10401077

1041-
multiclass SIMDQFM<ValueType vec_t, string vec, bits<32> baseInst> {
1078+
multiclass SIMDQFM<ValueType vec_t, string vec, bits<32> simdopA,
1079+
bits<32> simdopS> {
10421080
defm QFMA_#vec_t :
10431081
SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c),
10441082
(outs), (ins),
10451083
[(set (vec_t V128:$dst),
10461084
(int_wasm_qfma (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))],
1047-
vec#".qfma\t$dst, $a, $b, $c", vec#".qfma", baseInst>;
1085+
vec#".qfma\t$dst, $a, $b, $c", vec#".qfma", simdopA>;
10481086
defm QFMS_#vec_t :
10491087
SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c),
10501088
(outs), (ins),
10511089
[(set (vec_t V128:$dst),
10521090
(int_wasm_qfms (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))],
1053-
vec#".qfms\t$dst, $a, $b, $c", vec#".qfms", !add(baseInst, 1)>;
1091+
vec#".qfms\t$dst, $a, $b, $c", vec#".qfms", simdopS>;
10541092
}
10551093

1056-
defm "" : SIMDQFM<v4f32, "f32x4", 252>;
1057-
defm "" : SIMDQFM<v2f64, "f64x2", 254>;
1094+
defm "" : SIMDQFM<v4f32, "f32x4", 180, 212>;
1095+
defm "" : SIMDQFM<v2f64, "f64x2", 254, 255>;

0 commit comments

Comments
 (0)