Skip to content

Commit 9f426ce

Browse files
authored
Merge pull request #1416 from afonso360/aarch64-intrinsics-1
Implement AArch64 intrinsics necessary for simd-json
2 parents ef37036 + 209476e commit 9f426ce

File tree

5 files changed

+356
-21
lines changed

5 files changed

+356
-21
lines changed

build_system/tests.rs

+1
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ const BASE_SYSROOT_SUITE: &[TestCase] = &[
9999
TestCase::build_bin_and_run("aot.mod_bench", "example/mod_bench.rs", &[]),
100100
TestCase::build_bin_and_run("aot.issue-72793", "example/issue-72793.rs", &[]),
101101
TestCase::build_bin("aot.issue-59326", "example/issue-59326.rs"),
102+
TestCase::build_bin_and_run("aot.neon", "example/neon.rs", &[]),
102103
];
103104

104105
pub(crate) static RAND_REPO: GitRepo = GitRepo::github(

config.txt

+1
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ aot.float-minmax-pass
4242
aot.mod_bench
4343
aot.issue-72793
4444
aot.issue-59326
45+
aot.neon
4546

4647
testsuite.extended_sysroot
4748
test.rust-random/rand

example/neon.rs

+234
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
// Most of these tests are copied from https://github.com/japaric/stdsimd/blob/0f4413d01c4f0c3ffbc5a69e9a37fbc7235b31a9/coresimd/arm/neon.rs
2+
3+
#![feature(portable_simd)]
4+
5+
#[cfg(target_arch = "aarch64")]
6+
use std::arch::aarch64::*;
7+
use std::mem::transmute;
8+
use std::simd::*;
9+
10+
#[cfg(target_arch = "aarch64")]
11+
unsafe fn test_vpmin_s8() {
12+
let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]);
13+
let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
14+
let e = i8x8::from([-2, -4, 5, 7, 0, 2, 4, 6]);
15+
let r: i8x8 = transmute(vpmin_s8(transmute(a), transmute(b)));
16+
assert_eq!(r, e);
17+
}
18+
19+
#[cfg(target_arch = "aarch64")]
20+
unsafe fn test_vpmin_s16() {
21+
let a = i16x4::from([1, 2, 3, -4]);
22+
let b = i16x4::from([0, 3, 2, 5]);
23+
let e = i16x4::from([1, -4, 0, 2]);
24+
let r: i16x4 = transmute(vpmin_s16(transmute(a), transmute(b)));
25+
assert_eq!(r, e);
26+
}
27+
28+
#[cfg(target_arch = "aarch64")]
29+
unsafe fn test_vpmin_s32() {
30+
let a = i32x2::from([1, -2]);
31+
let b = i32x2::from([0, 3]);
32+
let e = i32x2::from([-2, 0]);
33+
let r: i32x2 = transmute(vpmin_s32(transmute(a), transmute(b)));
34+
assert_eq!(r, e);
35+
}
36+
37+
#[cfg(target_arch = "aarch64")]
38+
unsafe fn test_vpmin_u8() {
39+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
40+
let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
41+
let e = u8x8::from([1, 3, 5, 7, 0, 2, 4, 6]);
42+
let r: u8x8 = transmute(vpmin_u8(transmute(a), transmute(b)));
43+
assert_eq!(r, e);
44+
}
45+
46+
#[cfg(target_arch = "aarch64")]
47+
unsafe fn test_vpmin_u16() {
48+
let a = u16x4::from([1, 2, 3, 4]);
49+
let b = u16x4::from([0, 3, 2, 5]);
50+
let e = u16x4::from([1, 3, 0, 2]);
51+
let r: u16x4 = transmute(vpmin_u16(transmute(a), transmute(b)));
52+
assert_eq!(r, e);
53+
}
54+
55+
#[cfg(target_arch = "aarch64")]
56+
unsafe fn test_vpmin_u32() {
57+
let a = u32x2::from([1, 2]);
58+
let b = u32x2::from([0, 3]);
59+
let e = u32x2::from([1, 0]);
60+
let r: u32x2 = transmute(vpmin_u32(transmute(a), transmute(b)));
61+
assert_eq!(r, e);
62+
}
63+
64+
#[cfg(target_arch = "aarch64")]
65+
unsafe fn test_vpmin_f32() {
66+
let a = f32x2::from([1., -2.]);
67+
let b = f32x2::from([0., 3.]);
68+
let e = f32x2::from([-2., 0.]);
69+
let r: f32x2 = transmute(vpmin_f32(transmute(a), transmute(b)));
70+
assert_eq!(r, e);
71+
}
72+
73+
#[cfg(target_arch = "aarch64")]
74+
unsafe fn test_vpmax_s8() {
75+
let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]);
76+
let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
77+
let e = i8x8::from([1, 3, 6, 8, 3, 5, 7, 9]);
78+
let r: i8x8 = transmute(vpmax_s8(transmute(a), transmute(b)));
79+
assert_eq!(r, e);
80+
}
81+
82+
#[cfg(target_arch = "aarch64")]
83+
unsafe fn test_vpmax_s16() {
84+
let a = i16x4::from([1, 2, 3, -4]);
85+
let b = i16x4::from([0, 3, 2, 5]);
86+
let e = i16x4::from([2, 3, 3, 5]);
87+
let r: i16x4 = transmute(vpmax_s16(transmute(a), transmute(b)));
88+
assert_eq!(r, e);
89+
}
90+
91+
#[cfg(target_arch = "aarch64")]
92+
unsafe fn test_vpmax_s32() {
93+
let a = i32x2::from([1, -2]);
94+
let b = i32x2::from([0, 3]);
95+
let e = i32x2::from([1, 3]);
96+
let r: i32x2 = transmute(vpmax_s32(transmute(a), transmute(b)));
97+
assert_eq!(r, e);
98+
}
99+
100+
#[cfg(target_arch = "aarch64")]
101+
unsafe fn test_vpmax_u8() {
102+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
103+
let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
104+
let e = u8x8::from([2, 4, 6, 8, 3, 5, 7, 9]);
105+
let r: u8x8 = transmute(vpmax_u8(transmute(a), transmute(b)));
106+
assert_eq!(r, e);
107+
}
108+
109+
#[cfg(target_arch = "aarch64")]
110+
unsafe fn test_vpmax_u16() {
111+
let a = u16x4::from([1, 2, 3, 4]);
112+
let b = u16x4::from([0, 3, 2, 5]);
113+
let e = u16x4::from([2, 4, 3, 5]);
114+
let r: u16x4 = transmute(vpmax_u16(transmute(a), transmute(b)));
115+
assert_eq!(r, e);
116+
}
117+
118+
#[cfg(target_arch = "aarch64")]
119+
unsafe fn test_vpmax_u32() {
120+
let a = u32x2::from([1, 2]);
121+
let b = u32x2::from([0, 3]);
122+
let e = u32x2::from([2, 3]);
123+
let r: u32x2 = transmute(vpmax_u32(transmute(a), transmute(b)));
124+
assert_eq!(r, e);
125+
}
126+
127+
#[cfg(target_arch = "aarch64")]
128+
unsafe fn test_vpmax_f32() {
129+
let a = f32x2::from([1., -2.]);
130+
let b = f32x2::from([0., 3.]);
131+
let e = f32x2::from([1., 3.]);
132+
let r: f32x2 = transmute(vpmax_f32(transmute(a), transmute(b)));
133+
assert_eq!(r, e);
134+
}
135+
136+
#[cfg(target_arch = "aarch64")]
137+
unsafe fn test_vpadd_s16() {
138+
let a = i16x4::from([1, 2, 3, 4]);
139+
let b = i16x4::from([0, -1, -2, -3]);
140+
let r: i16x4 = transmute(vpadd_s16(transmute(a), transmute(b)));
141+
let e = i16x4::from([3, 7, -1, -5]);
142+
assert_eq!(r, e);
143+
}
144+
#[cfg(target_arch = "aarch64")]
145+
unsafe fn test_vpadd_s32() {
146+
let a = i32x2::from([1, 2]);
147+
let b = i32x2::from([0, -1]);
148+
let r: i32x2 = transmute(vpadd_s32(transmute(a), transmute(b)));
149+
let e = i32x2::from([3, -1]);
150+
assert_eq!(r, e);
151+
}
152+
#[cfg(target_arch = "aarch64")]
153+
unsafe fn test_vpadd_s8() {
154+
let a = i8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
155+
let b = i8x8::from([0, -1, -2, -3, -4, -5, -6, -7]);
156+
let r: i8x8 = transmute(vpadd_s8(transmute(a), transmute(b)));
157+
let e = i8x8::from([3, 7, 11, 15, -1, -5, -9, -13]);
158+
assert_eq!(r, e);
159+
}
160+
#[cfg(target_arch = "aarch64")]
161+
unsafe fn test_vpadd_u16() {
162+
let a = u16x4::from([1, 2, 3, 4]);
163+
let b = u16x4::from([30, 31, 32, 33]);
164+
let r: u16x4 = transmute(vpadd_u16(transmute(a), transmute(b)));
165+
let e = u16x4::from([3, 7, 61, 65]);
166+
assert_eq!(r, e);
167+
}
168+
#[cfg(target_arch = "aarch64")]
169+
unsafe fn test_vpadd_u32() {
170+
let a = u32x2::from([1, 2]);
171+
let b = u32x2::from([30, 31]);
172+
let r: u32x2 = transmute(vpadd_u32(transmute(a), transmute(b)));
173+
let e = u32x2::from([3, 61]);
174+
assert_eq!(r, e);
175+
}
176+
#[cfg(target_arch = "aarch64")]
177+
unsafe fn test_vpadd_u8() {
178+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
179+
let b = u8x8::from([30, 31, 32, 33, 34, 35, 36, 37]);
180+
let r: u8x8 = transmute(vpadd_u8(transmute(a), transmute(b)));
181+
let e = u8x8::from([3, 7, 11, 15, 61, 65, 69, 73]);
182+
assert_eq!(r, e);
183+
}
184+
185+
#[cfg(target_arch = "aarch64")]
186+
unsafe fn test_vqsub_u8() {
187+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]);
188+
let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]);
189+
let r: u8x8 = transmute(vqsub_u8(transmute(a), transmute(b)));
190+
let e = u8x8::from([0, 1, 2, 3, 0, 0, 0, 218]);
191+
assert_eq!(r, e);
192+
}
193+
194+
#[cfg(target_arch = "aarch64")]
195+
unsafe fn test_vqadd_u8() {
196+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]);
197+
let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]);
198+
let r: u8x8 = transmute(vqadd_u8(transmute(a), transmute(b)));
199+
let e = u8x8::from([31, 3, 4, 5, 39, 0xff, 43, 0xff]);
200+
assert_eq!(r, e);
201+
}
202+
203+
#[cfg(target_arch = "aarch64")]
204+
fn main() {
205+
unsafe {
206+
test_vpmin_s8();
207+
test_vpmin_s16();
208+
test_vpmin_s32();
209+
test_vpmin_u8();
210+
test_vpmin_u16();
211+
test_vpmin_u32();
212+
test_vpmin_f32();
213+
test_vpmax_s8();
214+
test_vpmax_s16();
215+
test_vpmax_s32();
216+
test_vpmax_u8();
217+
test_vpmax_u16();
218+
test_vpmax_u32();
219+
test_vpmax_f32();
220+
221+
test_vpadd_s16();
222+
test_vpadd_s32();
223+
test_vpadd_s8();
224+
test_vpadd_u16();
225+
test_vpadd_u32();
226+
test_vpadd_u8();
227+
228+
test_vqsub_u8();
229+
test_vqadd_u8();
230+
}
231+
}
232+
233+
// Fallback entry point for non-AArch64 targets: the NEON tests in this file
// are compiled out there, so there is nothing to run.
#[cfg(not(target_arch = "aarch64"))]
fn main() {}

src/intrinsics/llvm_aarch64.rs

+90-21
Original file line numberDiff line numberDiff line change
@@ -44,15 +44,19 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
4444
});
4545
}
4646

47-
_ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") => {
47+
_ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v")
48+
|| intrinsic.starts_with("llvm.aarch64.neon.uqadd.v") =>
49+
{
4850
intrinsic_args!(fx, args => (x, y); intrinsic);
4951

5052
simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| {
5153
crate::num::codegen_saturating_int_binop(fx, BinOp::Add, x_lane, y_lane)
5254
});
5355
}
5456

55-
_ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") => {
57+
_ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v")
58+
|| intrinsic.starts_with("llvm.aarch64.neon.uqsub.v") =>
59+
{
5660
intrinsic_args!(fx, args => (x, y); intrinsic);
5761

5862
simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| {
@@ -156,6 +160,90 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
156160
});
157161
}
158162

163+
_ if intrinsic.starts_with("llvm.aarch64.neon.umaxp.v") => {
164+
intrinsic_args!(fx, args => (x, y); intrinsic);
165+
166+
simd_horizontal_pair_for_each_lane(
167+
fx,
168+
x,
169+
y,
170+
ret,
171+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umax(x_lane, y_lane),
172+
);
173+
}
174+
175+
_ if intrinsic.starts_with("llvm.aarch64.neon.smaxp.v") => {
176+
intrinsic_args!(fx, args => (x, y); intrinsic);
177+
178+
simd_horizontal_pair_for_each_lane(
179+
fx,
180+
x,
181+
y,
182+
ret,
183+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smax(x_lane, y_lane),
184+
);
185+
}
186+
187+
_ if intrinsic.starts_with("llvm.aarch64.neon.uminp.v") => {
188+
intrinsic_args!(fx, args => (x, y); intrinsic);
189+
190+
simd_horizontal_pair_for_each_lane(
191+
fx,
192+
x,
193+
y,
194+
ret,
195+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umin(x_lane, y_lane),
196+
);
197+
}
198+
199+
_ if intrinsic.starts_with("llvm.aarch64.neon.sminp.v") => {
200+
intrinsic_args!(fx, args => (x, y); intrinsic);
201+
202+
simd_horizontal_pair_for_each_lane(
203+
fx,
204+
x,
205+
y,
206+
ret,
207+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smin(x_lane, y_lane),
208+
);
209+
}
210+
211+
_ if intrinsic.starts_with("llvm.aarch64.neon.fminp.v") => {
212+
intrinsic_args!(fx, args => (x, y); intrinsic);
213+
214+
simd_horizontal_pair_for_each_lane(
215+
fx,
216+
x,
217+
y,
218+
ret,
219+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmin(x_lane, y_lane),
220+
);
221+
}
222+
223+
_ if intrinsic.starts_with("llvm.aarch64.neon.fmaxp.v") => {
224+
intrinsic_args!(fx, args => (x, y); intrinsic);
225+
226+
simd_horizontal_pair_for_each_lane(
227+
fx,
228+
x,
229+
y,
230+
ret,
231+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmax(x_lane, y_lane),
232+
);
233+
}
234+
235+
_ if intrinsic.starts_with("llvm.aarch64.neon.addp.v") => {
236+
intrinsic_args!(fx, args => (x, y); intrinsic);
237+
238+
simd_horizontal_pair_for_each_lane(
239+
fx,
240+
x,
241+
y,
242+
ret,
243+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().iadd(x_lane, y_lane),
244+
);
245+
}
246+
159247
// FIXME generalize vector types
160248
"llvm.aarch64.neon.tbl1.v16i8" => {
161249
intrinsic_args!(fx, args => (t, idx); intrinsic);
@@ -172,25 +260,6 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
172260
}
173261
}
174262

175-
// FIXME generalize vector types
176-
"llvm.aarch64.neon.umaxp.v16i8" => {
177-
intrinsic_args!(fx, args => (a, b); intrinsic);
178-
179-
// FIXME add helper for horizontal pairwise operations
180-
for i in 0..8 {
181-
let lane1 = a.value_lane(fx, i * 2).load_scalar(fx);
182-
let lane2 = a.value_lane(fx, i * 2 + 1).load_scalar(fx);
183-
let res = fx.bcx.ins().umax(lane1, lane2);
184-
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
185-
}
186-
for i in 0..8 {
187-
let lane1 = b.value_lane(fx, i * 2).load_scalar(fx);
188-
let lane2 = b.value_lane(fx, i * 2 + 1).load_scalar(fx);
189-
let res = fx.bcx.ins().umax(lane1, lane2);
190-
ret.place_lane(fx, 8 + i).to_ptr().store(fx, res, MemFlags::trusted());
191-
}
192-
}
193-
194263
/*
195264
_ if intrinsic.starts_with("llvm.aarch64.neon.sshl.v")
196265
|| intrinsic.starts_with("llvm.aarch64.neon.sqshl.v")

0 commit comments

Comments
 (0)