|
| 1 | +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +bf16 \ |
| 2 | +// RUN: -fclangir -disable-O0-optnone \ |
| 3 | +// RUN: -flax-vector-conversions=none -emit-cir -o %t.cir %s |
| 4 | +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s |
| 5 | + |
| 6 | +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +bf16 \ |
| 7 | +// RUN: -fclangir -disable-O0-optnone \ |
| 8 | +// RUN: -flax-vector-conversions=none -emit-llvm -fno-clangir-call-conv-lowering -o - %s \ |
| 9 | +// RUN: | opt -S -passes=mem2reg,simplifycfg -o %t.ll |
| 10 | +// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s |
| 11 | + |
| 12 | +// REQUIRES: aarch64-registered-target || arm-registered-target |
| 13 | + |
| 14 | +// This test mimics clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c, which eventually |
| 15 | +// CIR shall be able to support fully. Since this is going to take some time to converge, |
| 16 | +// the unsupported/NYI code is commented out, so that we can incrementally improve this. |
| 17 | +// The NYI filecheck used contains the LLVM output from OG codegen that should guide the |
| 18 | +// correct result when implementing this into the CIR pipeline. |
| 19 | + |
| 20 | +#include <arm_neon.h> |
| 21 | + |
| 22 | +// CHECK-LABEL: @test_vcreate_bf16( |
| 23 | +// CHECK-NEXT: entry: |
| 24 | +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A:%.*]] to <4 x bfloat> |
| 25 | +// CHECK-NEXT: ret <4 x bfloat> [[TMP0]] |
| 26 | +// |
| 27 | +// bfloat16x4_t test_vcreate_bf16(uint64_t a) { |
| 28 | +// return vcreate_bf16(a); |
| 29 | +// } |
| 30 | + |
| 31 | +// CHECK-LABEL: @test_vdup_n_bf16( |
| 32 | +// CHECK-NEXT: entry: |
| 33 | +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x bfloat> poison, bfloat [[V:%.*]], i32 0 |
| 34 | +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x bfloat> [[VECINIT_I]], bfloat [[V]], i32 1 |
| 35 | +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x bfloat> [[VECINIT1_I]], bfloat [[V]], i32 2 |
| 36 | +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x bfloat> [[VECINIT2_I]], bfloat [[V]], i32 3 |
| 37 | +// CHECK-NEXT: ret <4 x bfloat> [[VECINIT3_I]] |
| 38 | +// |
| 39 | +// bfloat16x4_t test_vdup_n_bf16(bfloat16_t v) { |
| 40 | +// return vdup_n_bf16(v); |
| 41 | +// } |
| 42 | + |
| 43 | +// CHECK-LABEL: @test_vdupq_n_bf16( |
| 44 | +// CHECK-NEXT: entry: |
| 45 | +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x bfloat> poison, bfloat [[V:%.*]], i32 0 |
| 46 | +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x bfloat> [[VECINIT_I]], bfloat [[V]], i32 1 |
| 47 | +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x bfloat> [[VECINIT1_I]], bfloat [[V]], i32 2 |
| 48 | +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x bfloat> [[VECINIT2_I]], bfloat [[V]], i32 3 |
| 49 | +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x bfloat> [[VECINIT3_I]], bfloat [[V]], i32 4 |
| 50 | +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x bfloat> [[VECINIT4_I]], bfloat [[V]], i32 5 |
| 51 | +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x bfloat> [[VECINIT5_I]], bfloat [[V]], i32 6 |
| 52 | +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x bfloat> [[VECINIT6_I]], bfloat [[V]], i32 7 |
| 53 | +// CHECK-NEXT: ret <8 x bfloat> [[VECINIT7_I]] |
| 54 | +// |
| 55 | +// bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) { |
| 56 | +// return vdupq_n_bf16(v); |
| 57 | +// } |
| 58 | + |
| 59 | +// CHECK-LABEL: @test_vdup_lane_bf16( |
| 60 | +// CHECK-NEXT: entry: |
| 61 | +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8> |
| 62 | +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat> |
| 63 | +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1> |
| 64 | +// CHECK-NEXT: ret <4 x bfloat> [[LANE]] |
| 65 | +// |
| 66 | +// bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) { |
| 67 | +// return vdup_lane_bf16(v, 1); |
| 68 | +// } |
| 69 | + |
| 70 | +// CHECK-LABEL: @test_vdupq_lane_bf16( |
| 71 | +// CHECK-NEXT: entry: |
| 72 | +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8> |
| 73 | +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat> |
| 74 | +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> |
| 75 | +// CHECK-NEXT: ret <8 x bfloat> [[LANE]] |
| 76 | +// |
| 77 | +// bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) { |
| 78 | +// return vdupq_lane_bf16(v, 1); |
| 79 | +// } |
| 80 | + |
| 81 | +// CHECK-LABEL: @test_vdup_laneq_bf16( |
| 82 | +// CHECK-NEXT: entry: |
| 83 | +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8> |
| 84 | +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat> |
| 85 | +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7> |
| 86 | +// CHECK-NEXT: ret <4 x bfloat> [[LANE]] |
| 87 | +// |
| 88 | +// bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) { |
| 89 | +// return vdup_laneq_bf16(v, 7); |
| 90 | +// } |
| 91 | + |
| 92 | +// CHECK-LABEL: @test_vdupq_laneq_bf16( |
| 93 | +// CHECK-NEXT: entry: |
| 94 | +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8> |
| 95 | +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat> |
| 96 | +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> |
| 97 | +// CHECK-NEXT: ret <8 x bfloat> [[LANE]] |
| 98 | +// |
| 99 | +// bfloat16x8_t test_vdupq_laneq_bf16(bfloat16x8_t v) { |
| 100 | +// return vdupq_laneq_bf16(v, 7); |
| 101 | +// } |
| 102 | + |
| 103 | +// CHECK-LABEL: @test_vcombine_bf16( |
| 104 | +// CHECK-NEXT: entry: |
| 105 | +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[LOW:%.*]], <4 x bfloat> [[HIGH:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
| 106 | +// CHECK-NEXT: ret <8 x bfloat> [[SHUFFLE_I]] |
| 107 | +// |
| 108 | +// bfloat16x8_t test_vcombine_bf16(bfloat16x4_t low, bfloat16x4_t high) { |
| 109 | +// return vcombine_bf16(low, high); |
| 110 | +// } |
| 111 | + |
| 112 | +// CHECK-LABEL: @test_vget_high_bf16( |
| 113 | +// CHECK-NEXT: entry: |
| 114 | +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> |
| 115 | +// CHECK-NEXT: ret <4 x bfloat> [[SHUFFLE_I]] |
| 116 | +// |
| 117 | +// bfloat16x4_t test_vget_high_bf16(bfloat16x8_t a) { |
| 118 | +// return vget_high_bf16(a); |
| 119 | +// } |
| 120 | + |
| 121 | +// CHECK-LABEL: @test_vget_low_bf16( |
| 122 | +// CHECK-NEXT: entry: |
| 123 | +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| 124 | +// CHECK-NEXT: ret <4 x bfloat> [[SHUFFLE_I]] |
| 125 | +// |
| 126 | +// bfloat16x4_t test_vget_low_bf16(bfloat16x8_t a) { |
| 127 | +// return vget_low_bf16(a); |
| 128 | +// } |
| 129 | + |
| 130 | +bfloat16_t test_vget_lane_bf16(bfloat16x4_t v) { |
| 131 | + return vget_lane_bf16(v, 1); |
| 132 | + |
| 133 | + // CIR-LABEL: vget_lane_bf16 |
| 134 | + // CIR: [[TMP0:%.*]] = cir.const #cir.int<1> : !s32i |
| 135 | + // CIR: [[TMP1:%.*]] = cir.vec.extract {{.*}}[{{.*}} : !s32i] : !cir.vector<!cir.bf16 x 4> |
| 136 | + |
| 137 | + // LLVM-LABEL: test_vget_lane_bf16 |
| 138 | + // LLVM-SAME: (<4 x bfloat> [[VEC:%.*]]) |
| 139 | + // LLVM: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[VEC]], i32 1 |
| 140 | + // LLVM: ret bfloat [[VGET_LANE]] |
| 141 | +} |
| 142 | + |
| 143 | +// CHECK-LABEL: @test_vgetq_lane_bf16( |
| 144 | +// CHECK-NEXT: entry: |
| 145 | +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i32 7 |
| 146 | +// CHECK-NEXT: ret bfloat [[VGETQ_LANE]] |
| 147 | +// |
| 148 | +// bfloat16_t test_vgetq_lane_bf16(bfloat16x8_t v) { |
| 149 | +// return vgetq_lane_bf16(v, 7); |
| 150 | +// } |
| 151 | + |
| 152 | +// CHECK-LABEL: @test_vset_lane_bf16( |
| 153 | +// CHECK-NEXT: entry: |
| 154 | +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i32 1 |
| 155 | +// CHECK-NEXT: ret <4 x bfloat> [[VSET_LANE]] |
| 156 | +// |
| 157 | +// bfloat16x4_t test_vset_lane_bf16(bfloat16_t a, bfloat16x4_t v) { |
| 158 | +// return vset_lane_bf16(a, v, 1); |
| 159 | +// } |
| 160 | + |
| 161 | +// CHECK-LABEL: @test_vsetq_lane_bf16( |
| 162 | +// CHECK-NEXT: entry: |
| 163 | +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i32 7 |
| 164 | +// CHECK-NEXT: ret <8 x bfloat> [[VSET_LANE]] |
| 165 | +// |
| 166 | +// bfloat16x8_t test_vsetq_lane_bf16(bfloat16_t a, bfloat16x8_t v) { |
| 167 | +// return vsetq_lane_bf16(a, v, 7); |
| 168 | +// } |
| 169 | + |
| 170 | +bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) { |
| 171 | + return vduph_lane_bf16(v, 1); |
| 172 | + |
| 173 | + // CIR-LABEL: vduph_lane_bf16 |
| 174 | + // CIR: [[TMP0:%.*]] = cir.const #cir.int<1> : !s32i |
| 175 | + // CIR: [[TMP1:%.*]] = cir.vec.extract {{.*}}[{{.*}} : !s32i] : !cir.vector<!cir.bf16 x 4> |
| 176 | + |
| 177 | + // LLVM-LABEL: test_vduph_lane_bf16 |
| 178 | + // LLVM-SAME: (<4 x bfloat> [[VEC:%.*]]) |
| 179 | + // LLVM: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[VEC]], i32 1 |
| 180 | + // LLVM: ret bfloat [[VGET_LANE]] |
| 181 | +} |
0 commit comments