Skip to content

Commit 23182a7

Browse files
authored
128-bit vector casts & SSE instruction selection (#1585)
1 parent 1218f2e commit 23182a7

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+2173
-101
lines changed

backend/.ocamlformat-enable

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,7 @@ peephole/**/*.ml
1616
peephole/**/*.mli
1717
regalloc/**/*.ml
1818
regalloc/**/*.mli
19+
amd64/simd.ml
20+
arm64/simd.ml
21+
amd64/simd_selection.ml
22+
arm64/simd_selection.ml

backend/CSEgen.ml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -247,8 +247,8 @@ method class_of_operation op =
247247
| Iintop_atomic _ -> Op_store true
248248
| Icompf _
249249
| Icsel _
250-
| Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
251-
| Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue -> Op_pure
250+
| Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf | Iscalarcast _
251+
| Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue | Ivectorcast _ -> Op_pure
252252
| Ispecific _ -> Op_other
253253
| Iname_for_debugger _ -> Op_other
254254
| Iprobe_is_enabled _ -> Op_other

backend/amd64/CSE.ml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,18 @@ method! class_of_operation op =
3636
| Ibswap _ | Isqrtf -> super#class_of_operation op
3737
| Irdtsc | Irdpmc
3838
| Ilfence | Isfence | Imfence -> Op_other
39-
| Ifloat_iround | Ifloat_min | Ifloat_max | Ifloat_round _
40-
| Icrc32q -> Op_pure
39+
| Ifloat_iround | Ifloat_min | Ifloat_max | Ifloat_round _ -> Op_pure
40+
| Isimd op ->
41+
begin match Simd.class_of_operation op with
42+
| Pure -> Op_pure
43+
end
4144
| Ipause
4245
| Iprefetch _ -> Op_other
4346
end
4447
| Imove | Ispill | Ireload | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
4548
| Icompf _
4649
| Icsel _
47-
| Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue
50+
| Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue | Ivectorcast _ | Iscalarcast _
4851
| Iconst_int _ | Iconst_float _ | Iconst_symbol _ | Iconst_vec128 _
4952
| Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _ | Iextcall _
5053
| Istackoffset _ | Iload _ | Istore _ | Ialloc _

backend/amd64/arch.ml

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@ let prefetchwt1_support = ref false
3131
(* Emit elf notes with trap handling information. *)
3232
let trap_notes = ref true
3333

34+
(* Baseline x86_64 requires SSE and SSE2. The others are optional. *)
35+
let sse3_support = ref true
36+
let ssse3_support = ref true
37+
let sse41_support = ref true
38+
let sse42_support = ref true
39+
3440
(* Enable SIMD register allocation features. *)
3541
let simd_regalloc = ref false
3642

@@ -45,10 +51,6 @@ let command_line_options =
4551
" Use POPCNT instruction (not available prior to Nehalem) (default)";
4652
"-fno-popcnt", Arg.Clear popcnt_support,
4753
" Do not use POPCNT instruction";
48-
"-fcrc32", Arg.Set crc32_support,
49-
" Use CRC32 instructions (requires SSE4.2 support) (default)";
50-
"-fno-crc32", Arg.Clear crc32_support,
51-
" Do not emit CRC32 instructions";
5254
"-fprefetchw", Arg.Set prefetchw_support,
5355
" Use PREFETCHW instructions (not available on Haswell and earlier) \
5456
(default)";
@@ -62,6 +64,22 @@ let command_line_options =
6264
" Emit .note.ocaml_eh section with trap handling information (default)";
6365
"-fno-trap-notes", Arg.Clear trap_notes,
6466
" Do not emit .note.ocaml_eh section with trap handling information";
67+
"-fsse3", Arg.Set sse3_support,
68+
" Enable SSE3 intrinsics (default)";
69+
"-fno-sse3", Arg.Clear sse3_support,
70+
" Disable SSE3 intrinsics";
71+
"-fssse3", Arg.Set ssse3_support,
72+
" Enable SSSE3 intrinsics (default)";
73+
"-fno-ssse3", Arg.Clear ssse3_support,
74+
" Disable SSSE3 intrinsics";
75+
"-fsse41", Arg.Set sse41_support,
76+
" Enable SSE4.1 intrinsics (default)";
77+
"-fno-sse41", Arg.Clear sse41_support,
78+
" Disable SSE4.1 intrinsics";
79+
"-fsse42", Arg.Set sse42_support,
80+
" Enable SSE4.2 intrinsics (default)";
81+
"-fno-sse42", Arg.Clear sse42_support,
82+
" Disable SSE4.2 intrinsics";
6583
"-fsimd-regalloc", Arg.Set simd_regalloc,
6684
" Enable SIMD register allocation (implied by -extension SIMD)";
6785
"-fno-simd-regalloc", Arg.Clear simd_regalloc,
@@ -118,8 +136,8 @@ type specific_operation =
118136
| Ilfence (* load fence *)
119137
| Isfence (* store fence *)
120138
| Imfence (* memory fence *)
121-
| Icrc32q (* compute crc *)
122139
| Ipause (* hint for spin-wait loops *)
140+
| Isimd of Simd.operation (* vectorized operations *)
123141
| Iprefetch of (* memory prefetching hint *)
124142
{ is_write: bool;
125143
locality: prefetch_temporal_locality_hint;
@@ -248,8 +266,8 @@ let print_specific_operation printreg op ppf arg =
248266
fprintf ppf "mfence"
249267
| Irdpmc ->
250268
fprintf ppf "rdpmc %a" printreg arg.(0)
251-
| Icrc32q ->
252-
fprintf ppf "crc32 %a %a" printreg arg.(0) printreg arg.(1)
269+
| Isimd simd ->
270+
Simd.print_operation printreg simd ppf arg
253271
| Ipause ->
254272
fprintf ppf "pause"
255273
| Iprefetch { is_write; locality; } ->
@@ -269,19 +287,19 @@ let operation_is_pure = function
269287
| Ilea _ | Ibswap _ | Isqrtf | Isextend32 | Izextend32 -> true
270288
| Ifloatarithmem _ | Ifloatsqrtf _ -> true
271289
| Ifloat_iround | Ifloat_round _ | Ifloat_min | Ifloat_max -> true
272-
| Icrc32q -> true
273290
| Irdtsc | Irdpmc | Ipause
274291
| Ilfence | Isfence | Imfence
275292
| Istore_int (_, _, _) | Ioffset_loc (_, _)
276293
| Iprefetch _ -> false
294+
| Isimd op -> Simd.is_pure op
277295

278296
(* Specific operations that can raise *)
279297

280298
let operation_can_raise = function
281299
| Ilea _ | Ibswap _ | Isqrtf | Isextend32 | Izextend32
282300
| Ifloatarithmem _ | Ifloatsqrtf _
283301
| Ifloat_iround | Ifloat_round _ | Ifloat_min | Ifloat_max
284-
| Icrc32q | Irdtsc | Irdpmc | Ipause
302+
| Irdtsc | Irdpmc | Ipause | Isimd _
285303
| Ilfence | Isfence | Imfence
286304
| Istore_int (_, _, _) | Ioffset_loc (_, _)
287305
| Iprefetch _ -> false
@@ -290,7 +308,7 @@ let operation_allocates = function
290308
| Ilea _ | Ibswap _ | Isqrtf | Isextend32 | Izextend32
291309
| Ifloatarithmem _ | Ifloatsqrtf _
292310
| Ifloat_iround | Ifloat_round _ | Ifloat_min | Ifloat_max
293-
| Icrc32q | Irdtsc | Irdpmc | Ipause
311+
| Irdtsc | Irdpmc | Ipause | Isimd _
294312
| Ilfence | Isfence | Imfence
295313
| Istore_int (_, _, _) | Ioffset_loc (_, _)
296314
| Iprefetch _ -> false
@@ -383,8 +401,6 @@ let equal_specific_operation left right =
383401
true
384402
| Imfence, Imfence ->
385403
true
386-
| Icrc32q, Icrc32q ->
387-
true
388404
| Ifloat_iround, Ifloat_iround -> true
389405
| Ifloat_round x, Ifloat_round y -> equal_rounding_mode x y
390406
| Ifloat_min, Ifloat_min -> true
@@ -395,8 +411,10 @@ let equal_specific_operation left right =
395411
Bool.equal left_is_write right_is_write
396412
&& equal_prefetch_temporal_locality_hint left_locality right_locality
397413
&& equal_addressing_mode left_addr right_addr
414+
| Isimd l, Isimd r ->
415+
Simd.equal_operation l r
398416
| (Ilea _ | Istore_int _ | Ioffset_loc _ | Ifloatarithmem _ | Ibswap _
399417
| Isqrtf | Ifloatsqrtf _ | Isextend32 | Izextend32 | Irdtsc | Irdpmc
400418
| Ilfence | Isfence | Imfence | Ifloat_iround | Ifloat_round _ |
401-
Ifloat_min | Ifloat_max | Ipause | Icrc32q | Iprefetch _), _ ->
419+
Ifloat_min | Ifloat_max | Ipause | Isimd _ | Iprefetch _), _ ->
402420
false

backend/amd64/emit.mlp

Lines changed: 68 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -698,8 +698,8 @@ let add_vec128_constant bits =
698698
lbl
699699

700700
let emit_vec128_constant {high; low} lbl =
701-
_label (emit_label lbl);
702701
(* SIMD vectors respect little-endian byte order *)
702+
_label (emit_label lbl);
703703
D.qword (Const low);
704704
D.qword (Const high)
705705

@@ -908,6 +908,38 @@ let emit_atomic instr op (size : Cmm.atomic_bitwidth) addr =
908908
I.set E res8;
909909
I.movzx res8 res
910910

911+
let emit_simd_instr op i =
912+
(match Simd_selection.register_behavior op with
913+
| RM_to_R ->
914+
assert (Reg.is_reg i.res.(0))
915+
| R_to_R ->
916+
assert (Reg.is_reg i.arg.(0) && Reg.is_reg i.res.(0))
917+
| R_RM_to_fst ->
918+
assert (arg i 0 = res i 0);
919+
assert (Reg.is_reg i.arg.(0))
920+
| R_R_to_fst ->
921+
assert (arg i 0 = res i 0);
922+
assert (Reg.is_reg i.arg.(0) && Reg.is_reg i.arg.(1)));
923+
match (op : Simd.operation) with
924+
| SSE (Cmp_f32 n) -> I.cmpps n (arg i 1) (res i 0)
925+
| SSE Add_f32 -> I.addps (arg i 1) (res i 0)
926+
| SSE Sub_f32 -> I.subps (arg i 1) (res i 0)
927+
| SSE Mul_f32 -> I.mulps (arg i 1) (res i 0)
928+
| SSE Div_f32 -> I.divps (arg i 1) (res i 0)
929+
| SSE Max_f32 -> I.maxps (arg i 1) (res i 0)
930+
| SSE Min_f32 -> I.minps (arg i 1) (res i 0)
931+
| SSE Rcp_f32 -> I.rcpps (arg i 0) (res i 0)
932+
| SSE Sqrt_f32 -> I.sqrtps (arg i 0) (res i 0)
933+
| SSE Rsqrt_f32 -> I.rsqrtps (arg i 0) (res i 0)
934+
| SSE High_64_to_low_64 -> I.movhlps (arg i 1) (res i 0)
935+
| SSE Low_64_to_high_64 -> I.movlhps (arg i 1) (res i 0)
936+
| SSE Interleave_high_32 -> I.unpckhps (arg i 1) (res i 0)
937+
| SSE Interleave_low_32 -> I.unpcklps (arg i 1) (res i 0)
938+
| SSE Movemask_32 -> I.movmskps (arg i 0) (res i 0)
939+
| SSE (Shuffle_32 n) -> I.shufps n (arg i 1) (res i 0)
940+
| SSE42 Crc32_64 -> I.crc32 (arg i 1) (res i 0)
941+
| _ -> .
942+
911943
(* Emit an instruction *)
912944
let emit_instr fallthrough i =
913945
emit_debug_info_linear i;
@@ -1182,8 +1214,40 @@ let emit_instr fallthrough i =
11821214
I.cvtsi2sd (arg i 0) (res i 0)
11831215
| Lop(Iintoffloat) ->
11841216
I.cvttsd2si (arg i 0) (res i 0)
1185-
| Lop(Iintofvalue | Ivalueofint) ->
1217+
| Lop(Iintofvalue | Ivalueofint | Ivectorcast Bits128) ->
11861218
move i.arg.(0) i.res.(0)
1219+
| Lop(Iscalarcast (V128_of_scalar Float64x2 | V128_to_scalar Float64x2)) ->
1220+
I.movsd (arg i 0) (res i 0)
1221+
| Lop(Iscalarcast (V128_to_scalar Int64x2 | V128_of_scalar Int64x2)) ->
1222+
I.movq (arg i 0) (res i 0)
1223+
| Lop(Iscalarcast (V128_to_scalar Int32x4)) ->
1224+
I.movd (arg i 0) (res32 i 0)
1225+
| Lop(Iscalarcast (V128_of_scalar Int32x4)) ->
1226+
I.movd (arg32 i 0) (res i 0)
1227+
| Lop(Iscalarcast (V128_of_scalar Float32x4)) ->
1228+
(* CR mslater: (SIMD) remove cvt once we have unboxed float32 *)
1229+
I.cvtsd2ss (arg i 0) (res i 0)
1230+
| Lop(Iscalarcast (V128_to_scalar Float32x4)) ->
1231+
(* CR mslater: (SIMD) remove cvt once we have unboxed float32 *)
1232+
I.cvtss2sd (arg i 0) (res i 0)
1233+
| Lop(Iscalarcast (V128_to_scalar Int16x8)) ->
1234+
(* [movw] and [movzx] cannot operate on vector registers.
1235+
We must zero extend as the result is an untagged positive int.
1236+
CR mslater: (SIMD) remove zx once we have unboxed int16 *)
1237+
I.movd (arg i 0) (res32 i 0);
1238+
I.movzx (res16 i 0) (res i 0)
1239+
| Lop(Iscalarcast (V128_to_scalar Int8x16)) ->
1240+
(* [movb] and [movzx] cannot operate on vector registers.
1241+
We must zero extend as the result is an untagged positive int.
1242+
CR mslater: (SIMD) remove zx once we have unboxed int8 *)
1243+
I.movd (arg i 0) (res32 i 0);
1244+
I.movzx (res8 i 0) (res i 0)
1245+
| Lop(Iscalarcast (V128_of_scalar Int16x8 | V128_of_scalar Int8x16)) ->
1246+
(* [movw] and [movb] cannot operate on vector registers.
1247+
Moving 32 bits is OK because the argument is an untagged
1248+
positive int and these operations leave the top bits of the vector unspecified.
1249+
CR mslater: (SIMD) don't load 32 bits once we have unboxed int16/int8 *)
1250+
I.movd (arg32 i 0) (res i 0)
11871251
| Lop(Iopaque) ->
11881252
assert (i.arg.(0).loc = i.res.(0).loc)
11891253
| Lop(Ispecific(Ilea addr)) ->
@@ -1297,9 +1361,8 @@ let emit_instr fallthrough i =
12971361
I.sfence ()
12981362
| Lop (Ispecific Imfence) ->
12991363
I.mfence ()
1300-
| Lop (Ispecific Icrc32q) ->
1301-
assert (arg i 0 = res i 0);
1302-
I.crc32 (arg i 1) (res i 0)
1364+
| Lop (Ispecific (Isimd op)) ->
1365+
emit_simd_instr op i
13031366
| Lop (Ispecific Ipause) ->
13041367
I.pause ()
13051368
| Lop (Ispecific (Iprefetch { is_write; locality; addr; })) ->

backend/amd64/proc.ml

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -427,11 +427,12 @@ let destroyed_at_oper = function
427427
| Ireturn traps when has_pushtrap traps -> assert false
428428
| Iop(Ispecific (Irdtsc | Irdpmc)) -> [| rax; rdx |]
429429
| Iop(Ispecific(Ilfence | Isfence | Imfence)) -> [||]
430-
| Iop(Ispecific(Isqrtf | Isextend32 | Izextend32 | Icrc32q | Ilea _
430+
| Iop(Ispecific(Isqrtf | Isextend32 | Izextend32 | Ilea _
431431
| Istore_int (_, _, _) | Ioffset_loc (_, _)
432432
| Ipause
433433
| Iprefetch _
434434
| Ifloat_round _
435+
| Isimd _
435436
| Ifloat_iround | Ifloat_min | Ifloat_max
436437
| Ifloatarithmem (_, _) | Ibswap _ | Ifloatsqrtf _))
437438
| Iop(Iintop(Iadd | Isub | Imul | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr
@@ -448,6 +449,7 @@ let destroyed_at_oper = function
448449
| Icsel _
449450
| Ifloatofint | Iintoffloat
450451
| Ivalueofint | Iintofvalue
452+
| Ivectorcast _ | Iscalarcast _
451453
| Iconst_int _ | Iconst_float _ | Iconst_symbol _ | Iconst_vec128 _
452454
| Itailcall_ind | Itailcall_imm _ | Istackoffset _ | Iload (_, _, _)
453455
| Iname_for_debugger _ | Iprobe _| Iprobe_is_enabled _ | Iopaque)
@@ -499,15 +501,17 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
499501
| Csel _
500502
| Floatofint | Intoffloat
501503
| Valueofint | Intofvalue
504+
| Vectorcast _
505+
| Scalarcast _
502506
| Probe_is_enabled _
503507
| Opaque
504508
| Begin_region
505509
| End_region
506510
| Specific (Ilea _ | Istore_int _ | Ioffset_loc _
507511
| Ifloatarithmem _ | Ibswap _ | Isqrtf
508-
| Ifloatsqrtf _ | Ifloat_iround
512+
| Ifloatsqrtf _ | Ifloat_iround | Isimd _
509513
| Ifloat_round _ | Ifloat_min | Ifloat_max
510-
| Isextend32 | Izextend32 | Icrc32q | Ipause
514+
| Isextend32 | Izextend32 | Ipause
511515
| Iprefetch _ | Ilfence | Isfence | Imfence)
512516
| Name_for_debugger _)
513517
| Poptrap | Prologue ->
@@ -535,7 +539,7 @@ let destroyed_at_terminator (terminator : Cfg_intf.S.terminator) =
535539
| Specific_can_raise { op = (Ilea _ | Ibswap _ | Isqrtf | Isextend32 | Izextend32
536540
| Ifloatarithmem _ | Ifloatsqrtf _
537541
| Ifloat_iround | Ifloat_round _ | Ifloat_min | Ifloat_max
538-
| Icrc32q | Irdtsc | Irdpmc | Ipause
542+
| Irdtsc | Irdpmc | Ipause | Isimd _
539543
| Ilfence | Isfence | Imfence
540544
| Istore_int (_, _, _) | Ioffset_loc (_, _)
541545
| Iprefetch _); _ } ->
@@ -567,7 +571,7 @@ let is_destruction_point (terminator : Cfg_intf.S.terminator) =
567571
| Specific_can_raise { op = (Ilea _ | Ibswap _ | Isqrtf | Isextend32 | Izextend32
568572
| Ifloatarithmem _ | Ifloatsqrtf _
569573
| Ifloat_iround | Ifloat_round _ | Ifloat_min | Ifloat_max
570-
| Icrc32q | Irdtsc | Irdpmc | Ipause
574+
| Irdtsc | Irdpmc | Ipause | Isimd _
571575
| Ilfence | Isfence | Imfence
572576
| Istore_int (_, _, _) | Ioffset_loc (_, _)
573577
| Iprefetch _); _ } ->
@@ -581,8 +585,8 @@ let safe_register_pressure = function
581585
Iextcall _ -> if win64 then if fp then 7 else 8 else 0
582586
| Ialloc _ | Ipoll _ | Imove | Ispill | Ireload
583587
| Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
584-
| Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue
585-
| Icompf _
588+
| Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue | Ivectorcast _
589+
| Icompf _ | Iscalarcast _
586590
| Icsel _
587591
| Iconst_int _ | Iconst_float _ | Iconst_symbol _ | Iconst_vec128 _
588592
| Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _
@@ -622,14 +626,14 @@ let max_register_pressure =
622626
_, _)
623627
| Imove | Ispill | Ireload | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
624628
| Icsel _
625-
| Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue
629+
| Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue | Ivectorcast _ | Iscalarcast _
626630
| Iconst_int _ | Iconst_float _ | Iconst_symbol _ | Iconst_vec128 _
627631
| Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _
628632
| Istackoffset _ | Iload (_, _, _)
629633
| Ispecific(Ilea _ | Isextend32 | Izextend32 | Iprefetch _ | Ipause
630-
| Irdtsc | Irdpmc | Icrc32q | Istore_int (_, _, _)
634+
| Irdtsc | Irdpmc | Istore_int (_, _, _)
631635
| Ilfence | Isfence | Imfence
632-
| Ifloat_round _
636+
| Ifloat_round _ | Isimd _
633637
| Ifloat_iround | Ifloat_min | Ifloat_max
634638
| Ioffset_loc (_, _) | Ifloatarithmem (_, _)
635639
| Ibswap _ | Ifloatsqrtf _ | Isqrtf)
@@ -725,6 +729,7 @@ let operation_supported = function
725729
| Ccmpf _
726730
| Craise _
727731
| Ccheckbound
732+
| Cvectorcast _ | Cscalarcast _
728733
| Cprobe _ | Cprobe_is_enabled _ | Copaque | Cbeginregion | Cendregion
729734
-> true
730735

0 commit comments

Comments
 (0)