Skip to content

Commit 952f8f1

Browse files
Mike KleinSkia Commit-Bot
Mike Klein
authored and
Skia Commit-Bot
committed
Reland "update skvx scalar-fallback strategy"
This is a reland of 4985db4 ...with a better implementation of map(). I don't understand why we had to revert, but it had something with calling the function pointer in map_(), so maybe this will help. I've flattened the map_() / map() merge CL into this one, and marked the resulting map() as no_sanitize("cfi"). I don't see anything wrong, so I think it's a false positive. Original change's description: > update skvx scalar-fallback strategy > > Turns out Clang's a lot better at auto-vectorizing "obvious" scalar code > into obvious vector code when it's written out the long way, e.g. > > F32x4 x = ...; > x = { sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]) }; > > vectorizes into sqrtps a lot more reliably than our recurse-onto-scalars > strategy, and also better than the other naive approach, > > F32x4 x = ...; > for (int i = 0; i < 4; i++) { x[i] = sqrtf(x[i]); } > > So here I've added a map(V, fn) -> V' using C++14 tricks to let the > compiler handle the expansion of x = { fn(x[0]), fn(x[1]), ... > fn(x[N-1]) } for any N, and implemented most skvx scalar fallback code > using that. > > With these now vectorizing well at any N, we can remove any > specializations we'd written for particular N, really tidying up. > > Over in the SkVM interpreter, this is a big improvement for ceil and > floor, which were being done 2 floats at a time instead of 8. They're > now slimmed way down to > > shlq $6, %r13 > vroundps $K, (%r12,%r13), %ymm0 > vroundps $K, 32(%r12,%r13), %ymm1 > jmp ... > > where K is 9 or 10 depending on the op. > > I haven't found a scalar function that Clang will vectorize to vcvtps2pd > (the rounding one, not truncating vcvttps2pd), so I've kept lrint() > written the long way, updated to the style I've been using lately with > specializations inline. > > Change-Id: Ia97abe3c876008228bf62b1daacd6f6140408fc4 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/317375 > Reviewed-by: Herb Derby <[email protected]> > Commit-Queue: Mike Klein <[email protected]> Cq-Include-Trybots: luci.chromium.try:linux_chromium_cfi_rel_ng Bug: chromium:1129408 Change-Id: Ia9c14074b9a14a67dd221f4925894d35a551f9d7 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/317551 Commit-Queue: Mike Klein <[email protected]> Reviewed-by: Herb Derby <[email protected]>
1 parent 3ed22a9 commit 952f8f1

File tree

1 file changed

+49
-78
lines changed

1 file changed

+49
-78
lines changed

include/private/SkVx.h

Lines changed: 49 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -329,22 +329,6 @@ SIT bool all(const Vec<1,T>& x) { return x.val != 0; }
329329

330330
SIT Vec<1,T> pow(const Vec<1,T>& x, const Vec<1,T>& y) { return std::pow(x.val, y.val); }
331331

332-
SIT Vec<1,T> atan(const Vec<1,T>& x) { return std:: atan(x.val); }
333-
SIT Vec<1,T> ceil(const Vec<1,T>& x) { return std:: ceil(x.val); }
334-
SIT Vec<1,T> floor(const Vec<1,T>& x) { return std::floor(x.val); }
335-
SIT Vec<1,T> trunc(const Vec<1,T>& x) { return std::trunc(x.val); }
336-
SIT Vec<1,T> round(const Vec<1,T>& x) { return std::round(x.val); }
337-
SIT Vec<1,T> sqrt(const Vec<1,T>& x) { return std:: sqrt(x.val); }
338-
SIT Vec<1,T> abs(const Vec<1,T>& x) { return std:: abs(x.val); }
339-
SIT Vec<1,T> sin(const Vec<1,T>& x) { return std:: sin(x.val); }
340-
SIT Vec<1,T> cos(const Vec<1,T>& x) { return std:: cos(x.val); }
341-
SIT Vec<1,T> tan(const Vec<1,T>& x) { return std:: tan(x.val); }
342-
343-
SIT Vec<1,int> lrint(const Vec<1,T>& x) { return (int)std::lrint(x.val); }
344-
345-
SIT Vec<1,T> rcp(const Vec<1,T>& x) { return 1 / x.val; }
346-
SIT Vec<1,T> rsqrt(const Vec<1,T>& x) { return rcp(sqrt(x)); }
347-
348332
// All default N != 1 implementations just recurse on lo and hi halves.
349333

350334
// Clang can reason about naive_if_then_else() and optimize through it better
@@ -395,23 +379,6 @@ SINT Vec<N,T> pow(const Vec<N,T>& x, const Vec<N,T>& y) {
395379
return join(pow(x.lo, y.lo), pow(x.hi, y.hi));
396380
}
397381

398-
SINT Vec<N,T> atan(const Vec<N,T>& x) { return join( atan(x.lo), atan(x.hi)); }
399-
SINT Vec<N,T> ceil(const Vec<N,T>& x) { return join( ceil(x.lo), ceil(x.hi)); }
400-
SINT Vec<N,T> floor(const Vec<N,T>& x) { return join(floor(x.lo), floor(x.hi)); }
401-
SINT Vec<N,T> trunc(const Vec<N,T>& x) { return join(trunc(x.lo), trunc(x.hi)); }
402-
SINT Vec<N,T> round(const Vec<N,T>& x) { return join(round(x.lo), round(x.hi)); }
403-
SINT Vec<N,T> sqrt(const Vec<N,T>& x) { return join( sqrt(x.lo), sqrt(x.hi)); }
404-
SINT Vec<N,T> abs(const Vec<N,T>& x) { return join( abs(x.lo), abs(x.hi)); }
405-
SINT Vec<N,T> sin(const Vec<N,T>& x) { return join( sin(x.lo), sin(x.hi)); }
406-
SINT Vec<N,T> cos(const Vec<N,T>& x) { return join( cos(x.lo), cos(x.hi)); }
407-
SINT Vec<N,T> tan(const Vec<N,T>& x) { return join( tan(x.lo), tan(x.hi)); }
408-
409-
SINT Vec<N,int> lrint(const Vec<N,T>& x) { return join(lrint(x.lo), lrint(x.hi)); }
410-
411-
SINT Vec<N,T> rcp(const Vec<N,T>& x) { return join( rcp(x.lo), rcp(x.hi)); }
412-
SINT Vec<N,T> rsqrt(const Vec<N,T>& x) { return join(rsqrt(x.lo), rsqrt(x.hi)); }
413-
414-
415382
// Scalar/vector operations just splat the scalar to a vector...
416383
SINTU Vec<N,T> operator+ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) + y; }
417384
SINTU Vec<N,T> operator- (U x, const Vec<N,T>& y) { return Vec<N,T>(x) - y; }
@@ -519,10 +486,57 @@ SIN Vec<N,float> fma(const Vec<N,float>& x, const Vec<N,float>& y, const Vec<N,f
519486
fma(x.hi, y.hi, z.hi));
520487
}
521488

522-
SIN Vec<N,float> fract(const Vec<N,float>& x) {
523-
return x - floor(x);
489+
template <int N, typename T, typename Fn, std::size_t... I>
490+
#if defined(__clang__)
491+
// CFI, specifically -fsanitize=cfi-icall, seems to give a false positive here,
492+
// with errors like "control flow integrity check for type 'float (float)
493+
// noexcept' failed during indirect function call... note: sqrtf.cfi_jt defined
494+
// here". But we can be quite sure fn is the right type: it's all inferred!
495+
// So, stifle CFI in this function.
496+
__attribute__((no_sanitize("cfi")))
497+
#endif
498+
SI auto map(const skvx::Vec<N,T>& x, Fn&& fn,
499+
std::index_sequence<I...> ix = {}) -> skvx::Vec<N, decltype(fn(x[0]))> {
500+
if /*constexpr*/ (sizeof...(I) == 0) {
501+
// When called as map(x, fn), bootstrap the index_sequence we want: 0,1,...,N-1.
502+
return map(x, fn, std::make_index_sequence<N>{});
503+
}
504+
return { fn(x[I])... };
505+
}
506+
507+
SIN Vec<N,float> atan(const Vec<N,float>& x) { return map(x, atanf); }
508+
SIN Vec<N,float> ceil(const Vec<N,float>& x) { return map(x, ceilf); }
509+
SIN Vec<N,float> floor(const Vec<N,float>& x) { return map(x, floorf); }
510+
SIN Vec<N,float> trunc(const Vec<N,float>& x) { return map(x, truncf); }
511+
SIN Vec<N,float> round(const Vec<N,float>& x) { return map(x, roundf); }
512+
SIN Vec<N,float> sqrt(const Vec<N,float>& x) { return map(x, sqrtf); }
513+
SIN Vec<N,float> abs(const Vec<N,float>& x) { return map(x, fabsf); }
514+
SIN Vec<N,float> sin(const Vec<N,float>& x) { return map(x, sinf); }
515+
SIN Vec<N,float> cos(const Vec<N,float>& x) { return map(x, cosf); }
516+
SIN Vec<N,float> tan(const Vec<N,float>& x) { return map(x, tanf); }
517+
518+
SI Vec<1,int> lrint(const Vec<1,float>& x) {
519+
return (int)lrintf(x.val);
520+
}
521+
SIN Vec<N,int> lrint(const Vec<N,float>& x) {
522+
#if defined(__AVX__)
523+
if /*constexpr*/ (N == 8) {
524+
return unchecked_bit_pun<Vec<N,int>>(_mm256_cvtps_epi32(unchecked_bit_pun<__m256>(x)));
525+
}
526+
#endif
527+
#if defined(__SSE__)
528+
if /*constexpr*/ (N == 4) {
529+
return unchecked_bit_pun<Vec<N,int>>(_mm_cvtps_epi32(unchecked_bit_pun<__m128>(x)));
530+
}
531+
#endif
532+
return join(lrint(x.lo),
533+
lrint(x.hi));
524534
}
525535

536+
SIN Vec<N,float> rcp(const Vec<N,float>& x) { return 1/x; }
537+
SIN Vec<N,float> rsqrt(const Vec<N,float>& x) { return rcp(sqrt(x)); }
538+
SIN Vec<N,float> fract(const Vec<N,float>& x) { return x - floor(x); }
539+
526540
// The default cases for to_half/from_half are borrowed from skcms,
527541
// and assume inputs are finite and treat/flush denorm half floats as/to zero.
528542
// Key constants to watch for:
@@ -638,46 +652,28 @@ SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>& y
638652
// Platform-specific specializations and overloads can now drop in here.
639653

640654
#if defined(__AVX__)
641-
SI Vec<8,float> sqrt(const Vec<8,float>& x) {
642-
return bit_pun<Vec<8,float>>(_mm256_sqrt_ps(bit_pun<__m256>(x)));
643-
}
644655
SI Vec<8,float> rsqrt(const Vec<8,float>& x) {
645656
return bit_pun<Vec<8,float>>(_mm256_rsqrt_ps(bit_pun<__m256>(x)));
646657
}
647658
SI Vec<8,float> rcp(const Vec<8,float>& x) {
648659
return bit_pun<Vec<8,float>>(_mm256_rcp_ps(bit_pun<__m256>(x)));
649660
}
650-
SI Vec<8,int> lrint(const Vec<8,float>& x) {
651-
return bit_pun<Vec<8,int>>(_mm256_cvtps_epi32(bit_pun<__m256>(x)));
652-
}
653661
#endif
654662

655663
#if defined(__SSE__)
656-
SI Vec<4,float> sqrt(const Vec<4,float>& x) {
657-
return bit_pun<Vec<4,float>>(_mm_sqrt_ps(bit_pun<__m128>(x)));
658-
}
659664
SI Vec<4,float> rsqrt(const Vec<4,float>& x) {
660665
return bit_pun<Vec<4,float>>(_mm_rsqrt_ps(bit_pun<__m128>(x)));
661666
}
662667
SI Vec<4,float> rcp(const Vec<4,float>& x) {
663668
return bit_pun<Vec<4,float>>(_mm_rcp_ps(bit_pun<__m128>(x)));
664669
}
665-
SI Vec<4,int> lrint(const Vec<4,float>& x) {
666-
return bit_pun<Vec<4,int>>(_mm_cvtps_epi32(bit_pun<__m128>(x)));
667-
}
668670

669-
SI Vec<2,float> sqrt(const Vec<2,float>& x) {
670-
return shuffle<0,1>( sqrt(shuffle<0,1,0,1>(x)));
671-
}
672671
SI Vec<2,float> rsqrt(const Vec<2,float>& x) {
673672
return shuffle<0,1>(rsqrt(shuffle<0,1,0,1>(x)));
674673
}
675674
SI Vec<2,float> rcp(const Vec<2,float>& x) {
676675
return shuffle<0,1>( rcp(shuffle<0,1,0,1>(x)));
677676
}
678-
SI Vec<2,int> lrint(const Vec<2,float>& x) {
679-
return shuffle<0,1>(lrint(shuffle<0,1,0,1>(x)));
680-
}
681677
#endif
682678

683679
#if defined(__AVX2__)
@@ -701,36 +697,11 @@ SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>& y
701697
}
702698
#endif
703699

704-
// WASM SIMD compatible operations which are not automatically compiled to SIMD commands
705-
// by emscripten:
706700
#if defined __wasm_simd128__
707-
SI Vec<4, float> rcp (const Vec<4, float>& x) { return 1.0f / x; }
708-
SI Vec<2,double> rcp (const Vec<2,double>& x) { return 1.0f / x; }
709-
SI Vec<4, float> rsqrt(const Vec<4, float>& x) { return 1.0f / sqrt(x); }
710-
SI Vec<2,double> rsqrt(const Vec<2,double>& x) { return 1.0f / sqrt(x); }
711-
712-
SI Vec<4,float> sqrt(const Vec<4,float>& x) {
713-
return to_vec<4,float>(wasm_f32x4_sqrt(to_vext(x)));
714-
}
715-
SI Vec<4,float> abs(const Vec<4,float>& x) {
716-
return to_vec<4,float>(wasm_f32x4_abs(to_vext(x)));
717-
}
718-
719-
SI Vec<2,double> sqrt(const Vec<2,double>& x) {
720-
return to_vec<2,double>(wasm_f64x2_sqrt(to_vext(x)));
721-
}
722-
SI Vec<2,double> abs(const Vec<2,double>& x) {
723-
return to_vec<2,double>(wasm_f64x2_abs(to_vext(x)));
724-
}
725-
726701
SI bool any(const Vec<4, int32_t>& x) { return wasm_i32x4_any_true(to_vext(x)); }
727702
SI bool any(const Vec<4,uint32_t>& x) { return wasm_i32x4_any_true(to_vext(x)); }
728703
SI bool all(const Vec<4, int32_t>& x) { return wasm_i32x4_all_true(to_vext(x)); }
729704
SI bool all(const Vec<4,uint32_t>& x) { return wasm_i32x4_all_true(to_vext(x)); }
730-
731-
SI Vec<4,int32_t> abs(const Vec<4,int32_t>& x) {
732-
return to_vec<4,int32_t>(wasm_i32x4_abs(to_vext(x)));
733-
}
734705
#endif
735706

736707
#endif // !defined(SKNX_NO_SIMD)

0 commit comments

Comments
 (0)