[RISC-V] Remove round-trip to memory when using compressstore #113242

Closed
Validark opened this issue Oct 22, 2024 · 22 comments · Fixed by #113291

@Validark

Validark commented Oct 22, 2024

This Zig code:

fn compressstore(vec: @Vector(64, u8), ptr: *@Vector(64, u8), bitstr: u64) void {
    return struct {
        extern fn @"llvm.masked.compressstore.v64i8"(@Vector(64, u8), *@Vector(64, u8), @Vector(64, u1)) callconv(.Unspecified) void;
    }.@"llvm.masked.compressstore.v64i8"(vec, ptr, @bitCast(bitstr));
}

export fn compress(vec: @Vector(64, u8), bitstr: u64, vec2: @Vector(64, u8)) @Vector(64, u8) {
    var buffer: [64]u8 align(64) = undefined;
    compressstore(vec, &buffer, bitstr);
    return buffer -% vec2;
}

Gives us this optimized LLVM code:

define dso_local void @compress(ptr noalias nocapture nonnull writeonly sret(<64 x i8>) %0, ptr nocapture noundef readonly %1, i64 %2, ptr nocapture noundef readonly %3) local_unnamed_addr {
Entry:
  %4 = alloca [64 x i8], align 64
  %5 = load <64 x i8>, ptr %1, align 64
  %6 = load <64 x i8>, ptr %3, align 64
    #dbg_value(<64 x i8> %5, !138, !DIExpression(), !139)
    #dbg_value(i64 %2, !140, !DIExpression(), !139)
    #dbg_value(<64 x i8> %6, !141, !DIExpression(), !139)
    #dbg_declare(ptr %4, !142, !DIExpression(), !144)
    #dbg_value(<64 x i8> %5, !145, !DIExpression(), !149)
    #dbg_value(ptr %4, !151, !DIExpression(), !149)
    #dbg_value(i64 %2, !152, !DIExpression(), !149)
  %7 = bitcast i64 %2 to <64 x i1>
  call fastcc void @llvm.masked.compressstore.v64i8(<64 x i8> %5, ptr nonnull align 64 %4, <64 x i1> %7)
  %8 = load <64 x i8>, ptr %4, align 64
  %9 = sub <64 x i8> %8, %6
  store <64 x i8> %9, ptr %0, align 64
  ret void
}

declare fastcc void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr nocapture, <64 x i1>) #1

Which gets emitted like so for spacemit_x60:

compress:
        addi    sp, sp, -128
        sd      ra, 120(sp)
        sd      s0, 112(sp)
        addi    s0, sp, 128
        andi    sp, sp, -64
        li      a4, 64
        vsetvli zero, a4, e8, m2, ta, ma
        vle8.v  v8, (a1)
        vle8.v  v10, (a3)
        vsetivli        zero, 1, e64, m1, ta, ma
        vmv.s.x v12, a2
        vsetvli zero, a4, e8, m2, ta, ma
        vcompress.vm    v14, v8, v12
        vcpop.m a1, v12
        mv      a2, sp
        vsetvli zero, a1, e8, m2, ta, ma
        vse8.v  v14, (a2)
        vsetvli zero, a4, e8, m2, ta, ma
        vle8.v  v8, (a2)
        vsub.vv v8, v8, v10
        vse8.v  v8, (a0)
        addi    sp, s0, -128
        ld      ra, 120(sp)
        ld      s0, 112(sp)
        addi    sp, sp, 128
        ret

Is it necessary to have this section of the assembly?

        vse8.v  v14, (a2)
        vsetvli zero, a4, e8, m2, ta, ma
        vle8.v  v8, (a2)

I haven't read that much RISC-V Vector assembly yet, but my hunch is this could be done better.

@llvmbot
Member

llvmbot commented Oct 22, 2024

@llvm/issue-subscribers-backend-risc-v

Author: Niles Salter (Validark)


@Validark
Author

Same thing happens on Zen 4 as well:

compress:
        push    rbp
        mov     rbp, rsp
        and     rsp, -64
        sub     rsp, 128
        kmovq   k1, rdi
        vpcompressb     zmmword ptr [rsp] {k1}, zmm0
        vmovdqa64       zmm0, zmmword ptr [rsp]
        vpsubb  zmm0, zmm0, zmm1
        mov     rsp, rbp
        pop     rbp
        ret

@wangpc-pp
Contributor

I don't think it is doing anything wrong; it just keeps the store semantics of compressstore. If we want to remove the store and the following load, we should use in-register compression.

@Validark
Author

Yes, it is not doing anything incorrect, but it would be nice to optimize this away. I don't really have another way to access this functionality in Zig, so it would be great if using this cross-platform LLVM intrinsic gave me optimized code on RISC-V targets. Otherwise, I think I would have to dip into inline assembly, since we don't have access to vscale types in Zig.

@wangpc-pp
Contributor

Maybe you can try the ‘llvm.experimental.vector.compress.*’ intrinsics?

LLVM provides an intrinsic for compressing data within a vector based on a selection mask. Semantically, this is similar to llvm.masked.compressstore but with weaker assumptions and without storing the results to memory, i.e., the data remains in the vector.
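
For reference, a minimal hand-written sketch (not from the thread; the intrinsic takes value, mask, and passthru operands per the LangRef) of how the original example could keep the compressed data in a register instead of round-tripping through the stack:

define <64 x i8> @compress_inreg(<64 x i8> %vec, i64 %bitstr, <64 x i8> %vec2) {
  %mask = bitcast i64 %bitstr to <64 x i1>
  ; compress %vec according to %mask; lanes beyond the popcount are undefined,
  ; matching the undefined tail of the buffer in the original example
  %compressed = call <64 x i8> @llvm.experimental.vector.compress.v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> poison)
  %res = sub <64 x i8> %compressed, %vec2
  ret <64 x i8> %res
}

declare <64 x i8> @llvm.experimental.vector.compress.v64i8(<64 x i8>, <64 x i1>, <64 x i8>)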

@Validark
Author

Looks like it's not hooked up to the compress instruction on RISC-V targets. Godbolt

@wangpc-pp
Contributor

Oh sorry, this intrinsic was introduced three months ago and the RISC-V target doesn't support it yet. I will add a custom lowering for it today.

@wangpc-pp wangpc-pp self-assigned this Oct 22, 2024
@Validark
Author

Thank you!

@Validark
Author

I noticed we don't have an llvm.experimental.vector.expand intrinsic? That means I can't work around the expandload version of this issue:

fn expandload(ptr: *const @Vector(64, u8), bitstr: u64, fallback: @Vector(64, u8)) @Vector(64, u8) {
    return struct {
        extern fn @"llvm.masked.expandload.v64i8"(*const @Vector(64, u8), @Vector(64, u1), @Vector(64, u8)) callconv(.Unspecified) @Vector(64, u8);
    }.@"llvm.masked.expandload.v64i8"(ptr, @bitCast(bitstr), fallback);
}

export fn expand(vec: @Vector(64, u8), bitstr: u64) @Vector(64, u8) {
    return expandload(&vec, bitstr, @splat(0));
}

Compiled for Zen 5:

(I won't show the RISC-V version for now since your PR has not been merged yet.)

expand:
        push    rbp
        mov     rbp, rsp
        and     rsp, -64
        sub     rsp, 128
        vmovaps zmmword ptr [rsp], zmm0
        kmovq   k1, rdi
        vpexpandb       zmm0 {k1} {z}, zmmword ptr [rsp]
        mov     rsp, rbp
        pop     rbp
        ret

wangpc-pp added a commit to wangpc-pp/llvm-project that referenced this issue Oct 22, 2024
This intrinsic was introduced by llvm#92289 and currently we just expand
it for RISC-V.

This patch adds custom lowering for this intrinsic and simply maps
it to `vcompress` instruction.

Fixes llvm#113242.
@wangpc-pp
Contributor

Yeah, we don't have an intrinsic for decompress. Though it should be possible to synthesize it via vp.select, I think it's a nice-to-have thing. Will try to add one later.

wangpc-pp added a commit that referenced this issue Oct 23, 2024
This intrinsic was introduced by #92289 and currently we just expand
it for RISC-V.

This patch adds custom lowering for this intrinsic and simply maps
it to `vcompress` instruction.

Fixes #113242.
@wangpc-pp
Contributor

Yeah, we don't have an intrinsic for decompress. Though it should be possible to synthesize it via vp.select, I think it's a nice-to-have thing. Will try to add one later.

@Validark Can you please file another issue to track this?

@Validark
Author

Validark commented Oct 23, 2024

I am glad that your PR added support for llvm.experimental.vector.compress, but I don't think that actually solves this issue. It's nice to have support for that, of course, but I still think the round-trip to memory from compressstore should be optimizable in situations like this. Even though I don't need it as a workaround now thanks to your work, you could imagine that at some point in the future, after propagating a bunch of constants and inlining five functions deep, the compiler ends up with exactly the trivial code I wrote.

@Validark
Author

@Validark Can you please file another issue to track this?

#113422

@wangpc-pp
Contributor

Yeah, I agree that we may have some potential gains. For example, we can propagate the stored value to the later load if we can prove they are the same value, so that we can avoid the actual memory accesses. But I think we can't remove the store entirely; the optimization should not change the semantics.
Here is an example: https://godbolt.org/z/nP8PzqzWj
We can optimize

int foo(int* a, int b) {
    *a=b;
    return *a;
}

to

int foo(int* a, int b) {
    *a=b;
    return b;
}

So we don't really need to load the value from *a; we can return the value b directly. We can also do the same optimization for vector types in DAG Combine: https://godbolt.org/z/o5qrGeK4h

But for vectors, the precondition is that the store and the load cover the same value. In your example, the AVLs (a1 and a4) are different, which is why we can't do the propagation.

        vsetvli zero, a1, e8, m2, ta, ma
        vse8.v  v14, (a2)
        vsetvli zero, a4, e8, m2, ta, ma
        vle8.v  v8, (a2)
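
To illustrate the legal case, a minimal hand-written sketch (not taken from the Godbolt links): a full-width store followed by a full-width reload of the same <64 x i8> value, where the load can simply be replaced by the stored value.

define <64 x i8> @forwardable(<64 x i8> %v) {
  %buf = alloca [64 x i8], align 64
  ; the store and the reload cover exactly the same 64 bytes
  store <64 x i8> %v, ptr %buf, align 64
  %reload = load <64 x i8>, ptr %buf, align 64
  ; %reload can be folded to %v, after which the store and the alloca are dead
  ret <64 x i8> %reload
}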

@Validark
Author

Is there an intrinsic for a runtime vector shuffle? We could probably do with 16, 32, 64, and 128 byte lookup tables (and the index vector size can vary independently). Even if the zeroing semantics were not made consistent by LLVM, it would be extremely convenient to have those so I don't have to interact with vscale vectors. Should I open an issue for such a thing?

@wangpc-pp
Contributor

Maybe ‘shufflevector’ Instruction?

@topperc
Collaborator

topperc commented Oct 31, 2024

shufflevector only works with compile time constant indices.
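
A minimal illustration (hand-written): the mask operand of shufflevector must be a constant vector, so the permutation cannot depend on runtime values.

define <4 x i8> @reverse(<4 x i8> %a) {
  ; the mask <3, 2, 1, 0> has to be known at compile time
  %shuf = shufflevector <4 x i8> %a, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i8> %shuf
}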

@wangpc-pp
Contributor

Then we may use vp.select/vp.merge? But not all targets support VP intrinsics.

@Validark
Author

I don't think select and merge get you to an arbitrary runtime shufflevector. Are you just suggesting random intrinsics at this point? I want a runtime vrgather intrinsic that lets me deal with fixed-width vectors. What do you think of that?

@wangpc-pp
Contributor

Well, I was thinking we might be able to synthesize it via vp.select, vp.merge and other operations. But I just had a try and found it is really hard to express the semantics (kick me if someone finds a way). For your lookup-table scenarios, we can of course use an indexed load, but if we want to do it with in-register operations, I think it is hard with current LLVM IR (which means we may not be able to expand it without going through memory in DAGISel/GISel).
In my opinion, I encourage you to file an issue on GitHub and start a discussion on Discourse. I personally would like to see such a thing added. It may need more investigation to abstract over all the architectures.

@topperc
Collaborator

topperc commented Oct 31, 2024

The X86 backend has code to detect patterns of extractelts with variable offsets and insertelts with constant offsets and turn them into the equivalent of vrgather.vv. See LowerBUILD_VECTORAsVariablePermute in X86ISelLowering.cpp and the var-permute-*.ll tests in llvm/test/CodeGen/X86.
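
A small hand-written sketch of that pattern (a 4-element case for brevity; not taken from the LLVM tests): variable-index extractelements from the table feed constant-index insertelements, and a backend can recognize the resulting build_vector as a single variable permute.

define <4 x i8> @var_permute(<4 x i8> %table, <4 x i8> %idx) {
  ; extract the runtime indices
  %i0 = extractelement <4 x i8> %idx, i32 0
  %i1 = extractelement <4 x i8> %idx, i32 1
  %i2 = extractelement <4 x i8> %idx, i32 2
  %i3 = extractelement <4 x i8> %idx, i32 3
  ; gather the table elements at those (variable) positions
  %e0 = extractelement <4 x i8> %table, i8 %i0
  %e1 = extractelement <4 x i8> %table, i8 %i1
  %e2 = extractelement <4 x i8> %table, i8 %i2
  %e3 = extractelement <4 x i8> %table, i8 %i3
  ; rebuild the result with constant insert positions
  %v0 = insertelement <4 x i8> poison, i8 %e0, i32 0
  %v1 = insertelement <4 x i8> %v0, i8 %e1, i32 1
  %v2 = insertelement <4 x i8> %v1, i8 %e2, i32 2
  %v3 = insertelement <4 x i8> %v2, i8 %e3, i32 3
  ret <4 x i8> %v3
}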

@wangpc-pp
Contributor

Thanks! So it is possible to expand a future llvm.vector.gather intrinsic to sequential extractelement and insertelement!
