[RISC-V] Remove round-trip to memory when using compressstore #113242

Closed
Validark opened this issue Oct 22, 2024 · 22 comments · Fixed by #113291

@Validark

Validark commented Oct 22, 2024

This Zig code:

fn compressstore(vec: @Vector(64, u8), ptr: *@Vector(64, u8), bitstr: u64) void {
    return struct {
        extern fn @"llvm.masked.compressstore.v64i8"(@Vector(64, u8), *@Vector(64, u8), @Vector(64, u1)) callconv(.Unspecified) void;
    }.@"llvm.masked.compressstore.v64i8"(vec, ptr, @bitCast(bitstr));
}

export fn compress(vec: @Vector(64, u8), bitstr: u64, vec2: @Vector(64, u8)) @Vector(64, u8) {
    var buffer: [64]u8 align(64) = undefined;
    compressstore(vec, &buffer, bitstr);
    return buffer -% vec2;
}

Gives us this optimized LLVM code:

define dso_local void @compress(ptr noalias nocapture nonnull writeonly sret(<64 x i8>) %0, ptr nocapture noundef readonly %1, i64 %2, ptr nocapture noundef readonly %3) local_unnamed_addr {
Entry:
  %4 = alloca [64 x i8], align 64
  %5 = load <64 x i8>, ptr %1, align 64
  %6 = load <64 x i8>, ptr %3, align 64
    #dbg_value(<64 x i8> %5, !138, !DIExpression(), !139)
    #dbg_value(i64 %2, !140, !DIExpression(), !139)
    #dbg_value(<64 x i8> %6, !141, !DIExpression(), !139)
    #dbg_declare(ptr %4, !142, !DIExpression(), !144)
    #dbg_value(<64 x i8> %5, !145, !DIExpression(), !149)
    #dbg_value(ptr %4, !151, !DIExpression(), !149)
    #dbg_value(i64 %2, !152, !DIExpression(), !149)
  %7 = bitcast i64 %2 to <64 x i1>
  call fastcc void @llvm.masked.compressstore.v64i8(<64 x i8> %5, ptr nonnull align 64 %4, <64 x i1> %7)
  %8 = load <64 x i8>, ptr %4, align 64
  %9 = sub <64 x i8> %8, %6
  store <64 x i8> %9, ptr %0, align 64
  ret void
}

declare fastcc void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr nocapture, <64 x i1>) #1

Which gets emitted like so for spacemit_x60:

compress:
        addi    sp, sp, -128
        sd      ra, 120(sp)
        sd      s0, 112(sp)
        addi    s0, sp, 128
        andi    sp, sp, -64
        li      a4, 64
        vsetvli zero, a4, e8, m2, ta, ma
        vle8.v  v8, (a1)
        vle8.v  v10, (a3)
        vsetivli        zero, 1, e64, m1, ta, ma
        vmv.s.x v12, a2
        vsetvli zero, a4, e8, m2, ta, ma
        vcompress.vm    v14, v8, v12
        vcpop.m a1, v12
        mv      a2, sp
        vsetvli zero, a1, e8, m2, ta, ma
        vse8.v  v14, (a2)
        vsetvli zero, a4, e8, m2, ta, ma
        vle8.v  v8, (a2)
        vsub.vv v8, v8, v10
        vse8.v  v8, (a0)
        addi    sp, s0, -128
        ld      ra, 120(sp)
        ld      s0, 112(sp)
        addi    sp, sp, 128
        ret

Is it necessary to have this section of the assembly?

        vse8.v  v14, (a2)
        vsetvli zero, a4, e8, m2, ta, ma
        vle8.v  v8, (a2)

I haven't read that much RISC-V Vector assembly yet, but my hunch is this could be done better.

@llvmbot
Member

llvmbot commented Oct 22, 2024

@llvm/issue-subscribers-backend-risc-v

Author: Niles Salter (Validark)


@Validark
Author

Same thing happens on Zen 4 as well:

compress:
        push    rbp
        mov     rbp, rsp
        and     rsp, -64
        sub     rsp, 128
        kmovq   k1, rdi
        vpcompressb     zmmword ptr [rsp] {k1}, zmm0
        vmovdqa64       zmm0, zmmword ptr [rsp]
        vpsubb  zmm0, zmm0, zmm1
        mov     rsp, rbp
        pop     rbp
        ret

@wangpc-pp
Contributor

I don't think it is doing anything wrong; it just keeps the store semantics of compressstore. If we want to remove the store and the following load, we should use in-register compression.

@Validark
Author

Yes, it is not doing anything incorrect, but it would be nice to optimize this away. I don't really have another way to access this functionality in Zig, so it would be great if using this cross-platform LLVM intrinsic gave me optimized code on RISC-V targets. Otherwise, I think I would have to dip into inline assembly, since we don't have access to vscale types in Zig.

@wangpc-pp
Contributor

Maybe you can try the ‘llvm.experimental.vector.compress.*’ intrinsics?

LLVM provides an intrinsic for compressing data within a vector based on a selection mask. Semantically, this is similar to llvm.masked.compressstore but with weaker assumptions and without storing the results to memory, i.e., the data remains in the vector.
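
For reference, a minimal hand-written sketch (not from the thread; the intrinsic takes value, mask, and passthru operands per the LangRef) of how the original example could keep the compressed data in a register instead of round-tripping through the stack:

define <64 x i8> @compress_inreg(<64 x i8> %vec, i64 %bitstr, <64 x i8> %vec2) {
  %mask = bitcast i64 %bitstr to <64 x i1>
  ; compress %vec according to %mask; lanes beyond the popcount are undefined,
  ; matching the undefined tail of the buffer in the original example
  %compressed = call <64 x i8> @llvm.experimental.vector.compress.v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> poison)
  %res = sub <64 x i8> %compressed, %vec2
  ret <64 x i8> %res
}

declare <64 x i8> @llvm.experimental.vector.compress.v64i8(<64 x i8>, <64 x i1>, <64 x i8>)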

@Validark
Author

Looks like it's not hooked up to the compress instruction on RISC-V targets. Godbolt

@wangpc-pp
Contributor

Oh sorry, this intrinsic was introduced three months ago and the RISC-V target doesn't support it yet. I will add a custom lowering for it today.

@wangpc-pp wangpc-pp self-assigned this Oct 22, 2024
@Validark
Author

Thank you!

@Validark
Author

I noticed we don't have an llvm.experimental.vector.expand intrinsic? That means I can't work around the expandload version of this issue:

fn expandload(ptr: *const @Vector(64, u8), bitstr: u64, fallback: @Vector(64, u8)) @Vector(64, u8) {
    return struct {
        extern fn @"llvm.masked.expandload.v64i8"(*const @Vector(64, u8), @Vector(64, u1), @Vector(64, u8)) callconv(.Unspecified) @Vector(64, u8);
    }.@"llvm.masked.expandload.v64i8"(ptr, @bitCast(bitstr), fallback);
}

export fn expand(vec: @Vector(64, u8), bitstr: u64) @Vector(64, u8) {
    return expandload(&vec, bitstr, @splat(0));
}

Compiled for Zen 5:

(I won't show the RISC-V version for now since your PR has not been merged yet.)

expand:
        push    rbp
        mov     rbp, rsp
        and     rsp, -64
        sub     rsp, 128
        vmovaps zmmword ptr [rsp], zmm0
        kmovq   k1, rdi
        vpexpandb       zmm0 {k1} {z}, zmmword ptr [rsp]
        mov     rsp, rbp
        pop     rbp
        ret

wangpc-pp added a commit to wangpc-pp/llvm-project that referenced this issue Oct 22, 2024
This intrinsic was introduced by llvm#92289 and currently we just expand
it for RISC-V.

This patch adds custom lowering for this intrinsic and simply maps
it to `vcompress` instruction.

Fixes llvm#113242.
@wangpc-pp
Contributor

Yeah, we don't have an intrinsic for decompress. Though it should be possible to synthesize it via vp.select, I think it's a nice-to-have thing. Will try to add one later.

wangpc-pp added a commit that referenced this issue Oct 23, 2024
This intrinsic was introduced by #92289 and currently we just expand
it for RISC-V.

This patch adds custom lowering for this intrinsic and simply maps
it to `vcompress` instruction.

Fixes #113242.
@wangpc-pp
Contributor

Yeah, we don't have an intrinsic for decompress. Though it should be possible to synthesize it via vp.select, I think it's a nice-to-have thing. Will try to add one later.

@Validark Can you please file another issue to track this?

@Validark
Author

Validark commented Oct 23, 2024

I am glad that your PR added support for llvm.experimental.vector.compress, but I don't think that actually solves this issue. It's nice to have support for that, of course, but I still think the round-trip to memory from compressstore should be optimizable in situations like this. Even though I don't need it as a workaround now thanks to your work, you could imagine that at some point in the future, after propagating a bunch of constants and inlining five functions deep, the compiler ends up with exactly the trivial code I wrote.

@Validark
Author

@Validark Can you please file another issue to track this?

#113422

@wangpc-pp
Contributor

Yeah, I agree that we may have some potential gains. For example, we can propagate the stored value to the later load if we can prove they are the same value, so that we can avoid the actual memory accesses. But I think we can't remove the store entirely; the optimization should not change the semantics.
Here is an example: https://godbolt.org/z/nP8PzqzWj
We can optimize

int foo(int* a, int b) {
    *a=b;
    return *a;
}

to

int foo(int* a, int b) {
    *a=b;
    return b;
}

So we don't really need to load the value from *a; we can return the value b directly. We can also do the same optimization for vector types in DAG Combine: https://godbolt.org/z/o5qrGeK4h

But for vectors, the precondition is that the store and the load cover the same value. In your example, the AVLs (a1 and a4) are different, which is why we can't do the propagation.

        vsetvli zero, a1, e8, m2, ta, ma
        vse8.v  v14, (a2)
        vsetvli zero, a4, e8, m2, ta, ma
        vle8.v  v8, (a2)
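
To illustrate the legal case, a minimal hand-written sketch (not taken from the Godbolt links): a full-width store followed by a full-width reload of the same <64 x i8> value, where the load can simply be replaced by the stored value.

define <64 x i8> @forwardable(<64 x i8> %v) {
  %buf = alloca [64 x i8], align 64
  ; the store and the reload cover exactly the same 64 bytes
  store <64 x i8> %v, ptr %buf, align 64
  %reload = load <64 x i8>, ptr %buf, align 64
  ; %reload can be folded to %v, after which the store and the alloca are dead
  ret <64 x i8> %reload
}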

@Validark
Author

Is there an intrinsic for a runtime vector shuffle? We could probably do with 16, 32, 64, and 128 byte lookup tables (and the index vector size can vary independently). Even if the zeroing semantics were not made consistent by LLVM, it would be extremely convenient to have those so I don't have to interact with vscale vectors. Should I open an issue for such a thing?

@wangpc-pp
Contributor

Maybe ‘shufflevector’ Instruction?

@topperc
Collaborator

topperc commented Oct 31, 2024

shufflevector only works with compile time constant indices.
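
A minimal illustration (hand-written): the mask operand of shufflevector must be a constant vector, so the permutation cannot depend on runtime values.

define <4 x i8> @reverse(<4 x i8> %a) {
  ; the mask <3, 2, 1, 0> has to be known at compile time
  %shuf = shufflevector <4 x i8> %a, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i8> %shuf
}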

@wangpc-pp
Contributor

Then we may use vp.select/vp.merge? But not all targets support VP intrinsics.

@Validark
Author

I don't think select and merge get you to an arbitrary runtime shufflevector. Are you just suggesting random intrinsics at this point? I want a runtime vrgather intrinsic that lets me deal with fixed-width vectors. What do you think of that?

@wangpc-pp
Contributor

Well, I was thinking we might be able to synthesize it via vp.select, vp.merge and other operations. But I just had a try and found it is really hard to express the semantics (kick me if someone finds a way). For your lookup-table scenarios, we can of course use an indexed load, but if we want to do it with in-register operations, I think it is hard with current LLVM IR (which means we may not be able to expand it without going through memory in DAGISel/GISel).
In my opinion, I encourage you to file an issue on GitHub and start a discussion on Discourse. I personally would like to see such a thing added. It may need more investigation to abstract over all the architectures.

@topperc
Collaborator

topperc commented Oct 31, 2024

The X86 backend has code to detect patterns of extractelts with variable offsets and insertelts with constant offsets and turn them into the equivalent of vrgather.vv. See LowerBUILD_VECTORAsVariablePermute in X86ISelLowering.cpp and the var-permute-*.ll tests in llvm/test/CodeGen/X86.
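
A small hand-written sketch of that pattern (a 4-element case for brevity; not taken from the LLVM tests): variable-index extractelements from the table feed constant-index insertelements, and a backend can recognize the resulting build_vector as a single variable permute.

define <4 x i8> @var_permute(<4 x i8> %table, <4 x i8> %idx) {
  ; extract the runtime indices
  %i0 = extractelement <4 x i8> %idx, i32 0
  %i1 = extractelement <4 x i8> %idx, i32 1
  %i2 = extractelement <4 x i8> %idx, i32 2
  %i3 = extractelement <4 x i8> %idx, i32 3
  ; gather the table elements at those (variable) positions
  %e0 = extractelement <4 x i8> %table, i8 %i0
  %e1 = extractelement <4 x i8> %table, i8 %i1
  %e2 = extractelement <4 x i8> %table, i8 %i2
  %e3 = extractelement <4 x i8> %table, i8 %i3
  ; rebuild the result with constant insert positions
  %v0 = insertelement <4 x i8> poison, i8 %e0, i32 0
  %v1 = insertelement <4 x i8> %v0, i8 %e1, i32 1
  %v2 = insertelement <4 x i8> %v1, i8 %e2, i32 2
  %v3 = insertelement <4 x i8> %v2, i8 %e3, i32 3
  ret <4 x i8> %v3
}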

@wangpc-pp
Contributor

Thanks! So it is possible to expand a future llvm.vector.gather intrinsic to sequential extractelement and insertelement!
