@@ -493,8 +493,8 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
   if (!ST->isWave32()) {
     // Combine lane 31 into lanes 32..63.
     V = B.CreateBitCast(V, IntNTy);
-    Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
-                                            {V, B.getInt32(31)});
+    Value *const Lane31 = B.CreateIntrinsic(
+        V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
 
     Value *UpdateDPPCall = B.CreateCall(
         UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
@@ -598,16 +598,16 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
 
   // Get the value required for atomic operation
   V = B.CreateBitCast(V, IntNTy);
-  Value *LaneValue =
-      B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});
+  Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
+                                       {V, LaneIdxInt});
   LaneValue = B.CreateBitCast(LaneValue, Ty);
 
   // Perform writelane if intermediate scan results are required later in the
   // kernel computations
   Value *OldValue = nullptr;
   if (NeedResult) {
     OldValue =
-        B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
+        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
                           {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
                            B.CreateBitCast(OldValuePhi, IntNTy)});
     OldValue = B.CreateBitCast(OldValue, Ty);
@@ -789,7 +789,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
       assert(TyBitWidth == 32);
       NewV = B.CreateBitCast(NewV, IntNTy);
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+      NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
                                {NewV, LastLaneIdx});
       NewV = B.CreateBitCast(NewV, Ty);
     }
@@ -936,10 +936,10 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
       Value *const ExtractHi =
           B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
-      CallInst *const ReadFirstLaneLo =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
-      CallInst *const ReadFirstLaneHi =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
+      CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
+          Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
+      CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
+          Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
       Value *const PartialInsert = B.CreateInsertElement(
           PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
       Value *const Insert =
@@ -948,7 +948,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
     } else if (TyBitWidth == 32) {
       Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
       BroadcastI =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, CastedPhi);
+          B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
       BroadcastI = B.CreateBitCast(BroadcastI, Ty);
 
     } else {
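
For context on the repeated pattern above (not part of the patch itself): every changed call site switches from the IRBuilder::CreateIntrinsic overload that takes a list of overload types to the overload that takes the result type explicitly, which lets the builder resolve the signature of the readlane/writelane/readfirstlane intrinsics from the value type being passed. Below is a minimal sketch of the two call styles, assuming a recent LLVM build with the AMDGPU intrinsic headers available; the demo function name readlane_demo is invented for illustration.

// Sketch only: builds i32 @readlane_demo(i32 %v) and emits an
// amdgcn.readlane call via the explicit-return-type overload of
// IRBuilder::CreateIntrinsic, mirroring the updated call sites above.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("readlane_demo", Ctx);
  IRBuilder<> B(Ctx);

  FunctionType *FTy =
      FunctionType::get(B.getInt32Ty(), {B.getInt32Ty()}, /*isVarArg=*/false);
  Function *F =
      Function::Create(FTy, Function::ExternalLinkage, "readlane_demo", M);
  B.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));

  Value *V = F->getArg(0);
  // Old style, as in the removed lines: overload types passed as a list.
  //   B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, B.getInt32(0)});
  // New style, as in the added lines: the result type is passed explicitly,
  // so the builder can pick the intrinsic signature from the value type.
  Value *Lane0 = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                   {V, B.getInt32(0)});
  B.CreateRet(Lane0);

  M.print(outs(), nullptr);
  return 0;
}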