Skip to content

Commit 906580b

Browse files
authored
[PowerPC] Add intrinsics for rldimi/rlwimi/rlwnm (#82968)
These builtins already exist in Clang; however, the current codegen may produce suboptimal results due to their complex behavior. Implement them as intrinsics to ensure that the expected instructions are emitted.
1 parent e4882d8 commit 906580b

File tree

7 files changed

+234
-105
lines changed

7 files changed

+234
-105
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17091,37 +17091,24 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
1709117091
}
1709217092
return Builder.CreateCall(CGM.getIntrinsic(ID), Ops, "");
1709317093
}
17094-
// Rotate and insert under mask operation.
17095-
// __rldimi(rs, is, shift, mask)
17096-
// (rotl64(rs, shift) & mask) | (is & ~mask)
17097-
// __rlwimi(rs, is, shift, mask)
17098-
// (rotl(rs, shift) & mask) | (is & ~mask)
1709917094
case PPC::BI__builtin_ppc_rldimi:
1710017095
case PPC::BI__builtin_ppc_rlwimi: {
1710117096
Value *Op0 = EmitScalarExpr(E->getArg(0));
1710217097
Value *Op1 = EmitScalarExpr(E->getArg(1));
1710317098
Value *Op2 = EmitScalarExpr(E->getArg(2));
1710417099
Value *Op3 = EmitScalarExpr(E->getArg(3));
17105-
llvm::Type *Ty = Op0->getType();
17106-
Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
17107-
if (BuiltinID == PPC::BI__builtin_ppc_rldimi)
17108-
Op2 = Builder.CreateZExt(Op2, Int64Ty);
17109-
Value *Shift = Builder.CreateCall(F, {Op0, Op0, Op2});
17110-
Value *X = Builder.CreateAnd(Shift, Op3);
17111-
Value *Y = Builder.CreateAnd(Op1, Builder.CreateNot(Op3));
17112-
return Builder.CreateOr(X, Y);
17113-
}
17114-
// Rotate and insert under mask operation.
17115-
// __rlwnm(rs, shift, mask)
17116-
// rotl(rs, shift) & mask
17100+
return Builder.CreateCall(
17101+
CGM.getIntrinsic(BuiltinID == PPC::BI__builtin_ppc_rldimi
17102+
? Intrinsic::ppc_rldimi
17103+
: Intrinsic::ppc_rlwimi),
17104+
{Op0, Op1, Op2, Op3});
17105+
}
1711717106
case PPC::BI__builtin_ppc_rlwnm: {
1711817107
Value *Op0 = EmitScalarExpr(E->getArg(0));
1711917108
Value *Op1 = EmitScalarExpr(E->getArg(1));
1712017109
Value *Op2 = EmitScalarExpr(E->getArg(2));
17121-
llvm::Type *Ty = Op0->getType();
17122-
Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
17123-
Value *Shift = Builder.CreateCall(F, {Op0, Op0, Op1});
17124-
return Builder.CreateAnd(Shift, Op2);
17110+
return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_rlwnm),
17111+
{Op0, Op1, Op2});
1712517112
}
1712617113
case PPC::BI__builtin_ppc_poppar4:
1712717114
case PPC::BI__builtin_ppc_poppar8: {

clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,8 @@ void test_builtin_ppc_rldimi() {
1616
// CHECK: %res = alloca i64, align 8
1717
// CHECK-NEXT: [[RA:%[0-9]+]] = load i64, ptr @ull, align 8
1818
// CHECK-NEXT: [[RB:%[0-9]+]] = load i64, ptr @ull, align 8
19-
// CHECK-NEXT: [[RC:%[0-9]+]] = call i64 @llvm.fshl.i64(i64 [[RA]], i64 [[RA]], i64 63)
20-
// CHECK-NEXT: [[RD:%[0-9]+]] = and i64 [[RC]], 72057593769492480
21-
// CHECK-NEXT: [[RE:%[0-9]+]] = and i64 [[RB]], -72057593769492481
22-
// CHECK-NEXT: [[RF:%[0-9]+]] = or i64 [[RD]], [[RE]]
23-
// CHECK-NEXT: store i64 [[RF]], ptr %res, align 8
19+
// CHECK-NEXT: [[RC:%[0-9]+]] = call i64 @llvm.ppc.rldimi(i64 [[RA]], i64 [[RB]], i32 63, i64 72057593769492480)
20+
// CHECK-NEXT: store i64 [[RC]], ptr %res, align 8
2421
// CHECK-NEXT: ret void
2522

2623
/*shift = 63, mask = 0x00FFFFFFF0000000 = 72057593769492480, ~mask = 0xFF0000000FFFFFFF = -72057593769492481*/
@@ -32,11 +29,8 @@ void test_builtin_ppc_rlwimi() {
3229
// CHECK: %res = alloca i32, align 4
3330
// CHECK-NEXT: [[RA:%[0-9]+]] = load i32, ptr @ui, align 4
3431
// CHECK-NEXT: [[RB:%[0-9]+]] = load i32, ptr @ui, align 4
35-
// CHECK-NEXT: [[RC:%[0-9]+]] = call i32 @llvm.fshl.i32(i32 [[RA]], i32 [[RA]], i32 31)
36-
// CHECK-NEXT: [[RD:%[0-9]+]] = and i32 [[RC]], 16776960
37-
// CHECK-NEXT: [[RE:%[0-9]+]] = and i32 [[RB]], -16776961
38-
// CHECK-NEXT: [[RF:%[0-9]+]] = or i32 [[RD]], [[RE]]
39-
// CHECK-NEXT: store i32 [[RF]], ptr %res, align 4
32+
// CHECK-NEXT: [[RC:%[0-9]+]] = call i32 @llvm.ppc.rlwimi(i32 [[RA]], i32 [[RB]], i32 31, i32 16776960)
33+
// CHECK-NEXT: store i32 [[RC]], ptr %res, align 4
4034
// CHECK-NEXT: ret void
4135

4236
/*shift = 31, mask = 0xFFFF00 = 16776960, ~mask = 0xFF0000FF = -16776961*/
@@ -47,9 +41,8 @@ void test_builtin_ppc_rlwnm() {
4741
// CHECK-LABEL: test_builtin_ppc_rlwnm
4842
// CHECK: %res = alloca i32, align 4
4943
// CHECK-NEXT: [[RA:%[0-9]+]] = load i32, ptr @ui, align 4
50-
// CHECK-NEXT: [[RB:%[0-9]+]] = call i32 @llvm.fshl.i32(i32 [[RA]], i32 [[RA]], i32 31)
51-
// CHECK-NEXT: [[RC:%[0-9]+]] = and i32 [[RB]], 511
52-
// CHECK-NEXT: store i32 [[RC]], ptr %res, align 4
44+
// CHECK-NEXT: [[RB:%[0-9]+]] = call i32 @llvm.ppc.rlwnm(i32 [[RA]], i32 31, i32 511)
45+
// CHECK-NEXT: store i32 [[RB]], ptr %res, align 4
5346
// CHECK-NEXT: ret void
5447

5548
/*shift = 31, mask = 0x1FF = 511*/
@@ -63,9 +56,8 @@ void test_builtin_ppc_rlwnm2(unsigned int shift) {
6356
// CHECK-NEXT: store i32 %shift, ptr %shift.addr, align 4
6457
// CHECK-NEXT: [[RA:%[0-9]+]] = load i32, ptr @ui, align 4
6558
// CHECK-NEXT: [[RB:%[0-9]+]] = load i32, ptr %shift.addr, align 4
66-
// CHECK-NEXT: [[RC:%[0-9]+]] = call i32 @llvm.fshl.i32(i32 [[RA]], i32 [[RA]], i32 [[RB]])
67-
// CHECK-NEXT: [[RD:%[0-9]+]] = and i32 [[RC]], 511
68-
// CHECK-NEXT: store i32 [[RD]], ptr %res, align 4
59+
// CHECK-NEXT: [[RC:%[0-9]+]] = call i32 @llvm.ppc.rlwnm(i32 [[RA]], i32 [[RB]], i32 511)
60+
// CHECK-NEXT: store i32 [[RC]], ptr %res, align 4
6961
// CHECK-NEXT: ret void
7062

7163
/*mask = 0x1FF = 511*/

llvm/include/llvm/IR/IntrinsicsPowerPC.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,18 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.".
182182
def int_ppc_fctuwz
183183
: ClangBuiltin<"__builtin_ppc_fctuwz">,
184184
DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
185+
def int_ppc_rldimi
186+
: ClangBuiltin<"__builtin_ppc_rldimi">,
187+
DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
188+
[IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
189+
def int_ppc_rlwimi
190+
: ClangBuiltin<"__builtin_ppc_rlwimi">,
191+
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
192+
[IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
193+
def int_ppc_rlwnm
194+
: ClangBuiltin<"__builtin_ppc_rlwnm">,
195+
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
196+
[IntrNoMem, ImmArg<ArgIndex<2>>]>;
185197

186198
// XL compatible select functions
187199
// TODO: Add llvm_f128_ty support.

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
//===----------------------------------------------------------------------===//
1212

1313
#include "PPCISelLowering.h"
14+
#include "MCTargetDesc/PPCMCTargetDesc.h"
1415
#include "MCTargetDesc/PPCPredicates.h"
1516
#include "PPC.h"
1617
#include "PPCCCState.h"
@@ -10762,6 +10763,42 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
1076210763
return DAG.getRegister(PPC::X13, MVT::i64);
1076310764
return DAG.getRegister(PPC::R2, MVT::i32);
1076410765

10766+
case Intrinsic::ppc_rldimi: {
10767+
uint64_t SH = Op.getConstantOperandVal(3);
10768+
unsigned MB = 0, ME = 0;
10769+
if (!isRunOfOnes64(Op.getConstantOperandVal(4), MB, ME) || ME != 63 - SH)
10770+
report_fatal_error("invalid rldimi mask!");
10771+
return SDValue(DAG.getMachineNode(
10772+
PPC::RLDIMI, dl, MVT::i64,
10773+
{Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
10774+
DAG.getTargetConstant(MB, dl, MVT::i32)}),
10775+
0);
10776+
}
10777+
10778+
case Intrinsic::ppc_rlwimi: {
10779+
unsigned MB = 0, ME = 0;
10780+
if (!isRunOfOnes(Op.getConstantOperandVal(4), MB, ME))
10781+
report_fatal_error("invalid rlwimi mask!");
10782+
return SDValue(DAG.getMachineNode(
10783+
PPC::RLWIMI, dl, MVT::i32,
10784+
{Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
10785+
DAG.getTargetConstant(MB, dl, MVT::i32),
10786+
DAG.getTargetConstant(ME, dl, MVT::i32)}),
10787+
0);
10788+
}
10789+
10790+
case Intrinsic::ppc_rlwnm: {
10791+
unsigned MB = 0, ME = 0;
10792+
if (!isRunOfOnes(Op.getConstantOperandVal(3), MB, ME))
10793+
report_fatal_error("invalid rlwnm mask!");
10794+
return SDValue(
10795+
DAG.getMachineNode(PPC::RLWNM, dl, MVT::i32,
10796+
{Op.getOperand(1), Op.getOperand(2),
10797+
DAG.getTargetConstant(MB, dl, MVT::i32),
10798+
DAG.getTargetConstant(ME, dl, MVT::i32)}),
10799+
0);
10800+
}
10801+
1076510802
case Intrinsic::ppc_mma_disassemble_acc: {
1076610803
if (Subtarget.isISAFuture()) {
1076710804
EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};

llvm/test/CodeGen/PowerPC/rldimi.ll

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,18 @@ entry:
5858
%8 = or i64 %6, %7
5959
ret i64 %8
6060
}
61+
62+
define i64 @rldimi_intrinsic(i64 %a) {
63+
; CHECK-LABEL: rldimi_intrinsic:
64+
; CHECK: # %bb.0:
65+
; CHECK-NEXT: rldimi 3, 3, 8, 0
66+
; CHECK-NEXT: rldimi 3, 3, 16, 0
67+
; CHECK-NEXT: rldimi 3, 3, 32, 0
68+
; CHECK-NEXT: blr
69+
%r1 = call i64 @llvm.ppc.rldimi(i64 %a, i64 %a, i32 8, i64 -256)
70+
%r2 = call i64 @llvm.ppc.rldimi(i64 %r1, i64 %r1, i32 16, i64 -65536)
71+
%r3 = call i64 @llvm.ppc.rldimi(i64 %r2, i64 %r2, i32 32, i64 -4294967296)
72+
ret i64 %r3
73+
}
74+
75+
declare i64 @llvm.ppc.rldimi(i64, i64, i32 immarg, i64 immarg)

llvm/test/CodeGen/PowerPC/rlwimi.ll

Lines changed: 85 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,117 @@
1-
; All of these ands and shifts should be folded into rlwimi's
2-
; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- | not grep and
3-
; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- | grep rlwimi | count 8
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
43

54
define i32 @test1(i32 %x, i32 %y) {
5+
; CHECK-LABEL: test1:
6+
; CHECK: # %bb.0: # %entry
7+
; CHECK-NEXT: rlwimi 4, 3, 16, 0, 15
8+
; CHECK-NEXT: mr 3, 4
9+
; CHECK-NEXT: blr
610
entry:
7-
%tmp.3 = shl i32 %x, 16 ; <i32> [#uses=1]
8-
%tmp.7 = and i32 %y, 65535 ; <i32> [#uses=1]
9-
%tmp.9 = or i32 %tmp.7, %tmp.3 ; <i32> [#uses=1]
10-
ret i32 %tmp.9
11+
%tmp.3 = shl i32 %x, 16
12+
%tmp.7 = and i32 %y, 65535
13+
%tmp.9 = or i32 %tmp.7, %tmp.3
14+
ret i32 %tmp.9
1115
}
1216

1317
define i32 @test2(i32 %x, i32 %y) {
18+
; CHECK-LABEL: test2:
19+
; CHECK: # %bb.0: # %entry
20+
; CHECK-NEXT: rlwimi 3, 4, 16, 0, 15
21+
; CHECK-NEXT: blr
1422
entry:
15-
%tmp.7 = and i32 %x, 65535 ; <i32> [#uses=1]
16-
%tmp.3 = shl i32 %y, 16 ; <i32> [#uses=1]
17-
%tmp.9 = or i32 %tmp.7, %tmp.3 ; <i32> [#uses=1]
18-
ret i32 %tmp.9
23+
%tmp.7 = and i32 %x, 65535
24+
%tmp.3 = shl i32 %y, 16
25+
%tmp.9 = or i32 %tmp.7, %tmp.3
26+
ret i32 %tmp.9
1927
}
2028

2129
define i32 @test3(i32 %x, i32 %y) {
30+
; CHECK-LABEL: test3:
31+
; CHECK: # %bb.0: # %entry
32+
; CHECK-NEXT: rlwimi 4, 3, 16, 16, 31
33+
; CHECK-NEXT: mr 3, 4
34+
; CHECK-NEXT: blr
2235
entry:
23-
%tmp.3 = lshr i32 %x, 16 ; <i32> [#uses=1]
24-
%tmp.6 = and i32 %y, -65536 ; <i32> [#uses=1]
25-
%tmp.7 = or i32 %tmp.6, %tmp.3 ; <i32> [#uses=1]
26-
ret i32 %tmp.7
36+
%tmp.3 = lshr i32 %x, 16
37+
%tmp.6 = and i32 %y, -65536
38+
%tmp.7 = or i32 %tmp.6, %tmp.3
39+
ret i32 %tmp.7
2740
}
2841

2942
define i32 @test4(i32 %x, i32 %y) {
43+
; CHECK-LABEL: test4:
44+
; CHECK: # %bb.0: # %entry
45+
; CHECK-NEXT: rlwimi 3, 4, 16, 16, 31
46+
; CHECK-NEXT: blr
3047
entry:
31-
%tmp.6 = and i32 %x, -65536 ; <i32> [#uses=1]
32-
%tmp.3 = lshr i32 %y, 16 ; <i32> [#uses=1]
33-
%tmp.7 = or i32 %tmp.6, %tmp.3 ; <i32> [#uses=1]
34-
ret i32 %tmp.7
48+
%tmp.6 = and i32 %x, -65536
49+
%tmp.3 = lshr i32 %y, 16
50+
%tmp.7 = or i32 %tmp.6, %tmp.3
51+
ret i32 %tmp.7
3552
}
3653

3754
define i32 @test5(i32 %x, i32 %y) {
55+
; CHECK-LABEL: test5:
56+
; CHECK: # %bb.0: # %entry
57+
; CHECK-NEXT: rlwimi 4, 3, 1, 0, 15
58+
; CHECK-NEXT: mr 3, 4
59+
; CHECK-NEXT: blr
3860
entry:
39-
%tmp.3 = shl i32 %x, 1 ; <i32> [#uses=1]
40-
%tmp.4 = and i32 %tmp.3, -65536 ; <i32> [#uses=1]
41-
%tmp.7 = and i32 %y, 65535 ; <i32> [#uses=1]
42-
%tmp.9 = or i32 %tmp.4, %tmp.7 ; <i32> [#uses=1]
43-
ret i32 %tmp.9
61+
%tmp.3 = shl i32 %x, 1
62+
%tmp.4 = and i32 %tmp.3, -65536
63+
%tmp.7 = and i32 %y, 65535
64+
%tmp.9 = or i32 %tmp.4, %tmp.7
65+
ret i32 %tmp.9
4466
}
4567

4668
define i32 @test6(i32 %x, i32 %y) {
69+
; CHECK-LABEL: test6:
70+
; CHECK: # %bb.0: # %entry
71+
; CHECK-NEXT: rlwimi 3, 4, 1, 0, 15
72+
; CHECK-NEXT: blr
4773
entry:
48-
%tmp.7 = and i32 %x, 65535 ; <i32> [#uses=1]
49-
%tmp.3 = shl i32 %y, 1 ; <i32> [#uses=1]
50-
%tmp.4 = and i32 %tmp.3, -65536 ; <i32> [#uses=1]
51-
%tmp.9 = or i32 %tmp.4, %tmp.7 ; <i32> [#uses=1]
52-
ret i32 %tmp.9
74+
%tmp.7 = and i32 %x, 65535
75+
%tmp.3 = shl i32 %y, 1
76+
%tmp.4 = and i32 %tmp.3, -65536
77+
%tmp.9 = or i32 %tmp.4, %tmp.7
78+
ret i32 %tmp.9
5379
}
5480

5581
define i32 @test7(i32 %x, i32 %y) {
82+
; CHECK-LABEL: test7:
83+
; CHECK: # %bb.0: # %entry
84+
; CHECK-NEXT: andis. 3, 3, 65535
85+
; CHECK-NEXT: rldimi 3, 4, 0, 48
86+
; CHECK-NEXT: blr
5687
entry:
57-
%tmp.2 = and i32 %x, -65536 ; <i32> [#uses=1]
58-
%tmp.5 = and i32 %y, 65535 ; <i32> [#uses=1]
59-
%tmp.7 = or i32 %tmp.5, %tmp.2 ; <i32> [#uses=1]
60-
ret i32 %tmp.7
88+
%tmp.2 = and i32 %x, -65536
89+
%tmp.5 = and i32 %y, 65535
90+
%tmp.7 = or i32 %tmp.5, %tmp.2
91+
ret i32 %tmp.7
6192
}
6293

6394
define i32 @test8(i32 %bar) {
95+
; CHECK-LABEL: test8:
96+
; CHECK: # %bb.0: # %entry
97+
; CHECK-NEXT: rlwimi 3, 3, 1, 30, 30
98+
; CHECK-NEXT: blr
6499
entry:
65-
%tmp.3 = shl i32 %bar, 1 ; <i32> [#uses=1]
66-
%tmp.4 = and i32 %tmp.3, 2 ; <i32> [#uses=1]
67-
%tmp.6 = and i32 %bar, -3 ; <i32> [#uses=1]
68-
%tmp.7 = or i32 %tmp.4, %tmp.6 ; <i32> [#uses=1]
69-
ret i32 %tmp.7
100+
%tmp.3 = shl i32 %bar, 1
101+
%tmp.4 = and i32 %tmp.3, 2
102+
%tmp.6 = and i32 %bar, -3
103+
%tmp.7 = or i32 %tmp.4, %tmp.6
104+
ret i32 %tmp.7
70105
}
106+
107+
define i32 @test9(i32 %a, i32 %b) {
108+
; CHECK-LABEL: test9:
109+
; CHECK: # %bb.0: # %entry
110+
; CHECK-NEXT: rlwimi 3, 4, 8, 20, 26
111+
; CHECK-NEXT: blr
112+
entry:
113+
%r = call i32 @llvm.ppc.rlwimi(i32 %a, i32 %b, i32 8, i32 4064)
114+
ret i32 %r
115+
}
116+
117+
declare i32 @llvm.ppc.rlwimi(i32, i32, i32 immarg, i32 immarg)

0 commit comments

Comments
 (0)