Skip to content

Commit 6590d0f

Browse files
authored
[DAGCombiner][ARM] Teach reduceLoadWidth to handle (and (srl (load), C), ShiftedMask) (#80342)
If we have a shifted mask, we may be able to reduce the load width to the width of the non-zero part of the mask and use an offset to the base address to remove the srl. The offset is given by C+trailingzeros(ShiftedMask). Then we add a final shl to restore the trailing zero bits. I've used the ARM test because that's where the existing (and (srl (load))) tests were. The X86 test was modified to keep the H register.
1 parent f72da9f commit 6590d0f

File tree

3 files changed

+132
-5
lines changed

3 files changed

+132
-5
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14198,7 +14198,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
1419814198
// away, but using an AND rather than a right shift. HasShiftedOffset is used
1419914199
// to indicate that the narrowed load should be left-shifted ShAmt bits to get
1420014200
// the result.
14201-
bool HasShiftedOffset = false;
14201+
unsigned ShiftedOffset = 0;
1420214202
// Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
1420314203
// extended to VT.
1420414204
if (Opc == ISD::SIGN_EXTEND_INREG) {
@@ -14243,7 +14243,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
1424314243
if (Mask.isMask()) {
1424414244
ActiveBits = Mask.countr_one();
1424514245
} else if (Mask.isShiftedMask(ShAmt, ActiveBits)) {
14246-
HasShiftedOffset = true;
14246+
ShiftedOffset = ShAmt;
1424714247
} else {
1424814248
return SDValue();
1424914249
}
@@ -14307,6 +14307,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
1430714307
SDNode *Mask = *(SRL->use_begin());
1430814308
if (SRL.hasOneUse() && Mask->getOpcode() == ISD::AND &&
1430914309
isa<ConstantSDNode>(Mask->getOperand(1))) {
14310+
unsigned Offset, ActiveBits;
1431014311
const APInt& ShiftMask = Mask->getConstantOperandAPInt(1);
1431114312
if (ShiftMask.isMask()) {
1431214313
EVT MaskedVT =
@@ -14315,6 +14316,18 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
1431514316
if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) &&
1431614317
TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT))
1431714318
ExtVT = MaskedVT;
14319+
} else if (ExtType == ISD::ZEXTLOAD &&
14320+
ShiftMask.isShiftedMask(Offset, ActiveBits) &&
14321+
(Offset + ShAmt) < VT.getScalarSizeInBits()) {
14322+
EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
14323+
// If the mask is shifted we can use a narrower load and a shl to insert
14324+
// the trailing zeros.
14325+
if (((Offset + ActiveBits) <= ExtVT.getScalarSizeInBits()) &&
14326+
TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT)) {
14327+
ExtVT = MaskedVT;
14328+
ShAmt = Offset + ShAmt;
14329+
ShiftedOffset = Offset;
14330+
}
1431814331
}
1431914332
}
1432014333

@@ -14400,12 +14413,12 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
1440014413
Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
1440114414
}
1440214415

14403-
if (HasShiftedOffset) {
14416+
if (ShiftedOffset != 0) {
1440414417
// We're using a shifted mask, so the load now has an offset. This means
1440514418
// that data has been loaded into the lower bytes than it would have been
1440614419
// before, so we need to shl the loaded data into the correct position in the
1440714420
// register.
14408-
SDValue ShiftC = DAG.getConstant(ShAmt, DL, VT);
14421+
SDValue ShiftC = DAG.getConstant(ShiftedOffset, DL, VT);
1440914422
Result = DAG.getNode(ISD::SHL, DL, VT, Result, ShiftC);
1441014423
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1441114424
}

llvm/test/CodeGen/ARM/shift-combine.ll

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1278,3 +1278,115 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32
12781278
%r = or <4 x i32> %or.ab, %or.cd
12791279
ret <4 x i32> %r
12801280
}
1281+
1282+
define arm_aapcscc i32 @test_shift15_and510(ptr nocapture %p) {
1283+
; CHECK-ARM-LABEL: test_shift15_and510:
1284+
; CHECK-ARM: @ %bb.0: @ %entry
1285+
; CHECK-ARM-NEXT: ldrb r0, [r0, #2]
1286+
; CHECK-ARM-NEXT: lsl r0, r0, #1
1287+
; CHECK-ARM-NEXT: bx lr
1288+
;
1289+
; CHECK-BE-LABEL: test_shift15_and510:
1290+
; CHECK-BE: @ %bb.0: @ %entry
1291+
; CHECK-BE-NEXT: ldrb r0, [r0, #1]
1292+
; CHECK-BE-NEXT: lsl r0, r0, #1
1293+
; CHECK-BE-NEXT: bx lr
1294+
;
1295+
; CHECK-THUMB-LABEL: test_shift15_and510:
1296+
; CHECK-THUMB: @ %bb.0: @ %entry
1297+
; CHECK-THUMB-NEXT: ldrb r0, [r0, #2]
1298+
; CHECK-THUMB-NEXT: lsls r0, r0, #1
1299+
; CHECK-THUMB-NEXT: bx lr
1300+
;
1301+
; CHECK-ALIGN-LABEL: test_shift15_and510:
1302+
; CHECK-ALIGN: @ %bb.0: @ %entry
1303+
; CHECK-ALIGN-NEXT: ldrb r0, [r0, #2]
1304+
; CHECK-ALIGN-NEXT: lsls r0, r0, #1
1305+
; CHECK-ALIGN-NEXT: bx lr
1306+
;
1307+
; CHECK-V6M-LABEL: test_shift15_and510:
1308+
; CHECK-V6M: @ %bb.0: @ %entry
1309+
; CHECK-V6M-NEXT: ldrb r0, [r0, #2]
1310+
; CHECK-V6M-NEXT: lsls r0, r0, #1
1311+
; CHECK-V6M-NEXT: bx lr
1312+
entry:
1313+
%load = load i32, ptr %p, align 4
1314+
%lshr = lshr i32 %load, 15
1315+
%and = and i32 %lshr, 510
1316+
ret i32 %and
1317+
}
1318+
1319+
define arm_aapcscc i32 @test_shift22_and1020(ptr nocapture %p) {
1320+
; CHECK-ARM-LABEL: test_shift22_and1020:
1321+
; CHECK-ARM: @ %bb.0: @ %entry
1322+
; CHECK-ARM-NEXT: ldrb r0, [r0, #3]
1323+
; CHECK-ARM-NEXT: lsl r0, r0, #2
1324+
; CHECK-ARM-NEXT: bx lr
1325+
;
1326+
; CHECK-BE-LABEL: test_shift22_and1020:
1327+
; CHECK-BE: @ %bb.0: @ %entry
1328+
; CHECK-BE-NEXT: ldrb r0, [r0]
1329+
; CHECK-BE-NEXT: lsl r0, r0, #2
1330+
; CHECK-BE-NEXT: bx lr
1331+
;
1332+
; CHECK-THUMB-LABEL: test_shift22_and1020:
1333+
; CHECK-THUMB: @ %bb.0: @ %entry
1334+
; CHECK-THUMB-NEXT: ldrb r0, [r0, #3]
1335+
; CHECK-THUMB-NEXT: lsls r0, r0, #2
1336+
; CHECK-THUMB-NEXT: bx lr
1337+
;
1338+
; CHECK-ALIGN-LABEL: test_shift22_and1020:
1339+
; CHECK-ALIGN: @ %bb.0: @ %entry
1340+
; CHECK-ALIGN-NEXT: ldrb r0, [r0, #3]
1341+
; CHECK-ALIGN-NEXT: lsls r0, r0, #2
1342+
; CHECK-ALIGN-NEXT: bx lr
1343+
;
1344+
; CHECK-V6M-LABEL: test_shift22_and1020:
1345+
; CHECK-V6M: @ %bb.0: @ %entry
1346+
; CHECK-V6M-NEXT: ldrb r0, [r0, #3]
1347+
; CHECK-V6M-NEXT: lsls r0, r0, #2
1348+
; CHECK-V6M-NEXT: bx lr
1349+
entry:
1350+
%load = load i32, ptr %p, align 4
1351+
%lshr = lshr i32 %load, 22
1352+
%and = and i32 %lshr, 1020
1353+
ret i32 %and
1354+
}
1355+
1356+
define arm_aapcscc i32 @test_zext_shift5_and2040(ptr nocapture %p) {
1357+
; CHECK-ARM-LABEL: test_zext_shift5_and2040:
1358+
; CHECK-ARM: @ %bb.0: @ %entry
1359+
; CHECK-ARM-NEXT: ldrb r0, [r0, #1]
1360+
; CHECK-ARM-NEXT: lsl r0, r0, #3
1361+
; CHECK-ARM-NEXT: bx lr
1362+
;
1363+
; CHECK-BE-LABEL: test_zext_shift5_and2040:
1364+
; CHECK-BE: @ %bb.0: @ %entry
1365+
; CHECK-BE-NEXT: ldrb r0, [r0]
1366+
; CHECK-BE-NEXT: lsl r0, r0, #3
1367+
; CHECK-BE-NEXT: bx lr
1368+
;
1369+
; CHECK-THUMB-LABEL: test_zext_shift5_and2040:
1370+
; CHECK-THUMB: @ %bb.0: @ %entry
1371+
; CHECK-THUMB-NEXT: ldrb r0, [r0, #1]
1372+
; CHECK-THUMB-NEXT: lsls r0, r0, #3
1373+
; CHECK-THUMB-NEXT: bx lr
1374+
;
1375+
; CHECK-ALIGN-LABEL: test_zext_shift5_and2040:
1376+
; CHECK-ALIGN: @ %bb.0: @ %entry
1377+
; CHECK-ALIGN-NEXT: ldrb r0, [r0, #1]
1378+
; CHECK-ALIGN-NEXT: lsls r0, r0, #3
1379+
; CHECK-ALIGN-NEXT: bx lr
1380+
;
1381+
; CHECK-V6M-LABEL: test_zext_shift5_and2040:
1382+
; CHECK-V6M: @ %bb.0: @ %entry
1383+
; CHECK-V6M-NEXT: ldrb r0, [r0, #1]
1384+
; CHECK-V6M-NEXT: lsls r0, r0, #3
1385+
; CHECK-V6M-NEXT: bx lr
1386+
entry:
1387+
%load = load i16, ptr %p, align 2
1388+
%zext = zext i16 %load to i32
1389+
%lshr = lshr i32 %zext, 5
1390+
%and = and i32 %lshr, 2040
1391+
ret i32 %and
1392+
}

llvm/test/CodeGen/X86/h-registers-2.ll

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@ define i32 @foo(ptr %x, i32 %y) nounwind {
99
; CHECK: # %bb.0:
1010
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
1111
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
12+
; CHECK-NEXT: imull %eax, %eax
1213
; CHECK-NEXT: movzbl %ah, %eax
1314
; CHECK-NEXT: movb $77, (%ecx,%eax,8)
1415
; CHECK-NEXT: shll $3, %eax
1516
; CHECK-NEXT: retl
1617

17-
%t0 = lshr i32 %y, 8 ; <i32> [#uses=1]
18+
%t4 = mul i32 %y, %y
19+
%t0 = lshr i32 %t4, 8 ; <i32> [#uses=1]
1820
%t1 = and i32 %t0, 255 ; <i32> [#uses=2]
1921
%t2 = shl i32 %t1, 3
2022
%t3 = getelementptr i8, ptr %x, i32 %t2 ; <ptr> [#uses=1]

0 commit comments

Comments (0)