@@ -41,6 +41,7 @@ STATISTIC(NumPostFolded, "Number of post-index updates folded");
 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
 STATISTIC(NumUnscaledPairCreated,
           "Number of load/store from unscaled generated");
+STATISTIC(NumSmallTypeMerged, "Number of small type loads merged");
 
 static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                    cl::init(20), cl::Hidden);
@@ -77,12 +78,13 @@ typedef struct LdStPairFlags {
 
 struct AArch64LoadStoreOpt : public MachineFunctionPass {
   static char ID;
-  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
+  AArch64LoadStoreOpt() : MachineFunctionPass(ID), IsStrictAlign(false) {
     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
   }
 
   const AArch64InstrInfo *TII;
   const TargetRegisterInfo *TRI;
+  bool IsStrictAlign;
 
   // Scan the instructions looking for a load/store that can be combined
   // with the current instruction into a load/store pair.
@@ -122,6 +124,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);
 
+  // Find and merge foldable ldr/str instructions.
+  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
+
   bool optimizeBlock(MachineBasicBlock &MBB);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -151,6 +156,7 @@ static bool isUnscaledLdSt(unsigned Opc) {
   case AArch64::LDURWi:
   case AArch64::LDURXi:
   case AArch64::LDURSWi:
+  case AArch64::LDURHHi:
     return true;
   }
 }
@@ -159,6 +165,20 @@ static bool isUnscaledLdSt(MachineInstr *MI) {
   return isUnscaledLdSt(MI->getOpcode());
 }
 
+static bool isSmallTypeLdMerge(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
+    return true;
+    // FIXME: Add other instructions (e.g, LDRBBui, LDURSHWi, LDRSHWui, etc.).
+  }
+}
+static bool isSmallTypeLdMerge(MachineInstr *MI) {
+  return isSmallTypeLdMerge(MI->getOpcode());
+}
+
 // Scaling factor for unscaled load or store.
 static int getMemScale(MachineInstr *MI) {
   switch (MI->getOpcode()) {
@@ -168,6 +188,7 @@ static int getMemScale(MachineInstr *MI) {
   case AArch64::STRBBui:
     return 1;
   case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
   case AArch64::STRHHui:
     return 2;
   case AArch64::LDRSui:
@@ -238,6 +259,8 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
   case AArch64::STURSi:
   case AArch64::LDRSui:
   case AArch64::LDURSi:
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
     return Opc;
   case AArch64::LDRSWui:
     return AArch64::LDRWui;
@@ -283,6 +306,10 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
   case AArch64::LDRSWui:
   case AArch64::LDURSWi:
     return AArch64::LDPSWi;
+  case AArch64::LDRHHui:
+    return AArch64::LDRWui;
+  case AArch64::LDURHHi:
+    return AArch64::LDURWi;
   }
 }
 
@@ -440,6 +467,21 @@ static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
   return MI->getOperand(Idx);
 }
 
+// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
+static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
+                                   MachineInstr *Op1) {
+  assert(MI->memoperands_empty() && "expected a new machineinstr");
+  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
+                      (Op1->memoperands_end() - Op1->memoperands_begin());
+
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
+  MachineSDNode::mmo_iterator MemEnd =
+      std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
+  MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
+  MI->setMemRefs(MemBegin, MemEnd);
+}
+
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                       MachineBasicBlock::iterator Paired,
@@ -484,8 +526,79 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
     RtMI = I;
     Rt2MI = Paired;
   }
-  // Handle Unscaled
+
   int OffsetImm = getLdStOffsetOp(RtMI).getImm();
+
+  if (isSmallTypeLdMerge(Opc)) {
+    // Change the scaled offset from small to large type.
+    if (!IsUnscaled)
+      OffsetImm /= 2;
+    MachineInstr *RtNewDest = MergeForward ? I : Paired;
+    // Construct the new load instruction.
+    // FIXME: currently we support only halfword unsigned load. We need to
+    // handle byte type, signed, and store instructions as well.
+    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
+    NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                       TII->get(NewOpc))
+                   .addOperand(getLdStRegOp(RtNewDest))
+                   .addOperand(BaseRegOp)
+                   .addImm(OffsetImm);
+
+    // Copy MachineMemOperands from the original loads.
+    concatenateMemOperands(NewMemMI, I, Paired);
+
+    DEBUG(
+        dbgs()
+        << "Creating the new load and extract. Replacing instructions:\n    ");
+    DEBUG(I->print(dbgs()));
+    DEBUG(dbgs() << "    ");
+    DEBUG(Paired->print(dbgs()));
+    DEBUG(dbgs() << "  with instructions:\n    ");
+    DEBUG((NewMemMI)->print(dbgs()));
+
+    MachineInstr *ExtDestMI = MergeForward ? Paired : I;
+    if (ExtDestMI == Rt2MI) {
+      // Create the bitfield extract for high half.
+      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::UBFMWri))
+                      .addOperand(getLdStRegOp(Rt2MI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(16)
+                      .addImm(31);
+      // Create the bitfield extract for low half.
+      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::ANDWri))
+                      .addOperand(getLdStRegOp(RtMI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(15);
+    } else {
+      // Create the bitfield extract for low half.
+      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::ANDWri))
+                      .addOperand(getLdStRegOp(RtMI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(15);
+      // Create the bitfield extract for high half.
+      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::UBFMWri))
+                      .addOperand(getLdStRegOp(Rt2MI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(16)
+                      .addImm(31);
+    }
+    DEBUG(dbgs() << "    ");
+    DEBUG((BitExtMI1)->print(dbgs()));
+    DEBUG(dbgs() << "    ");
+    DEBUG((BitExtMI2)->print(dbgs()));
+    DEBUG(dbgs() << "\n");
+
+    // Erase the old instructions.
+    I->eraseFromParent();
+    Paired->eraseFromParent();
+    return NextI;
+  }
+
+  // Handle Unscaled
   if (IsUnscaled)
     OffsetImm /= OffsetStride;
 
@@ -622,8 +735,7 @@ static bool mayAlias(MachineInstr *MIa,
 /// be combined with the current instruction into a load/store pair.
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
-                                      LdStPairFlags &Flags,
-                                      unsigned Limit) {
+                                      LdStPairFlags &Flags, unsigned Limit) {
   MachineBasicBlock::iterator E = I->getParent()->end();
   MachineBasicBlock::iterator MBBI = I;
   MachineInstr *FirstMI = I;
@@ -645,7 +757,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
   // range, plus allow an extra one in case we find a later insn that matches
   // with Offset-1)
   int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
-  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
+  if (!isSmallTypeLdMerge(Opc) &&
+      !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
     return E;
 
   // Track which registers have been modified and used between the first insn
@@ -704,18 +817,32 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
         // If the resultant immediate offset of merging these instructions
         // is out of range for a pairwise instruction, bail and keep looking.
         bool MIIsUnscaled = isUnscaledLdSt(MI);
-        if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
+        bool IsSmallTypeLd = isSmallTypeLdMerge(MI->getOpcode());
+        if (!IsSmallTypeLd &&
+            !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
           trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
           MemInsns.push_back(MI);
           continue;
         }
-        // If the alignment requirements of the paired (scaled) instruction
-        // can't express the offset of the unscaled input, bail and keep
-        // looking.
-        if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
-          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
-          MemInsns.push_back(MI);
-          continue;
+
+        if (IsSmallTypeLd) {
+          // If the alignment requirements of the larger type scaled load
+          // instruction can't express the scaled offset of the smaller type
+          // input, bail and keep looking.
+          if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
+            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+            MemInsns.push_back(MI);
+            continue;
+          }
+        } else {
+          // If the alignment requirements of the paired (scaled) instruction
+          // can't express the offset of the unscaled input, bail and keep
+          // looking.
+          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
+            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+            MemInsns.push_back(MI);
+            continue;
+          }
         }
         // If the destination register of the loads is the same register, bail
         // and keep looking. A load-pair instruction with both destination
@@ -996,24 +1123,94 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+bool AArch64LoadStoreOpt::tryToMergeLdStInst(
+    MachineBasicBlock::iterator &MBBI) {
+  MachineInstr *MI = MBBI;
+  MachineBasicBlock::iterator E = MI->getParent()->end();
+  // If this is a volatile load/store, don't mess with it.
+  if (MI->hasOrderedMemoryRef())
+    return false;
+
+  // Make sure this is a reg+imm (as opposed to an address reloc).
+  if (!getLdStOffsetOp(MI).isImm())
+    return false;
+
+  // Check if this load/store has a hint to avoid pair formation.
+  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+  if (TII->isLdStPairSuppressed(MI))
+    return false;
+
+  // Look ahead up to ScanLimit instructions for a pairable instruction.
+  LdStPairFlags Flags;
+  MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
+  if (Paired != E) {
+    if (isSmallTypeLdMerge(MI)) {
+      ++NumSmallTypeMerged;
+    } else {
+      ++NumPairCreated;
+      if (isUnscaledLdSt(MI))
+        ++NumUnscaledPairCreated;
+    }
+
+    // Merge the loads into a pair. Keeping the iterator straight is a
+    // pain, so we let the merge routine tell us what the next instruction
+    // is after it's done mucking about.
+    MBBI = mergePairedInsns(MBBI, Paired, Flags);
+    return true;
+  }
+  return false;
+}
+
 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
   bool Modified = false;
-  // Two tranformations to do here:
-  // 1) Find loads and stores that can be merged into a single load or store
+  // Three tranformations to do here:
+  // 1) Find halfword loads that can be merged into a single 32-bit word load
+  //    with bitfield extract instructions.
+  //      e.g.,
+  //        ldrh w0, [x2]
+  //        ldrh w1, [x2, #2]
+  //        ; becomes
+  //        ldr w0, [x2]
+  //        ubfx w1, w0, #16, #16
+  //        and w0, w0, #ffff
+  // 2) Find loads and stores that can be merged into a single load or store
   //    pair instruction.
   //      e.g.,
   //        ldr x0, [x2]
   //        ldr x1, [x2, #8]
   //        ; becomes
   //        ldp x0, x1, [x2]
-  // 2) Find base register updates that can be merged into the load or store
+  // 3) Find base register updates that can be merged into the load or store
   //    as a base-reg writeback.
   //      e.g.,
   //        ldr x0, [x2]
   //        add x2, x2, #4
   //        ; becomes
   //        ldr x0, [x2], #4
 
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       !IsStrictAlign && MBBI != E;) {
+    MachineInstr *MI = MBBI;
+    switch (MI->getOpcode()) {
+    default:
+      // Just move on to the next instruction.
+      ++MBBI;
+      break;
+    // Scaled instructions.
+    case AArch64::LDRHHui:
+    // Unscaled instructions.
+    case AArch64::LDURHHi: {
+      if (tryToMergeLdStInst(MBBI)) {
+        Modified = true;
+        break;
+      }
+      ++MBBI;
+      break;
+    }
+      // FIXME: Do the other instructions.
+    }
+  }
+
   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
        MBBI != E;) {
     MachineInstr *MI = MBBI;
@@ -1046,35 +1243,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
     case AArch64::LDURWi:
     case AArch64::LDURXi:
     case AArch64::LDURSWi: {
-      // If this is a volatile load/store, don't mess with it.
-      if (MI->hasOrderedMemoryRef()) {
-        ++MBBI;
-        break;
-      }
-      // Make sure this is a reg+imm (as opposed to an address reloc).
-      if (!getLdStOffsetOp(MI).isImm()) {
-        ++MBBI;
-        break;
-      }
-      // Check if this load/store has a hint to avoid pair formation.
-      // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
-      if (TII->isLdStPairSuppressed(MI)) {
-        ++MBBI;
-        break;
-      }
-      // Look ahead up to ScanLimit instructions for a pairable instruction.
-      LdStPairFlags Flags;
-      MachineBasicBlock::iterator Paired =
-          findMatchingInsn(MBBI, Flags, ScanLimit);
-      if (Paired != E) {
-        ++NumPairCreated;
-        if (isUnscaledLdSt(MI))
-          ++NumUnscaledPairCreated;
-
-        // Merge the loads into a pair. Keeping the iterator straight is a
-        // pain, so we let the merge routine tell us what the next instruction
-        // is after it's done mucking about.
-        MBBI = mergePairedInsns(MBBI, Paired, Flags);
+      if (tryToMergeLdStInst(MBBI)) {
         Modified = true;
         break;
       }
@@ -1206,6 +1375,8 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
   TRI = Fn.getSubtarget().getRegisterInfo();
+  IsStrictAlign = (static_cast<const AArch64Subtarget &>(Fn.getSubtarget()))
+                      .requiresStrictAlign();
 
   bool Modified = false;
   for (auto &MBB : Fn)