@@ -41,6 +41,7 @@ STATISTIC(NumPostFolded, "Number of post-index updates folded");
 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
 STATISTIC(NumUnscaledPairCreated,
           "Number of load/store from unscaled generated");
+STATISTIC(NumSmallTypeMerged, "Number of small type loads merged");
 
 static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                    cl::init(20), cl::Hidden);
@@ -77,12 +78,13 @@ typedef struct LdStPairFlags {
 
 struct AArch64LoadStoreOpt : public MachineFunctionPass {
   static char ID;
-  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
+  AArch64LoadStoreOpt() : MachineFunctionPass(ID), IsStrictAlign(false) {
     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
   }
 
   const AArch64InstrInfo *TII;
   const TargetRegisterInfo *TRI;
+  bool IsStrictAlign;
 
   // Scan the instructions looking for a load/store that can be combined
   // with the current instruction into a load/store pair.
@@ -122,6 +124,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);
 
+  // Find and merge foldable ldr/str instructions.
+  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
+
   bool optimizeBlock(MachineBasicBlock &MBB);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -151,6 +156,7 @@ static bool isUnscaledLdSt(unsigned Opc) {
   case AArch64::LDURWi:
   case AArch64::LDURXi:
   case AArch64::LDURSWi:
+  case AArch64::LDURHHi:
     return true;
   }
 }
@@ -159,6 +165,20 @@ static bool isUnscaledLdSt(MachineInstr *MI) {
   return isUnscaledLdSt(MI->getOpcode());
 }
 
+static bool isSmallTypeLdMerge(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
+    return true;
+    // FIXME: Add other instructions (e.g., LDRBBui, LDURSHWi, LDRSHWui, etc.).
+  }
+}
+static bool isSmallTypeLdMerge(MachineInstr *MI) {
+  return isSmallTypeLdMerge(MI->getOpcode());
+}
+
 // Scaling factor for unscaled load or store.
 static int getMemScale(MachineInstr *MI) {
   switch (MI->getOpcode()) {
@@ -168,6 +188,7 @@ static int getMemScale(MachineInstr *MI) {
   case AArch64::STRBBui:
     return 1;
   case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
   case AArch64::STRHHui:
     return 2;
   case AArch64::LDRSui:
@@ -238,6 +259,8 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
   case AArch64::STURSi:
   case AArch64::LDRSui:
   case AArch64::LDURSi:
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
     return Opc;
   case AArch64::LDRSWui:
     return AArch64::LDRWui;
@@ -283,6 +306,10 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
   case AArch64::LDRSWui:
   case AArch64::LDURSWi:
     return AArch64::LDPSWi;
+  case AArch64::LDRHHui:
+    return AArch64::LDRWui;
+  case AArch64::LDURHHi:
+    return AArch64::LDURWi;
   }
 }
 
@@ -440,6 +467,21 @@ static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
   return MI->getOperand(Idx);
 }
 
+// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
+static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
+                                   MachineInstr *Op1) {
+  assert(MI->memoperands_empty() && "expected a new machineinstr");
+  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
+                      (Op1->memoperands_end() - Op1->memoperands_begin());
+
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
+  MachineSDNode::mmo_iterator MemEnd =
+      std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
+  MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
+  MI->setMemRefs(MemBegin, MemEnd);
+}
+
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                       MachineBasicBlock::iterator Paired,
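As an aside, here is a rough standalone analogue of what concatenateMemOperands does to the memory-operand lists; it uses plain std::vector rather than LLVM's mmo_iterator arrays, and the names and element type are illustrative only. The point is simply that the merged load ends up carrying Op0's MachineMemOperands followed by Op1's, so later passes still see both memory locations.

#include <cassert>
#include <vector>

// Illustrative stand-in: append Op1's memory operands after Op0's.
std::vector<int> concatenateOperands(const std::vector<int> &Op0,
                                     const std::vector<int> &Op1) {
  std::vector<int> Merged;
  Merged.reserve(Op0.size() + Op1.size()); // numMemRefs in the patch
  Merged.insert(Merged.end(), Op0.begin(), Op0.end());
  Merged.insert(Merged.end(), Op1.begin(), Op1.end());
  return Merged; // corresponds to setMemRefs(MemBegin, MemEnd)
}

int main() {
  assert(concatenateOperands({1, 2}, {3}).size() == 3);
  return 0;
}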
@@ -484,8 +526,78 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
     RtMI = I;
     Rt2MI = Paired;
   }
-  // Handle Unscaled
+
   int OffsetImm = getLdStOffsetOp(RtMI).getImm();
+
+  if (isSmallTypeLdMerge(Opc)) {
+    // Change the scaled offset from small to large type.
+    if (!IsUnscaled)
+      OffsetImm /= 2;
+    MachineInstr *RtNewDest = MergeForward ? I : Paired;
+    // Construct the new load instruction.
+    // FIXME: currently we support only halfword unsigned load. We need to
+    // handle byte type, signed, and store instructions as well.
+    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
+    NewMemMI = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+                   .addOperand(getLdStRegOp(RtNewDest))
+                   .addOperand(BaseRegOp)
+                   .addImm(OffsetImm);
+
+    // Copy MachineMemOperands from the original loads.
+    concatenateMemOperands(NewMemMI, I, Paired);
+
+    DEBUG(
+        dbgs()
+        << "Creating the new load and extract. Replacing instructions:\n    ");
+    DEBUG(I->print(dbgs()));
+    DEBUG(dbgs() << "    ");
+    DEBUG(Paired->print(dbgs()));
+    DEBUG(dbgs() << "  with instructions:\n    ");
+    DEBUG((NewMemMI)->print(dbgs()));
+
+    MachineInstr *ExtDestMI = MergeForward ? Paired : I;
+    if (ExtDestMI == Rt2MI) {
+      // Create the bitfield extract for high half.
+      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::UBFMWri))
+                      .addOperand(getLdStRegOp(Rt2MI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(16)
+                      .addImm(31);
+      // Create the bitfield extract for low half.
+      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::ANDWri))
+                      .addOperand(getLdStRegOp(RtMI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(15);
+    } else {
+      // Create the bitfield extract for low half.
+      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::ANDWri))
+                      .addOperand(getLdStRegOp(RtMI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(15);
+      // Create the bitfield extract for high half.
+      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::UBFMWri))
+                      .addOperand(getLdStRegOp(Rt2MI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(16)
+                      .addImm(31);
+    }
+    DEBUG(dbgs() << "    ");
+    DEBUG((BitExtMI1)->print(dbgs()));
+    DEBUG(dbgs() << "    ");
+    DEBUG((BitExtMI2)->print(dbgs()));
+    DEBUG(dbgs() << "\n");
+
+    // Erase the old instructions.
+    I->eraseFromParent();
+    Paired->eraseFromParent();
+    return NextI;
+  }
+
+  // Handle Unscaled
   if (IsUnscaled)
     OffsetImm /= OffsetStride;
 
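For reference, a minimal host-side sketch of what the replacement computes on a little-endian target; the values and variable names below are made up for illustration. Two adjacent halfword loads become one 32-bit load; the low half is recovered with an AND against 0xffff (ANDWri's .addImm(15) is the encoded logical immediate for 0xffff) and the high half with a 16-bit unsigned bitfield extract (UBFMWri with immr=16, imms=31, i.e. ubfx #16, #16).

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint16_t Mem[2] = {0x1234, 0xabcd};   // what the two ldrh instructions read
  uint32_t W;
  std::memcpy(&W, Mem, sizeof(W));      // the single merged 32-bit ldr
  uint32_t Lo = W & 0xffffu;            // and  wLo, wNew, #0xffff
  uint32_t Hi = (W >> 16) & 0xffffu;    // ubfx wHi, wNew, #16, #16
  assert(Lo == 0x1234 && Hi == 0xabcd); // holds on a little-endian host
  return 0;
}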
@@ -622,8 +734,7 @@ static bool mayAlias(MachineInstr *MIa,
 /// be combined with the current instruction into a load/store pair.
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
-                                      LdStPairFlags &Flags,
-                                      unsigned Limit) {
+                                      LdStPairFlags &Flags, unsigned Limit) {
   MachineBasicBlock::iterator E = I->getParent()->end();
   MachineBasicBlock::iterator MBBI = I;
   MachineInstr *FirstMI = I;
@@ -645,7 +756,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
   // range, plus allow an extra one in case we find a later insn that matches
   // with Offset-1)
   int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
-  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
+  if (!isSmallTypeLdMerge(Opc) &&
+      !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
     return E;
 
   // Track which registers have been modified and used between the first insn
@@ -704,18 +816,32 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
       // If the resultant immediate offset of merging these instructions
       // is out of range for a pairwise instruction, bail and keep looking.
       bool MIIsUnscaled = isUnscaledLdSt(MI);
-      if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
+      bool IsSmallTypeLd = isSmallTypeLdMerge(MI->getOpcode());
+      if (!IsSmallTypeLd &&
+          !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
         trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
         MemInsns.push_back(MI);
         continue;
       }
-      // If the alignment requirements of the paired (scaled) instruction
-      // can't express the offset of the unscaled input, bail and keep
-      // looking.
-      if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
-        trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
-        MemInsns.push_back(MI);
-        continue;
+
+      if (IsSmallTypeLd) {
+        // If the alignment requirements of the larger type scaled load
+        // instruction can't express the scaled offset of the smaller type
+        // input, bail and keep looking.
+        if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
+          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+          MemInsns.push_back(MI);
+          continue;
+        }
+      } else {
+        // If the alignment requirements of the paired (scaled) instruction
+        // can't express the offset of the unscaled input, bail and keep
+        // looking.
+        if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
+          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+          MemInsns.push_back(MI);
+          continue;
+        }
       }
       // If the destination register of the loads is the same register, bail
       // and keep looking. A load-pair instruction with both destination
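A small worked example of the new check for the scaled case; the alignTo stand-in below is a simplified version of llvm::alignTo for non-negative values, and the offsets are made up. A scaled LDRHHui offset counts halfwords while the merged LDRWui offset counts words, so mergePairedInsns halves the offset (OffsetImm /= 2), which only works when the halfword offset is even.

#include <cassert>
#include <cstdint>

// Simplified stand-in for llvm::alignTo: round Value up to a multiple of Align.
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // ldrh at byte offsets #8 and #10 -> scaled halfword offsets 4 and 5.
  uint64_t MinOffset = 4;
  assert(alignTo(MinOffset, 2) == MinOffset); // even: mergeable, ldr offset #2
  // ldrh at byte offsets #6 and #8 -> scaled halfword offsets 3 and 4.
  MinOffset = 3;
  assert(alignTo(MinOffset, 2) != MinOffset); // odd: bail and keep looking
  return 0;
}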
@@ -996,24 +1122,94 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+bool AArch64LoadStoreOpt::tryToMergeLdStInst(
+    MachineBasicBlock::iterator &MBBI) {
+  MachineInstr *MI = MBBI;
+  MachineBasicBlock::iterator E = MI->getParent()->end();
+  // If this is a volatile load/store, don't mess with it.
+  if (MI->hasOrderedMemoryRef())
+    return false;
+
+  // Make sure this is a reg+imm (as opposed to an address reloc).
+  if (!getLdStOffsetOp(MI).isImm())
+    return false;
+
+  // Check if this load/store has a hint to avoid pair formation.
+  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+  if (TII->isLdStPairSuppressed(MI))
+    return false;
+
+  // Look ahead up to ScanLimit instructions for a pairable instruction.
+  LdStPairFlags Flags;
+  MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
+  if (Paired != E) {
+    if (isSmallTypeLdMerge(MI)) {
+      ++NumSmallTypeMerged;
+    } else {
+      ++NumPairCreated;
+      if (isUnscaledLdSt(MI))
+        ++NumUnscaledPairCreated;
+    }
+
+    // Merge the loads into a pair. Keeping the iterator straight is a
+    // pain, so we let the merge routine tell us what the next instruction
+    // is after it's done mucking about.
+    MBBI = mergePairedInsns(MBBI, Paired, Flags);
+    return true;
+  }
+  return false;
+}
+
 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
   bool Modified = false;
-  // Two tranformations to do here:
-  // 1) Find loads and stores that can be merged into a single load or store
+  // Three transformations to do here:
+  // 1) Find halfword loads that can be merged into a single 32-bit word load
+  //    with bitfield extract instructions.
+  //      e.g.,
+  //        ldrh w0, [x2]
+  //        ldrh w1, [x2, #2]
+  //        ; becomes
+  //        ldr w0, [x2]
+  //        ubfx w1, w0, #16, #16
+  //        and w0, w0, #ffff
+  // 2) Find loads and stores that can be merged into a single load or store
   //    pair instruction.
   //      e.g.,
   //        ldr x0, [x2]
   //        ldr x1, [x2, #8]
   //        ; becomes
   //        ldp x0, x1, [x2]
-  // 2) Find base register updates that can be merged into the load or store
+  // 3) Find base register updates that can be merged into the load or store
   //    as a base-reg writeback.
   //      e.g.,
   //        ldr x0, [x2]
   //        add x2, x2, #4
   //        ; becomes
   //        ldr x0, [x2], #4
 
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       !IsStrictAlign && MBBI != E;) {
+    MachineInstr *MI = MBBI;
+    switch (MI->getOpcode()) {
+    default:
+      // Just move on to the next instruction.
+      ++MBBI;
+      break;
+    // Scaled instructions.
+    case AArch64::LDRHHui:
+    // Unscaled instructions.
+    case AArch64::LDURHHi: {
+      if (tryToMergeLdStInst(MBBI)) {
+        Modified = true;
+        break;
+      }
+      ++MBBI;
+      break;
+    }
+      // FIXME: Do the other instructions.
+    }
+  }
+
   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
        MBBI != E;) {
     MachineInstr *MI = MBBI;
@@ -1046,35 +1242,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
     case AArch64::LDURWi:
     case AArch64::LDURXi:
     case AArch64::LDURSWi: {
-      // If this is a volatile load/store, don't mess with it.
-      if (MI->hasOrderedMemoryRef()) {
-        ++MBBI;
-        break;
-      }
-      // Make sure this is a reg+imm (as opposed to an address reloc).
-      if (!getLdStOffsetOp(MI).isImm()) {
-        ++MBBI;
-        break;
-      }
-      // Check if this load/store has a hint to avoid pair formation.
-      // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
-      if (TII->isLdStPairSuppressed(MI)) {
-        ++MBBI;
-        break;
-      }
-      // Look ahead up to ScanLimit instructions for a pairable instruction.
-      LdStPairFlags Flags;
-      MachineBasicBlock::iterator Paired =
-          findMatchingInsn(MBBI, Flags, ScanLimit);
-      if (Paired != E) {
-        ++NumPairCreated;
-        if (isUnscaledLdSt(MI))
-          ++NumUnscaledPairCreated;
-
-        // Merge the loads into a pair. Keeping the iterator straight is a
-        // pain, so we let the merge routine tell us what the next instruction
-        // is after it's done mucking about.
-        MBBI = mergePairedInsns(MBBI, Paired, Flags);
+      if (tryToMergeLdStInst(MBBI)) {
         Modified = true;
         break;
       }
1206
1374
bool AArch64LoadStoreOpt::runOnMachineFunction (MachineFunction &Fn) {
1207
1375
TII = static_cast <const AArch64InstrInfo *>(Fn.getSubtarget ().getInstrInfo ());
1208
1376
TRI = Fn.getSubtarget ().getRegisterInfo ();
1377
+ IsStrictAlign = (static_cast <const AArch64Subtarget &>(Fn.getSubtarget ()))
1378
+ .requiresStrictAlign ();
1209
1379
1210
1380
bool Modified = false ;
1211
1381
for (auto &MBB : Fn)