@@ -201,6 +201,16 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
  // Find and merge base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

+  // Finds and collapses loads of repeated constant values.
+  bool foldRepeatedConstantLoads(MachineBasicBlock::iterator &I,
+                                 unsigned Limit);
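+  // Rewrites the chain found by foldRepeatedConstantLoads: erases the
+  // trailing MOVKs, re-materializes the repeated 32-bit half, and replaces
+  // the STRXui with an STPWi.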
+  MachineBasicBlock::iterator tryToFoldRepeatedConstantLoads(
+      MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+      int SuccIndex, int Accumulated);
+
  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2252,6 +2262,178 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
  return E;
}

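+// Returns true if MI belongs to a chain that materializes a repeated
+// constant into BaseReg: a MOVZXi/MOVKXi whose register operands are all
+// BaseReg, or an ORR (shifted register) that copies the low half of
+// BaseReg into its high half, e.g. orr x8, x8, x8, lsl #32.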
+static bool isRepeatable(MachineInstr &MI, Register BaseReg) {
+  auto MatchBaseReg = [&](unsigned Count) {
+    for (unsigned I = 0; I < Count; I++) {
+      auto OpI = MI.getOperand(I);
+      if (OpI.isReg() && OpI.getReg() != BaseReg)
+        return false;
+    }
+    return true;
+  };
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::MOVZXi:
+    return MatchBaseReg(1);
+  case AArch64::MOVKXi:
+    return MatchBaseReg(2);
+  case AArch64::ORRXrs:
+  case AArch64::ORRWrs:
+    MachineOperand &Imm = MI.getOperand(3);
+    unsigned BitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+    if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == BitSize)
+      return true;
+  }
+
+  return false;
+}
+
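+// MIs holds the matched chain in bottom-up order (closest to the store
+// first); SuccIndex is the length of the prefix of MIs that produced a
+// value whose 32-bit halves match, and Accumulated is that folded
+// constant.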
+MachineBasicBlock::iterator AArch64LoadStoreOpt::tryToFoldRepeatedConstantLoads(
+    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+    int SuccIndex, int Accumulated) {
+  MachineBasicBlock::iterator I = MI.getIterator();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator FirstMovI;
+  MachineBasicBlock *MBB = MI.getParent();
+  uint64_t Mask = 0xFFFFUL;
+  int Index = 0;
+
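+  // Erase every instruction of the successful prefix except its last
+  // element, the original MOVZXi; that instruction marks where the 32-bit
+  // replacement sequence is inserted below.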
+  for (auto It = MIs.begin(), SE = MIs.end(); It != SE; ++It, Index++) {
+    if (Index == SuccIndex - 1) {
+      FirstMovI = *It;
+      break;
+    }
+    (*It)->eraseFromParent();
+  }
+
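+  // Materialize the repeated 32-bit half into the W sub-register with a
+  // MOVZWi/MOVKWi pair in place of the erased MOVZXi.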
+  Register DstRegW =
+      TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+  BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVZWi),
+          DstRegW)
+      .addImm(Accumulated & Mask)
+      .addImm(0);
+  BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVKWi),
+          DstRegW)
+      .addUse(DstRegW)
+      .addImm((Accumulated >> 16) & Mask)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+  FirstMovI->eraseFromParent();
+
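+  // Replace the 64-bit store with a pair store that writes the repeated
+  // 32-bit half twice to the same address.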
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  unsigned DstRegState = getRegState(MI.getOperand(0));
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+      .addReg(DstRegW, DstRegState)
+      .addReg(DstRegW, DstRegState)
+      .addReg(MO.getReg(), getRegState(MO))
+      .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+      .setMemRefs(MI.memoperands())
+      .setMIFlags(MI.getFlags());
+  I->eraseFromParent();
+
+  return NextI;
+}
+
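+// Looks backwards from a 64-bit store for a MOVZ/MOVK (or ORR-duplicate)
+// chain that builds a constant whose upper and lower 32 bits are equal.
+// On success, delegates to tryToFoldRepeatedConstantLoads and returns
+// true; the scan gives up after Limit non-transient instructions.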
+bool AArch64LoadStoreOpt::foldRepeatedConstantLoads(
+    MachineBasicBlock::iterator &I, unsigned Limit) {
+  MachineInstr &MI = *I;
+  if (MI.getOpcode() != AArch64::STRXui)
+    return false;
+
+  MachineBasicBlock::iterator MBBI = I;
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  if (MBBI == B)
+    return false;
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  unsigned Count = 0, SuccIndex = 0, DupBitSize = 0;
+  uint64_t Accumulated = 0;
+  SmallVector<MachineBasicBlock::iterator> MIs;
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+
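+  // Walk backwards from the store, collecting the chain and accumulating
+  // the constant it would materialize.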
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+    if (!MI.isTransient())
+      ++Count;
+    if (!isRepeatable(MI, BaseReg)) {
+      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+                                        TRI);
+      if (!ModifiedRegUnits.available(BaseReg) ||
+          !UsedRegUnits.available(BaseReg))
+        break;
+      continue;
+    }
+
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AArch64::ORRXrs || Opc == AArch64::ORRWrs) {
+      DupBitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+      MIs.push_back(MBBI);
+      continue;
+    }
+    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+    MachineOperand Value = MI.getOperand(ValueOrder);
+    MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+    if (!Value.isImm() || !Shift.isImm())
+      return false;
+
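+    // Insert the 16-bit field into the running value. For the motivating
+    // sequence (mov 49370; movk 320, lsl #16; movk 49370, lsl #32;
+    // movk 320, lsl #48) the fields are disjoint, so visiting them in
+    // reverse still yields Accumulated == 0x0140C0DA0140C0DA.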
+    uint64_t IValue = Value.getImm();
+    uint64_t IShift = Shift.getImm();
+    uint64_t Mask = 0xFFFFUL;
+    Accumulated -= Accumulated & (Mask << IShift);
+    Accumulated += IValue << IShift;
+    MIs.push_back(MBBI);
+
+    if (ValueOrder == 1 && DupBitSize) {
+      Accumulated |= Accumulated << DupBitSize;
+      DupBitSize = 0;
+    }
+
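+    // Record success whenever the two 32-bit halves agree; the latest
+    // (longest) matching prefix wins.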
+    if (Accumulated != 0 && (Accumulated >> 32) == (Accumulated & UINT_MAX))
+      SuccIndex = MIs.size();
+  } while (MBBI != B && Count < Limit);
+
+  if (SuccIndex) {
+    I = tryToFoldRepeatedConstantLoads(MI, MIs, SuccIndex, Accumulated);
+    return true;
+  }
+
+  return false;
+}
+
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
@@ -2518,6 +2700,29 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
      ++MBBI;
  }

+  // We can fold a 64-bit STRXui whose stored value has identical upper and
+  // lower 32-bit halves: materialize the 32-bit half once in a W register
+  // and store it twice with an STPWi, saving two MOVKs.
+  // Given:
+  //  mov     x8, 49370
+  //  movk    x8, 320, lsl #16
+  //  movk    x8, 49370, lsl #32
+  //  movk    x8, 320, lsl #48
+  //  str     x8, [x0]
+  // Transform to:
+  //  mov     w8, 49370
+  //  movk    w8, 320, lsl #16
+  //  stp     w8, w8, [x0]
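+  // The backward scan inside foldRepeatedConstantLoads is bounded by
+  // UpdateLimit, like the other update-merging scans in this pass.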
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (foldRepeatedConstantLoads(MBBI, UpdateLimit))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
  return Modified;
}