@@ -199,6 +199,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
 
+  // Finds and collapses loads of repeated constant values.
+  bool foldRepeatedConstantLoads(MachineBasicBlock::iterator &I,
+                                 unsigned Limit);
+  MachineBasicBlock::iterator tryToFoldRepeatedConstantLoads(
+      MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+      int SuccIndex, int Accumulated);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2250,6 +2257,151 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
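+// Returns true if MI is a MOVZ/MOVK/ORR instruction that only operates on
+// BaseReg, i.e. one step of a sequence materializing a constant in BaseReg.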
+static bool isRepeatable(MachineInstr &MI, Register BaseReg) {
+  auto MatchBaseReg = [&](unsigned Count) {
+    for (unsigned I = 0; I < Count; I++) {
+      auto OpI = MI.getOperand(I);
+      if (OpI.isReg() && OpI.getReg() != BaseReg)
+        return false;
+    }
+    return true;
+  };
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::MOVZXi:
+    return MatchBaseReg(1);
+  case AArch64::MOVKXi:
+    return MatchBaseReg(2);
+  case AArch64::ORRXrs:
+  case AArch64::ORRWrs: {
+    // An ORR of BaseReg with itself shifted left duplicates the low half of
+    // the register: the shift is 32 for the X form and 16 for the W form.
+    MachineOperand &Imm = MI.getOperand(3);
+    unsigned BitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+    if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == BitSize)
+      return true;
+    break;
+  }
+  }
+
+  return false;
+}
+
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::tryToFoldRepeatedConstantLoads(
+    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+    int SuccIndex, int Accumulated) {
+  MachineBasicBlock::iterator I = MI.getIterator();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator FirstMovI;
+  MachineBasicBlock *MBB = MI.getParent();
+  uint64_t Mask = 0xFFFFUL;
+  int Index = 0;
+
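+  // MIs holds the matched chain in reverse program order. Erase everything up
+  // to the earliest matched MOV, which becomes the insertion point for the
+  // narrower 32-bit materialization below.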
+  for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
+    if (Index == SuccIndex - 1) {
+      FirstMovI = *MI;
+      break;
+    }
+    (*MI)->eraseFromParent();
+  }
+
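+  // Materialize the repeated 32-bit half once, with a MOVZ/MOVK pair on the
+  // W sub-register of the original destination.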
+  Register DstRegW =
+      TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+  BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVZWi),
+          DstRegW)
+      .addImm(Accumulated & Mask)
+      .addImm(0);
+  BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVKWi),
+          DstRegW)
+      .addUse(DstRegW)
+      .addImm((Accumulated >> 16) & Mask)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+  FirstMovI->eraseFromParent();
+
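+  // Replace the 64-bit STRXui with an STPWi that stores the 32-bit value
+  // twice. BaseReg here is the register being stored, not the address base.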
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  unsigned DstRegState = getRegState(MI.getOperand(0));
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+      .addReg(DstRegW, DstRegState)
+      .addReg(DstRegW, DstRegState)
+      .addReg(MO.getReg(), getRegState(MO))
+      .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+      .setMemRefs(MI.memoperands())
+      .setMIFlags(MI.getFlags());
+  I->eraseFromParent();
+
+  return NextI;
+}
+
+bool AArch64LoadStoreOpt::foldRepeatedConstantLoads(
+    MachineBasicBlock::iterator &I, unsigned Limit) {
+  MachineInstr &MI = *I;
+  if (MI.getOpcode() != AArch64::STRXui)
+    return false;
+
+  MachineBasicBlock::iterator MBBI = I;
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  if (MBBI == B)
+    return false;
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  unsigned Count = 0, SuccIndex = 0, DupBitSize = 0;
+  uint64_t Accumulated = 0;
+  SmallVector<MachineBasicBlock::iterator> MIs;
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+
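+  // Walk backwards from the store, rebuilding the immediate that the
+  // MOVZ/MOVK/ORR chain assembles into the stored register.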
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+    if (!MI.isTransient())
+      ++Count;
+    if (!isRepeatable(MI, BaseReg)) {
+      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+                                        TRI);
+      if (!ModifiedRegUnits.available(BaseReg) ||
+          !UsedRegUnits.available(BaseReg))
+        break;
+      continue;
+    }
+
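+    // A matching ORR records that the low half was duplicated; the
+    // duplication is applied once the leading MOVZ of the chain is reached.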
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AArch64::ORRXrs || Opc == AArch64::ORRWrs) {
+      DupBitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+      MIs.push_back(MBBI);
+      continue;
+    }
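+    // MOVZXi carries its immediate in operand 1, MOVKXi in operand 2.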
+    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+    MachineOperand Value = MI.getOperand(ValueOrder);
+    MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+    if (!Value.isImm() || !Shift.isImm())
+      return false;
+
+    uint64_t IValue = Value.getImm();
+    uint64_t IShift = Shift.getImm();
+    uint64_t Mask = 0xFFFFUL;
+    // Overwrite the 16-bit chunk at IShift with the new immediate.
+    Accumulated -= (Accumulated & (Mask << IShift));
+    Accumulated += (IValue << IShift);
+    MIs.push_back(MBBI);
+
+    if (ValueOrder == 1 && DupBitSize) {
+      Accumulated |= Accumulated << DupBitSize;
+      DupBitSize = 0;
+    }
+
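+    // A fold is possible once the accumulated value has identical high and
+    // low 32-bit halves; remember how deep into the chain that happens.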
+    if (Accumulated != 0 && (Accumulated >> 32) == (Accumulated & UINT_MAX))
+      SuccIndex = MIs.size();
+  } while (MBBI != B && Count < Limit);
+
+  if (SuccIndex) {
+    I = tryToFoldRepeatedConstantLoads(MI, MIs, SuccIndex, Accumulated);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
@@ -2512,6 +2664,27 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
       ++MBBI;
   }
 
+  // When a 64-bit register is built out of the same 32-bit value repeated in
+  // both halves and then stored with STRXui, materialize the 32-bit value
+  // only once and store it twice with STPWi.
+  // Considering:
+  //   mov  x8, 49370
+  //   movk x8, 320, lsl #16
+  //   movk x8, 49370, lsl #32
+  //   movk x8, 320, lsl #48
+  //   str  x8, [x0]
+  // Transform to:
+  //   mov  w8, 49370
+  //   movk w8, 320, lsl #16
+  //   stp  w8, w8, [x0]
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (foldRepeatedConstantLoads(MBBI, UpdateLimit))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }