[AArch64] Optimization of repeated constant loads (#51483) #86249
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: hanbeom (ParkHanbum)

Changes: This change looks for cases where the same constant is loaded into both the upper and lower 32 bits of a 64-bit value. When such a case is found, it materializes only the lower 32-bit constant and replaces the remaining instructions with one that stores the value to the upper and lower halves simultaneously. An example is reproduced below.

Full diff: https://github.com/llvm/llvm-project/pull/86249.diff (1 file affected)
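The before/after MIR from the patch summary, reproduced here for reference:

renamable $x8 = MOVZXi 49370, 0
renamable $x8 = MOVKXi $x8, 320, 16
renamable $x8 = MOVKXi $x8, 49370, 32
renamable $x8 = MOVKXi $x8, 320, 48
STRXui killed renamable $x8, killed renamable $x0, 0

becomes

$w8 = MOVZWi 49370, 0
$w8 = MOVKWi $w8, 320, 16
STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0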
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 926a89466255ca..0b7dc874f614fc 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -199,6 +199,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find and merge a base register updates before or after a ld/st instruction.
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
+ // Finds and collapses loads of repeated constant values.
+ bool foldRepeatedConstantLoads(MachineBasicBlock::iterator &I,
+ unsigned Limit);
+ MachineBasicBlock::iterator tryToFoldRepeatedConstantLoads(
+ MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+ int SuccIndex, int Accumulated);
+
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2250,6 +2257,126 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
}
+static unsigned getMovValueOrder(unsigned Opc) {
+ switch (Opc) {
+ default:
+ return 0;
+ case AArch64::MOVNXi:
+ case AArch64::MOVZXi:
+ return 1;
+ case AArch64::MOVKXi:
+ return 2;
+ }
+
+ return 0;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::tryToFoldRepeatedConstantLoads(
+ MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+ int SuccIndex, int Accumulated) {
+ MachineBasicBlock::iterator I = MI.getIterator();
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+ MachineBasicBlock::iterator FirstMovI;
+ MachineBasicBlock *MBB = MI.getParent();
+ uint64_t Mask = 0xFFFFUL;
+ int Index = 0;
+
+ for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
+ if (Index == SuccIndex - 1) {
+ FirstMovI = *MI;
+ break;
+ }
+ (*MI)->eraseFromParent();
+ }
+
+ Register DstRegW =
+ TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+ BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVZWi),
+ DstRegW)
+ .addImm(Accumulated & Mask)
+ .addImm(0);
+ BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVKWi),
+ DstRegW)
+ .addUse(DstRegW)
+ .addImm((Accumulated >> 16) & Mask)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+ FirstMovI->eraseFromParent();
+
+ Register BaseReg = getLdStRegOp(MI).getReg();
+ const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+ DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+ unsigned DstRegState = getRegState(MI.getOperand(0));
+ BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+ .addReg(DstRegW, DstRegState)
+ .addReg(DstRegW, DstRegState)
+ .addReg(MO.getReg(), getRegState(MO))
+ .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+ .setMemRefs(MI.memoperands())
+ .setMIFlags(MI.getFlags());
+ I->eraseFromParent();
+
+ return NextI;
+}
+
+bool AArch64LoadStoreOpt::foldRepeatedConstantLoads(
+ MachineBasicBlock::iterator &I, unsigned Limit) {
+ MachineInstr &MI = *I;
+ if (MI.getOpcode() != AArch64::STRXui)
+ return false;
+
+ MachineBasicBlock::iterator MBBI = I;
+ MachineBasicBlock::iterator B = I->getParent()->begin();
+ if (MBBI == B)
+ return false;
+
+ Register BaseReg = getLdStRegOp(MI).getReg();
+ unsigned Count = 0, SuccIndex = 0;
+ uint64_t Accumulated = 0;
+ SmallVector<MachineBasicBlock::iterator> MIs;
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
+
+ do {
+ MBBI = prev_nodbg(MBBI, B);
+ MachineInstr &MI = *MBBI;
+ if (!MI.isTransient())
+ ++Count;
+ unsigned CurrOpc = MI.getOpcode();
+ unsigned ValueOrder = getMovValueOrder(CurrOpc);
+ if (!ValueOrder) {
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+ TRI);
+ continue;
+ }
+
+ MachineOperand Value = MI.getOperand(ValueOrder);
+ MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+ if (!Value.isImm() || !Shift.isImm())
+ return false;
+
+ uint64_t IValue = Value.getImm();
+ uint64_t IShift = Shift.getImm();
+ uint64_t mask = 0xFFFFUL;
+ Accumulated -= (Accumulated & (mask << IShift));
+ Accumulated += (IValue << IShift);
+ MIs.push_back(MBBI);
+
+ if (Accumulated != 0 && (Accumulated >> 32) == (Accumulated & UINT_MAX))
+ SuccIndex = Count;
+ if (!ModifiedRegUnits.available(BaseReg) ||
+ !UsedRegUnits.available(BaseReg))
+ break;
+ } while (MBBI != B && Count < Limit);
+
+ if (SuccIndex) {
+ I = tryToFoldRepeatedConstantLoads(MI, MIs, SuccIndex, Accumulated);
+ return true;
+ }
+
+ return false;
+}
+
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
@@ -2512,6 +2639,27 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
}
+ // We have an opportunity to optimize the `STRXui` instruction, which loads
+ // the same 32-bit value into a register twice. The `STPXi` instruction allows
+ // us to load a 32-bit value only once.
+ // Considering :
+ // mov x8, 49370
+ // movk x8, 320, lsl #16
+ // movk x8, 49370, lsl #32
+ // movk x8, 320, lsl #48
+ // str x8, [x0]
+ // Transform :
+ // mov w8, 49370
+ // movk w8, 320, lsl #16
+ // stp w8, w8, [x0]
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ if (foldRepeatedConstantLoads(MBBI, UpdateLimit))
+ Modified = true;
+ else
+ ++MBBI;
+ }
+
return Modified;
}
GCC already does this: https://gcc.godbolt.org/z/1a4o7Tvzz
It looks like gcc will transform repeated constants into an orr of the value with itself shifted left by 32. Is it worth doing that first? Then this could be simplified a little, to just having to look at the resulting orr. Can you add some testing too? Thanks
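As a rough sketch of the shape of that transform (the constant here is only an illustration, borrowed from the tests discussed later in this thread), GCC materializes the low 32 bits and then copies them into the high half with a single orr:

mov  x0, #0x5678
movk x0, #0x1234, lsl #16
orr  x0, x0, x0, lsl #32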
@davemgreen sure, consider it done.
Thanks! I think I would expect it to be an additional case in AArch64_IMM::expandMOVImm(..) that can check whether the two halves are the same.
Thanks for the tip!
@davemgreen Do we support the same ORR format as GCC? Could you advise which instruction I can use, please?
Hi - I believe you will need an ORRXrs. The ORRXri takes an immediate for the second operand. It's like one of these: https://gcc.godbolt.org/z/8aWhfWv4T (but in that case we might need to check that the top bits are known to be 0 when we convert it to a stp).
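For reference, a minimal sketch of the two forms mentioned here (the immediate value below is only an illustrative assumption):

orr x8, x8, x8, lsl #32   // ORRXrs: shifted-register form, suitable for duplicating the low 32 bits
orr x8, x8, #0xffff       // ORRXri: the second source operand is a logical immediate

The shifted-register form is the one that matches what GCC emits for repeated constants.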
@davemgreen :) thanks!
This handles some cases, but not all. Consider a repeated constant that is both stored and returned (a sketch follows below): you want mov+stp for the store, but you don't want to generate orr for the return. (Even for the cases where we generate three instructions, we're going to avoid "orr(x, x, lsl 32)" where we can, because it's more expensive than movk.) Maybe we should run this optimization before pseudo-expansion?
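A small sketch of the tension described above, assuming a repeated constant such as 0x0000555500005555 that is both stored through a pointer in x0 and then returned (an illustrative case, not the original example):

mov  w8, #0x5555
stp  w8, w8, [x0]            // store path: reuse the 32-bit half twice
mov  x0, #0x5555
movk x0, #0x5555, lsl #32    // return path: movk rather than orr x0, x0, x0, lsl #32
ret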
@efriedma-quic That works for me. But this is based on the assumption that "ORR(x, x, lsl 32)" was generated during pseudo-expansion.
You might be right that we might need both methods in the end, though. I was hoping we could add the orr(x, x, lsl 32) handling in expandMOVImm first.
On which processors? It looks like on Cortex-X2 and newer, "orr(x, x, lsl n)" has the same cost as movk, but looking through various sources, I can't find any other chip that does. Granted, it's probably still better than generating two MOVKs.
I think it's fine to assume this as a starting point; we can try to refine it in a follow-up.
Force-pushed 7082034 to 83ee099
Can I ask for a code review?
Hello. It would be best if there was one patch that added support for orr(x, x, lsl 32), so we can make sure that is tested and committed and doesn't cause any problems. The load/store optimizer change should then be an addition on top. Whenever we change these MIR optimization passes, there seems to be a high chance of subtle things going wrong.
(FYI, GitHub doesn't generate a notification when you push a new commit to your branch.)
@davemgreen got it, I'll split this.
Just leave a comment on the pull request noting that you've updated the patch. (Actually, I've seen GitHub generate notifications for new commits in some cases, but in this case it didn't; I'm not sure what the difference is.)
Force-pushed 83ee099 to b546f76
Force-pushed b546f76 to 1e6a5ff
Can you make the first half of this into a separate PR? Thanks
Force-pushed 1e6a5ff to 1d03a1b
@davemgreen Sorry, I misunderstood. The test case I created consists of only one MOV instruction. I'm not sure how to create a failing case here; could you please advise?
Thanks, this is looking decent to me. I think it is probably worth adding .ll file tests to check that everything works together properly. Maybe into a test like movw-const.ll.
These are some of the ones I tried. It would be good if 0x0000555500005555 still produced a mov+movk, and the last one is probably failing because of the f's. I'm not sure if there are other patterns that would be worth testing too?
define i64 @test0x1234567812345678() {
; CHECK-LABEL: test0x1234567812345678:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #22136 // =0x5678
; CHECK-NEXT: movk x0, #4660, lsl #16
; CHECK-NEXT: orr x0, x0, x0, lsl #32
; CHECK-NEXT: ret
ret i64 u0x1234567812345678
}
define i64 @test0xff3456ffff3456ff() {
; CHECK-LABEL: test0xff3456ffff3456ff:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #22271 // =0x56ff
; CHECK-NEXT: movk x0, #65332, lsl #16
; CHECK-NEXT: orr x0, x0, x0, lsl #32
; CHECK-NEXT: ret
ret i64 u0xff3456ffff3456ff
}
define i64 @test0x00345600345600() {
; CHECK-LABEL: test0x00345600345600:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #22016 // =0x5600
; CHECK-NEXT: movk x0, #52, lsl #16
; CHECK-NEXT: movk x0, #13398, lsl #32
; CHECK-NEXT: ret
ret i64 u0x00345600345600
}
define i64 @test0x5555555555555555() {
; CHECK-LABEL: test0x5555555555555555:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #6148914691236517205 // =0x5555555555555555
; CHECK-NEXT: ret
ret i64 u0x5555555555555555
}
define i64 @test0x5055555550555555() {
; CHECK-LABEL: test0x5055555550555555:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #6148914691236517205 // =0x5555555555555555
; CHECK-NEXT: and x0, x0, #0xf0fffffff0ffffff
; CHECK-NEXT: ret
ret i64 u0x5055555550555555
}
define i64 @test0x0000555555555555() {
; CHECK-LABEL: test0x0000555555555555:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #6148914691236517205 // =0x5555555555555555
; CHECK-NEXT: movk x0, #0, lsl #48
; CHECK-NEXT: ret
ret i64 u0x0000555555555555
}
define i64 @test0x0000555500005555() {
; CHECK-LABEL: test0x0000555500005555:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #21845 // =0x5555
; CHECK-NEXT: orr x0, x0, x0, lsl #32
; CHECK-NEXT: ret
ret i64 u0x0000555500005555
}
define i64 @testu0xffff5555ffff5555() {
; CHECK-LABEL: testu0xffff5555ffff5555:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #-43691 // =0xffffffffffff5555
; CHECK-NEXT: orr x0, x0, x0, lsl #32
; CHECK-NEXT: ret
ret i64 u0xffff5555ffff5555
}
I looked through the other uses of expandMOVImm and they all check Insn.size() == 1 or are only checking for the number of instructions.
Insn.push_back(
    {Opc, Imm16, AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)});
if (Imm != 0 && (Imm >> 32) == (Imm & UINT_MAX)) {
  Insn.push_back({BitSize == 32 ? AArch64::ORRWrs : AArch64::ORRXrs, 0, 32});
If BitSize == 32 is unreachable then it might be worth only adding AArch64::ORRXrs.
Thanks for splitting out the tests; they look good apart from the one with the f's where isNeg=true.
; CHECK-LABEL: testu0xffff5555ffff5555:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #-43691 // =0xffffffffffff5555
; CHECK-NEXT: orr x0, x0, x0, lsl #32
I think this one might be incorrect, because it won't `or` with the ff's from the first instruction.
Thank you very much for showing us how to write tests. I added two tests.
Thanks - I gave this a test and it did OK. It came up a decent amount, and I couldn't see any problems. Other than changing the UINT_MAX this LGTM.
// Now, we get 16-bit divided Imm. If high and low bits are same in
// 32-bit, there is an opportunity to reduce instruction.
if (Insn.size() > 2 && (Imm >> 32) == (Imm & UINT_MAX)) {
I would use 0xffffffff instead of UINT_MAX. They are likely always the same value, but the ff's are more explicit about the size.
got it!
Thanks, LGTM
Can you update the commit message? Then let me know if I should hit submit.
This change looks for cases of symmetric constant loading. `symmetric constant load` is when the upper 32 bits and lower 32 bits of a 64-bit register load the same value. When it finds this, it replaces it with an instruction that loads only the lower 32 bits of the constant and stores it in the upper and lower bits simultaneously.

For example:

renamable $x8 = MOVZXi 49370, 0
renamable $x8 = MOVKXi $x8, 320, 16
renamable $x8 = MOVKXi $x8, 49370, 32
renamable $x8 = MOVKXi $x8, 320, 48

becomes

renamable $x8 = MOVZXi 49370, 0
renamable $x8 = MOVKXi $x8, 320, 16
renamable $x8 = ORRXrs $x8, $x8, 32
Pushed in ce1a0d8.
This change looks for cases where the same constant is loaded into both the upper and lower 32 bits of a 64-bit value. When such a case is found, it materializes only the lower 32-bit constant and replaces the remaining instructions with one that stores the value to the upper and lower halves simultaneously.
For example:
renamable $x8 = MOVZXi 49370, 0
renamable $x8 = MOVKXi $x8, 320, 16
renamable $x8 = MOVKXi $x8, 49370, 32
renamable $x8 = MOVKXi $x8, 320, 48
STRXui killed renamable $x8, killed renamable $x0, 0
becomes
$w8 = MOVZWi 49370, 0
$w8 = MOVKWi $w8, 320, 16
STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0