Skip to content

Commit 8662083

Browse files
committed
x86 atomic: optimize a.store(reg op a.load(acquire), release)
Summary: PR24191 finds that the expected memory-register operations aren't
generated when a relaxed { load ; modify ; store } sequence is used. This is
similar to PR17281, which was addressed in D4796, but only for
memory-immediate operations (and for memory orderings up to acquire and
release). This patch also handles some floating-point operations.

Reviewers: reames, kcc, dvyukov, nadav, morisset, chandlerc, t.p.northover, pete

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D11382

llvm-svn: 244128
1 parent 8ef3cda commit 8662083

File tree

5 files changed

+562
-119
lines changed

5 files changed

+562
-119
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20132,6 +20132,45 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
2013220132
return sinkMBB;
2013320133
}
2013420134

20135+
MachineBasicBlock *
20136+
X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI,
20137+
MachineBasicBlock *BB) const {
20138+
// Combine the following atomic floating-point modification pattern:
20139+
// a.store(reg OP a.load(acquire), release)
20140+
// Transform them into:
20141+
// OPss (%gpr), %xmm
20142+
// movss %xmm, (%gpr)
20143+
// Or sd equivalent for 64-bit operations.
20144+
unsigned MOp, FOp;
20145+
switch (MI->getOpcode()) {
20146+
default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
20147+
case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break;
20148+
case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break;
20149+
}
20150+
const X86InstrInfo *TII = Subtarget->getInstrInfo();
20151+
DebugLoc DL = MI->getDebugLoc();
20152+
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
20153+
unsigned MSrc = MI->getOperand(0).getReg();
20154+
unsigned VSrc = MI->getOperand(5).getReg();
20155+
MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp))
20156+
.addReg(/*Base=*/MSrc)
20157+
.addImm(/*Scale=*/1)
20158+
.addReg(/*Index=*/0)
20159+
.addImm(0)
20160+
.addReg(0);
20161+
MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp),
20162+
MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
20163+
.addReg(VSrc)
20164+
.addReg(/*Base=*/MSrc)
20165+
.addImm(/*Scale=*/1)
20166+
.addReg(/*Index=*/0)
20167+
.addImm(/*Disp=*/0)
20168+
.addReg(/*Segment=*/0);
20169+
MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill);
20170+
MI->eraseFromParent(); // The pseudo instruction is gone now.
20171+
return BB;
20172+
}
20173+
2013520174
MachineBasicBlock *
2013620175
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
2013720176
MachineBasicBlock *BB) const {
@@ -20687,6 +20726,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
2068720726
case X86::CMOV_V64I1:
2068820727
return EmitLoweredSelect(MI, BB);
2068920728

20729+
case X86::RELEASE_FADD32mr:
20730+
case X86::RELEASE_FADD64mr:
20731+
return EmitLoweredAtomicFP(MI, BB);
20732+
2069020733
case X86::FP32_TO_INT16_IN_MEM:
2069120734
case X86::FP32_TO_INT32_IN_MEM:
2069220735
case X86::FP32_TO_INT64_IN_MEM:

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1080,6 +1080,9 @@ namespace llvm {
10801080
MachineBasicBlock *EmitLoweredSelect(MachineInstr *I,
10811081
MachineBasicBlock *BB) const;
10821082

1083+
MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr *I,
1084+
MachineBasicBlock *BB) const;
1085+
10831086
MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI,
10841087
MachineBasicBlock *BB) const;
10851088

llvm/lib/Target/X86/X86InstrCompiler.td

Lines changed: 56 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -752,26 +752,40 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
752752

753753
/* The following multiclass tries to make sure that in code like
754754
* x.store (immediate op x.load(acquire), release)
755+
* and
756+
* x.store (register op x.load(acquire), release)
755757
* an operation directly on memory is generated instead of wasting a register.
756758
* It is not automatic as atomic_store/load are only lowered to MOV instructions
757759
* extremely late to prevent them from being accidentally reordered in the backend
758760
* (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions)
759761
*/
760762
multiclass RELEASE_BINOP_MI<string op> {
761763
def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
762-
"#RELEASE_BINOP PSEUDO!",
764+
"#BINOP "#NAME#"8mi PSEUDO!",
763765
[(atomic_store_8 addr:$dst, (!cast<PatFrag>(op)
764766
(atomic_load_8 addr:$dst), (i8 imm:$src)))]>;
767+
def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src),
768+
"#BINOP "#NAME#"8mr PSEUDO!",
769+
[(atomic_store_8 addr:$dst, (!cast<PatFrag>(op)
770+
(atomic_load_8 addr:$dst), GR8:$src))]>;
765771
// NAME#16 is not generated as 16-bit arithmetic instructions are considered
766772
// costly and avoided as far as possible by this backend anyway
767773
def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
768-
"#RELEASE_BINOP PSEUDO!",
774+
"#BINOP "#NAME#"32mi PSEUDO!",
769775
[(atomic_store_32 addr:$dst, (!cast<PatFrag>(op)
770776
(atomic_load_32 addr:$dst), (i32 imm:$src)))]>;
777+
def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
778+
"#BINOP "#NAME#"32mr PSEUDO!",
779+
[(atomic_store_32 addr:$dst, (!cast<PatFrag>(op)
780+
(atomic_load_32 addr:$dst), GR32:$src))]>;
771781
def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
772-
"#RELEASE_BINOP PSEUDO!",
782+
"#BINOP "#NAME#"64mi32 PSEUDO!",
773783
[(atomic_store_64 addr:$dst, (!cast<PatFrag>(op)
774784
(atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>;
785+
def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
786+
"#BINOP "#NAME#"64mr PSEUDO!",
787+
[(atomic_store_64 addr:$dst, (!cast<PatFrag>(op)
788+
(atomic_load_64 addr:$dst), GR64:$src))]>;
775789
}
776790
defm RELEASE_ADD : RELEASE_BINOP_MI<"add">;
777791
defm RELEASE_AND : RELEASE_BINOP_MI<"and">;
@@ -780,18 +794,41 @@ defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">;
780794
// Note: we don't deal with sub, because subtractions of constants are
781795
// optimized into additions before this code can run
782796

797+
// Same as above, but for floating-point.
798+
// FIXME: imm version.
799+
// FIXME: Version that doesn't clobber $src, using AVX's VADDSS.
800+
// FIXME: This could also handle SIMD operations with *ps and *pd instructions.
801+
let usesCustomInserter = 1 in {
802+
multiclass RELEASE_FP_BINOP_MI<string op> {
803+
def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src),
804+
"#BINOP "#NAME#"32mr PSEUDO!",
805+
[(atomic_store_32 addr:$dst,
806+
(i32 (bitconvert (!cast<PatFrag>(op)
807+
(f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))),
808+
FR32:$src))))]>, Requires<[HasSSE1]>;
809+
def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src),
810+
"#BINOP "#NAME#"64mr PSEUDO!",
811+
[(atomic_store_64 addr:$dst,
812+
(i64 (bitconvert (!cast<PatFrag>(op)
813+
(f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))),
814+
FR64:$src))))]>, Requires<[HasSSE2]>;
815+
}
816+
defm RELEASE_FADD : RELEASE_FP_BINOP_MI<"fadd">;
817+
// FIXME: Add fsub, fmul, fdiv, ...
818+
}
819+
783820
multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> {
784821
def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst),
785-
"#RELEASE_UNOP PSEUDO!",
822+
"#UNOP "#NAME#"8m PSEUDO!",
786823
[(atomic_store_8 addr:$dst, dag8)]>;
787824
def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst),
788-
"#RELEASE_UNOP PSEUDO!",
825+
"#UNOP "#NAME#"16m PSEUDO!",
789826
[(atomic_store_16 addr:$dst, dag16)]>;
790827
def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst),
791-
"#RELEASE_UNOP PSEUDO!",
828+
"#UNOP "#NAME#"32m PSEUDO!",
792829
[(atomic_store_32 addr:$dst, dag32)]>;
793830
def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst),
794-
"#RELEASE_UNOP PSEUDO!",
831+
"#UNOP "#NAME#"64m PSEUDO!",
795832
[(atomic_store_64 addr:$dst, dag64)]>;
796833
}
797834

@@ -821,42 +858,42 @@ defm RELEASE_NOT : RELEASE_UNOP<
821858
*/
822859

823860
def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
824-
"#RELEASE_MOV PSEUDO !",
861+
"#RELEASE_MOV8mi PSEUDO!",
825862
[(atomic_store_8 addr:$dst, (i8 imm:$src))]>;
826863
def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src),
827-
"#RELEASE_MOV PSEUDO !",
864+
"#RELEASE_MOV16mi PSEUDO!",
828865
[(atomic_store_16 addr:$dst, (i16 imm:$src))]>;
829866
def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
830-
"#RELEASE_MOV PSEUDO !",
867+
"#RELEASE_MOV32mi PSEUDO!",
831868
[(atomic_store_32 addr:$dst, (i32 imm:$src))]>;
832869
def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
833-
"#RELEASE_MOV PSEUDO !",
870+
"#RELEASE_MOV64mi32 PSEUDO!",
834871
[(atomic_store_64 addr:$dst, i64immSExt32:$src)]>;
835872

836873
def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
837-
"#RELEASE_MOV PSEUDO!",
874+
"#RELEASE_MOV8mr PSEUDO!",
838875
[(atomic_store_8 addr:$dst, GR8 :$src)]>;
839876
def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
840-
"#RELEASE_MOV PSEUDO!",
877+
"#RELEASE_MOV16mr PSEUDO!",
841878
[(atomic_store_16 addr:$dst, GR16:$src)]>;
842879
def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
843-
"#RELEASE_MOV PSEUDO!",
880+
"#RELEASE_MOV32mr PSEUDO!",
844881
[(atomic_store_32 addr:$dst, GR32:$src)]>;
845882
def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
846-
"#RELEASE_MOV PSEUDO!",
883+
"#RELEASE_MOV64mr PSEUDO!",
847884
[(atomic_store_64 addr:$dst, GR64:$src)]>;
848885

849886
def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
850-
"#ACQUIRE_MOV PSEUDO!",
887+
"#ACQUIRE_MOV8rm PSEUDO!",
851888
[(set GR8:$dst, (atomic_load_8 addr:$src))]>;
852889
def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
853-
"#ACQUIRE_MOV PSEUDO!",
890+
"#ACQUIRE_MOV16rm PSEUDO!",
854891
[(set GR16:$dst, (atomic_load_16 addr:$src))]>;
855892
def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
856-
"#ACQUIRE_MOV PSEUDO!",
893+
"#ACQUIRE_MOV32rm PSEUDO!",
857894
[(set GR32:$dst, (atomic_load_32 addr:$src))]>;
858895
def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
859-
"#ACQUIRE_MOV PSEUDO!",
896+
"#ACQUIRE_MOV64rm PSEUDO!",
860897
[(set GR64:$dst, (atomic_load_64 addr:$src))]>;
861898

862899
//===----------------------------------------------------------------------===//

llvm/lib/Target/X86/X86MCInstLower.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,17 +598,29 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
598598
case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
599599
case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify;
600600
case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
601+
case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
601602
case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
603+
case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
602604
case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify;
605+
case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
603606
case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
607+
case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
604608
case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
609+
case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
605610
case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify;
611+
case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
606612
case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
613+
case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
607614
case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
615+
case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
608616
case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
617+
case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
609618
case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
619+
case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
610620
case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
621+
case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
611622
case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify;
623+
case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify;
612624
case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify;
613625
case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify;
614626
case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify;

0 commit comments

Comments
 (0)