@@ -862,6 +862,28 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(
   case X86::MMX_MOVD64rm:
   case X86::MMX_MOVQ64rm:
   // AVX-512
+  case X86::VPBROADCASTBZ128rm:
+  case X86::VPBROADCASTBZ256rm:
+  case X86::VPBROADCASTBZrm:
+  case X86::VBROADCASTF32X2Z256rm:
+  case X86::VBROADCASTF32X2Zrm:
+  case X86::VBROADCASTI32X2Z128rm:
+  case X86::VBROADCASTI32X2Z256rm:
+  case X86::VBROADCASTI32X2Zrm:
+  case X86::VPBROADCASTWZ128rm:
+  case X86::VPBROADCASTWZ256rm:
+  case X86::VPBROADCASTWZrm:
+  case X86::VPBROADCASTDZ128rm:
+  case X86::VPBROADCASTDZ256rm:
+  case X86::VPBROADCASTDZrm:
+  case X86::VBROADCASTSSZ128rm:
+  case X86::VBROADCASTSSZ256rm:
+  case X86::VBROADCASTSSZrm:
+  case X86::VPBROADCASTQZ128rm:
+  case X86::VPBROADCASTQZ256rm:
+  case X86::VPBROADCASTQZrm:
+  case X86::VBROADCASTSDZ256rm:
+  case X86::VBROADCASTSDZrm:
   case X86::VMOVSSZrm:
   case X86::VMOVSSZrm_alt:
   case X86::VMOVSDZrm:
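For context: marking these AVX-512 broadcast loads as trivially rematerializable lets the register allocator re-execute the cheap load instead of spilling and reloading its result. A minimal sketch of the effect, with hypothetical registers and the source pointer assumed in %rdi:

    # without rematerialization: the broadcast value is spilled and reloaded
    vpbroadcastd (%rdi), %zmm5
    vmovups      %zmm5, 64(%rsp)       # spill
    ...
    vmovups      64(%rsp), %zmm5       # reload

    # with rematerialization: the load is simply re-emitted at the later use
    vpbroadcastd (%rdi), %zmm5
    ...
    vpbroadcastd (%rdi), %zmm5         # rematerialized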
@@ -8067,6 +8089,39 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MOs.push_back(MachineOperand::CreateReg(0, false));
     break;
   }
+  case X86::VPBROADCASTBZ128rm:
+  case X86::VPBROADCASTBZ256rm:
+  case X86::VPBROADCASTBZrm:
+  case X86::VBROADCASTF32X2Z256rm:
+  case X86::VBROADCASTF32X2Zrm:
+  case X86::VBROADCASTI32X2Z128rm:
+  case X86::VBROADCASTI32X2Z256rm:
+  case X86::VBROADCASTI32X2Zrm:
+    // No instructions currently fuse with 8bits or 32bits x 2.
+    return nullptr;
+
+#define FOLD_BROADCAST(SIZE)                                                   \
+  MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands,          \
+             LoadMI.operands_begin() + NumOps);                                \
+  return foldMemoryBroadcast(MF, MI, Ops[0], MOs, InsertPt, /*Size=*/SIZE,     \
+                             /*AllowCommute=*/true);
+  case X86::VPBROADCASTWZ128rm:
+  case X86::VPBROADCASTWZ256rm:
+  case X86::VPBROADCASTWZrm:
+    FOLD_BROADCAST(16);
+  case X86::VPBROADCASTDZ128rm:
+  case X86::VPBROADCASTDZ256rm:
+  case X86::VPBROADCASTDZrm:
+  case X86::VBROADCASTSSZ128rm:
+  case X86::VBROADCASTSSZ256rm:
+  case X86::VBROADCASTSSZrm:
+    FOLD_BROADCAST(32);
+  case X86::VPBROADCASTQZ128rm:
+  case X86::VPBROADCASTQZ256rm:
+  case X86::VPBROADCASTQZrm:
+  case X86::VBROADCASTSDZ256rm:
+  case X86::VBROADCASTSDZrm:
+    FOLD_BROADCAST(64);
   default: {
     if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
       return nullptr;
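Each FOLD_BROADCAST case appends the load's address operands (the trailing X86::AddrNumOperands operands of LoadMI) to MOs and defers to foldMemoryBroadcast with the broadcast element width in bits. A hedged sketch of the rewrite this enables, using AVX-512's embedded-broadcast memory form (registers are hypothetical):

    # before folding: a separate broadcast load feeding the arithmetic
    vpbroadcastd (%rdi), %zmm1
    vpaddd       %zmm1, %zmm0, %zmm0

    # after folding: one instruction with an embedded broadcast operand
    vpaddd       (%rdi){1to16}, %zmm0, %zmm0

The byte and 32x2 broadcast cases bail out with nullptr instead because, as the comment above notes, no instruction currently fuses with those element shapes.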
@@ -8081,6 +8136,37 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
                                /*Size=*/0, Alignment, /*AllowCommute=*/true);
 }
 
+MachineInstr *
+X86InstrInfo::foldMemoryBroadcast(MachineFunction &MF, MachineInstr &MI,
+                                  unsigned OpNum, ArrayRef<MachineOperand> MOs,
+                                  MachineBasicBlock::iterator InsertPt,
+                                  unsigned BitsSize, bool AllowCommute) const {
+
+  if (auto *I = lookupBroadcastFoldTable(MI.getOpcode(), OpNum))
+    return matchBroadcastSize(*I, BitsSize)
+               ? FuseInst(MF, I->DstOp, OpNum, MOs, InsertPt, MI, *this)
+               : nullptr;
+
+  if (AllowCommute) {
+    // If the instruction and target operand are commutable, commute the
+    // instruction and try again.
+    unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
+    if (CommuteOpIdx2 == OpNum) {
+      printFailMsgforFold(MI, OpNum);
+      return nullptr;
+    }
+    MachineInstr *NewMI =
+        foldMemoryBroadcast(MF, MI, CommuteOpIdx2, MOs, InsertPt, BitsSize,
+                            /*AllowCommute=*/false);
+    if (NewMI)
+      return NewMI;
+    UndoCommuteForFold(MI, OpNum, CommuteOpIdx2);
+  }
+
+  printFailMsgforFold(MI, OpNum);
+  return nullptr;
+}
+
 static SmallVector<MachineMemOperand *, 2>
 extractLoadMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
   SmallVector<MachineMemOperand *, 2> LoadMMOs;
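foldMemoryBroadcast first consults the broadcast fold table for this opcode and operand; when the table entry's broadcast width matches BitsSize, FuseInst materializes the folded instruction. Otherwise, if AllowCommute is set, the two source operands are commuted and the fold is retried once (with AllowCommute cleared so the retry cannot recurse forever), and the commute is undone if that attempt fails as well. A hedged illustration of why the commute step pays off (hypothetical registers): when the broadcast value sits in the operand slot that cannot take a memory operand,

    vpbroadcastd (%rdi), %zmm1
    vpaddd       %zmm0, %zmm1, %zmm2

commuting the commutable vpaddd moves the load into the foldable slot:

    vpaddd       (%rdi){1to16}, %zmm0, %zmm2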