Skip to content

Commit a59ed00

Browse files
committed
Add a latency tolerance based on the reduction in data load size
Allow a 1-cycle latency regression per 128 bits of constant data saved (very basic rule of thumb).
1 parent 24e1e9a commit a59ed00

File tree

4 files changed

+20
-17
lines changed

4 files changed

+20
-17
lines changed

llvm/lib/Target/X86/X86FixupVectorConstants.cpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -358,24 +358,27 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
358358
RebuildConstant;
359359
};
360360

361-
auto NewOpcPreferable = [&](const FixupEntry &Fixup) -> bool {
361+
auto NewOpcPreferable = [&](const FixupEntry &Fixup, unsigned RegBitWidth) -> bool {
362362
if (SM->hasInstrSchedModel()) {
363-
// TODO: how much increase in tput/latency should we permit for the
364-
// reduction in constant pool size?
365363
unsigned NewOpc = Fixup.Op;
366364
auto *OldDesc = SM->getSchedClassDesc(TII->get(Opc).getSchedClass());
367365
auto *NewDesc = SM->getSchedClassDesc(TII->get(NewOpc).getSchedClass());
366+
unsigned BitsSaved = RegBitWidth - (Fixup.NumCstElts * Fixup.MemBitWidth);
368367

369-
// Compare tput -> lat
368+
// Compare tput/lat - avoid any regressions, but allow extra cycle of
369+
// latency in exchange for each 128-bit (or less) constant pool reduction
370+
// (this is a very simple cost:benefit estimate - there will probably be
371+
// better ways to calculate this).
370372
double OldTput = MCSchedModel::getReciprocalThroughput(*ST, *OldDesc);
371373
double NewTput = MCSchedModel::getReciprocalThroughput(*ST, *NewDesc);
372374
if (OldTput != NewTput)
373375
return NewTput < OldTput;
374376

377+
int LatTol = (BitsSaved + 127) / 128;
375378
int OldLat = MCSchedModel::computeInstrLatency(*ST, *OldDesc);
376379
int NewLat = MCSchedModel::computeInstrLatency(*ST, *NewDesc);
377380
if (OldLat != NewLat)
378-
return NewLat < OldLat;
381+
return NewLat < (OldLat + LatTol);
379382
}
380383

381384
// We either were unable to get tput/lat or all values were equal.
@@ -399,7 +402,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
399402
unsigned CstBitWidth = C->getType()->getPrimitiveSizeInBits();
400403
RegBitWidth = RegBitWidth ? RegBitWidth : CstBitWidth;
401404
for (const FixupEntry &Fixup : Fixups) {
402-
if (Fixup.Op && (OptSize || NewOpcPreferable(Fixup))) {
405+
if (Fixup.Op && (OptSize || NewOpcPreferable(Fixup, RegBitWidth))) {
403406
// Construct a suitable constant and adjust the MI to use the new
404407
// constant pool entry.
405408
if (Constant *NewCst = Fixup.RebuildConstant(

llvm/test/CodeGen/X86/pr38639.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
define <8 x double> @test(<4 x double> %a, <4 x double> %b) {
55
; CHECK-LABEL: test:
66
; CHECK: # %bb.0:
7-
; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [u,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1]
7+
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1]
88
; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
99
; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1010
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]

llvm/test/CodeGen/X86/recip-fastmath.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -714,7 +714,7 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
714714
;
715715
; BTVER2-LABEL: v8f32_no_estimate:
716716
; BTVER2: # %bb.0:
717-
; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
717+
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
718718
; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0
719719
; BTVER2-NEXT: retq
720720
;
@@ -790,7 +790,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
790790
;
791791
; BTVER2-LABEL: v8f32_one_step:
792792
; BTVER2: # %bb.0:
793-
; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
793+
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
794794
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
795795
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
796796
; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
@@ -912,7 +912,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
912912
;
913913
; BTVER2-LABEL: v8f32_two_step:
914914
; BTVER2: # %bb.0:
915-
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
915+
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
916916
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
917917
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
918918
; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2
@@ -1017,7 +1017,7 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
10171017
;
10181018
; BTVER2-LABEL: v16f32_no_estimate:
10191019
; BTVER2: # %bb.0:
1020-
; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1020+
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
10211021
; BTVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0
10221022
; BTVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1
10231023
; BTVER2-NEXT: retq
@@ -1124,7 +1124,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
11241124
;
11251125
; BTVER2-LABEL: v16f32_one_step:
11261126
; BTVER2: # %bb.0:
1127-
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1127+
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
11281128
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
11291129
; BTVER2-NEXT: vrcpps %ymm1, %ymm4
11301130
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -1302,7 +1302,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
13021302
;
13031303
; BTVER2-LABEL: v16f32_two_step:
13041304
; BTVER2: # %bb.0:
1305-
; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1305+
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
13061306
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
13071307
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
13081308
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3

llvm/test/CodeGen/X86/recip-fastmath2.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -866,7 +866,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
866866
;
867867
; BTVER2-LABEL: v8f32_one_step_2_divs:
868868
; BTVER2: # %bb.0:
869-
; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
869+
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
870870
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
871871
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
872872
; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
@@ -1009,7 +1009,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
10091009
;
10101010
; BTVER2-LABEL: v8f32_two_step2:
10111011
; BTVER2: # %bb.0:
1012-
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1012+
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
10131013
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
10141014
; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
10151015
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
@@ -1374,7 +1374,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
13741374
;
13751375
; BTVER2-LABEL: v16f32_one_step_2_divs:
13761376
; BTVER2: # %bb.0:
1377-
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1377+
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
13781378
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
13791379
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
13801380
; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
@@ -1590,7 +1590,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
15901590
;
15911591
; BTVER2-LABEL: v16f32_two_step2:
15921592
; BTVER2: # %bb.0:
1593-
; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1593+
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
15941594
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
15951595
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
15961596
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3

0 commit comments

Comments
 (0)