Skip to content

Commit e5296c5

Browse files
committed
[AMDGPU] Relax restrictions on unbreakable PHI users in BreakLargePHis
The previous heuristic rejected a PHI if one of its user was an unbreakable PHI, no matter what the other users were. This worked well in most cases, but there's one case in rocRAND where it doesn't work. In that case, a PHI node has 2 PHI users where one is breakable but not the other. When that PHI node isn't broken performance falls by 35%. Relaxing the restriction to "require that half of the PHI node users are breakable" fixes the issue, and seems like a sensible change. Solves SWDEV-409648, SWDEV-398393 Reviewed By: #amdgpu, arsenm Differential Revision: https://reviews.llvm.org/D155184
1 parent 3277aaa commit e5296c5

File tree

2 files changed

+86
-4
lines changed

2 files changed

+86
-4
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1549,13 +1549,18 @@ bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
15491549
// node as user, we don't want to break this PHI either because it's unlikely
15501550
// to be beneficial. We would just explode the vector and reassemble it
15511551
// directly, wasting instructions.
1552+
//
1553+
// In the case where multiple users are PHI nodes, we want at least half of
1554+
// them to be breakable.
1555+
int Score = 0;
15521556
for (const Value *U : I.users()) {
1553-
if (const auto *PU = dyn_cast<PHINode>(U)) {
1554-
if (!canBreakPHINode(*PU))
1555-
return false;
1556-
}
1557+
if (const auto *PU = dyn_cast<PHINode>(U))
1558+
Score += canBreakPHINode(*PU) ? 1 : -1;
15571559
}
15581560

1561+
if (Score < 0)
1562+
return false;
1563+
15591564
return BreakPhiNodesCache[&I] = true;
15601565
}
15611566

llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -680,6 +680,83 @@ end:
680680
ret void
681681
}
682682

683+
684+
define amdgpu_kernel void @used_by_unbreakable_and_breakable_phi(<5 x double> %in, ptr %out, i1 %cond, i1 %cond2) {
685+
; CHECK-LABEL: @used_by_unbreakable_and_breakable_phi(
686+
; CHECK-NEXT: entry:
687+
; CHECK-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
688+
; CHECK: then:
689+
; CHECK-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i64 3
690+
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X]], i64 0
691+
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 1
692+
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 2
693+
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE6:%.*]] = extractelement <5 x double> [[X]], i64 3
694+
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE8:%.*]] = extractelement <5 x double> [[X]], i64 4
695+
; CHECK-NEXT: br label [[FINALLY:%.*]]
696+
; CHECK: else:
697+
; CHECK-NEXT: br label [[FINALLY]]
698+
; CHECK: finally:
699+
; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ]
700+
; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ]
701+
; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ]
702+
; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE6]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ]
703+
; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE8]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ]
704+
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP0]], i64 0
705+
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1
706+
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2
707+
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE2]], double [[TMP3]], i64 3
708+
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE3]], double [[TMP4]], i64 4
709+
; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE4]], ptr [[OUT:%.*]], align 1
710+
; CHECK-NEXT: br i1 [[COND2:%.*]], label [[THEN1:%.*]], label [[END:%.*]]
711+
; CHECK: then1:
712+
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE01:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 0
713+
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE22:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 1
714+
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE43:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 2
715+
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE64:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 3
716+
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE85:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 4
717+
; CHECK-NEXT: br label [[END]]
718+
; CHECK: end:
719+
; CHECK-NEXT: [[ENDVAL:%.*]] = phi <5 x double> [ [[LARGEPHI_INSERTSLICE4]], [[THEN1]] ], [ [[IN]], [[FINALLY]] ]
720+
; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE01]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
721+
; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
722+
; CHECK-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE43]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
723+
; CHECK-NEXT: [[TMP8:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE64]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
724+
; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE85]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
725+
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE06:%.*]] = insertelement <5 x double> poison, double [[TMP5]], i64 0
726+
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE17:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE06]], double [[TMP6]], i64 1
727+
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE28:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE17]], double [[TMP7]], i64 2
728+
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE39:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE28]], double [[TMP8]], i64 3
729+
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE410:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE39]], double [[TMP9]], i64 4
730+
; CHECK-NEXT: store <5 x double> [[ENDVAL]], ptr [[OUT]], align 1
731+
; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE410]], ptr [[OUT]], align 1
732+
; CHECK-NEXT: ret void
733+
;
734+
entry:
735+
br i1 %cond, label %then, label %else
736+
737+
then:
738+
%x = insertelement <5 x double> %in, double 3.140000e+00, i64 3
739+
br label %finally
740+
741+
else:
742+
br label %finally
743+
744+
finally:
745+
%val = phi <5 x double> [ %x, %then ], [ zeroinitializer, %else ]
746+
store <5 x double> %val, ptr %out, align 1
747+
br i1 %cond2, label %then1, label %end
748+
749+
then1:
750+
br label %end
751+
752+
end:
753+
%endval = phi <5 x double> [ %val, %then1 ], [ %in, %finally ]
754+
%endval2 = phi <5 x double> [ %val, %then1 ], [ zeroinitializer, %finally ]
755+
store <5 x double> %endval, ptr %out, align 1
756+
store <5 x double> %endval2, ptr %out, align 1
757+
ret void
758+
}
759+
683760
; check for infinite recursion
684761
define amdgpu_kernel void @used_by_phi_self(<5 x double> %in, ptr %out, i8 %count) {
685762
; CHECK-LABEL: @used_by_phi_self(

0 commit comments

Comments
 (0)