@@ -310,7 +310,14 @@ class WaitcntBrackets {
   bool counterOutOfOrder(InstCounterType T) const;
   void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
   void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
-  void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
+
+  void determineWait(InstCounterType T, RegInterval Interval,
+                     AMDGPU::Waitcnt &Wait) const;
+  void determineWait(InstCounterType T, int RegNo,
+                     AMDGPU::Waitcnt &Wait) const {
+    determineWait(T, {RegNo, RegNo + 1}, Wait);
+  }
+
   void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
   void applyWaitcnt(InstCounterType T, unsigned Count);
   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
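The hunk above keeps the old single-register entry point as a thin inline wrapper: a lone register is just the degenerate interval {RegNo, RegNo + 1}. A minimal standalone sketch of that forwarding pattern, assuming RegInterval is a std::pair<int, int>-style half-open [first, second) range (which is what the interval loops elsewhere in this diff imply); the Brackets struct and the plain int counter parameter are placeholders, not the pass's real types:

#include <utility>

using RegInterval = std::pair<int, int>; // assumed: half-open [first, second)

struct Brackets {
  // Interval form: callers hand over a whole register range at once.
  void determineWait(int CounterType, RegInterval Interval) const {
    for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
      // ... per-register score check would go here ...
      (void)CounterType;
      (void)RegNo;
    }
  }
  // Convenience form: a single register is the interval [RegNo, RegNo + 1).
  void determineWait(int CounterType, int RegNo) const {
    determineWait(CounterType, {RegNo, RegNo + 1});
  }
};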
@@ -345,16 +352,22 @@ class WaitcntBrackets {
     LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
   }
 
-  // Return true if there might be pending writes to the specified vgpr by VMEM
+  // Return true if there might be pending writes to the vgpr-interval by VMEM
   // instructions with types different from V.
-  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
-    assert(GprNo < NUM_ALL_VGPRS);
-    return VgprVmemTypes[GprNo] & ~(1 << V);
+  bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
+    for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+      assert(RegNo < NUM_ALL_VGPRS);
+      if (VgprVmemTypes[RegNo] & ~(1 << V))
+        return true;
+    }
+    return false;
   }
 
-  void clearVgprVmemTypes(int GprNo) {
-    assert(GprNo < NUM_ALL_VGPRS);
-    VgprVmemTypes[GprNo] = 0;
+  void clearVgprVmemTypes(RegInterval Interval) {
+    for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+      assert(RegNo < NUM_ALL_VGPRS);
+      VgprVmemTypes[RegNo] = 0;
+    }
   }
 
   void setStateOnFunctionEntryOrReturn() {
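For readers new to this pass, the membership test above relies on VgprVmemTypes keeping one small bitmask per VGPR, with one bit per VMEM type; "other pending types" simply means some bit other than 1 << V is still set. A self-contained sketch of that encoding; the enum values and array size below are illustrative stand-ins, not the pass's real constants:

#include <cassert>
#include <cstdint>

// Illustrative stand-ins for the pass's VmemType enum and VGPR slot count.
enum VmemType { VMEM_NOSAMPLER, VMEM_SAMPLER, VMEM_BVH, NUM_VMEM_TYPES };
constexpr int NUM_ALL_VGPRS = 1024;

static uint8_t VgprVmemTypes[NUM_ALL_VGPRS]; // one bitmask per VGPR slot

// Record an outstanding VMEM write of type V to register RegNo.
void notePendingVmemWrite(int RegNo, VmemType V) {
  assert(RegNo < NUM_ALL_VGPRS);
  VgprVmemTypes[RegNo] |= 1 << V;
}

// True if a write of some *other* VMEM type is still pending on RegNo,
// i.e. any bit besides (1 << V) is set in the mask.
bool hasOtherPendingVmemTypes(int RegNo, VmemType V) {
  assert(RegNo < NUM_ALL_VGPRS);
  return (VgprVmemTypes[RegNo] & ~(1 << V)) != 0;
}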
@@ -396,19 +409,16 @@ class WaitcntBrackets {
   }
 
   void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
-    if (GprNo < NUM_ALL_VGPRS) {
-      VgprUB = std::max(VgprUB, GprNo);
-      VgprScores[T][GprNo] = Val;
-    } else {
-      assert(T == SmemAccessCounter);
-      SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
-      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
-    }
+    setScoreByInterval({GprNo, GprNo + 1}, T, Val);
   }
 
-  void setExpScore(const MachineInstr *MI, const SIRegisterInfo *TRI,
-                   const MachineRegisterInfo *MRI, const MachineOperand &Op,
-                   unsigned Val);
+  void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
+                          unsigned Score);
+
+  void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
+                         const MachineRegisterInfo *MRI,
+                         const MachineOperand &Op, InstCounterType CntTy,
+                         unsigned Val);
 
   const GCNSubtarget *ST = nullptr;
   InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
@@ -772,17 +782,30 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
   return Result;
 }
 
-void WaitcntBrackets::setExpScore(const MachineInstr *MI,
-                                  const SIRegisterInfo *TRI,
-                                  const MachineRegisterInfo *MRI,
-                                  const MachineOperand &Op, unsigned Val) {
-  RegInterval Interval = getRegInterval(MI, MRI, TRI, Op);
-  assert(TRI->isVectorRegister(*MRI, Op.getReg()));
+void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
+                                         InstCounterType CntTy,
+                                         unsigned Score) {
   for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-    setRegScore(RegNo, EXP_CNT, Val);
+    if (RegNo < NUM_ALL_VGPRS) {
+      VgprUB = std::max(VgprUB, RegNo);
+      VgprScores[CntTy][RegNo] = Score;
+    } else {
+      assert(CntTy == SmemAccessCounter);
+      SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
+      SgprScores[RegNo - NUM_ALL_VGPRS] = Score;
+    }
   }
 }
 
+void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
+                                        const SIRegisterInfo *TRI,
+                                        const MachineRegisterInfo *MRI,
+                                        const MachineOperand &Op,
+                                        InstCounterType CntTy, unsigned Score) {
+  RegInterval Interval = getRegInterval(MI, MRI, TRI, Op);
+  setScoreByInterval(Interval, CntTy, Score);
+}
+
 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                     const SIRegisterInfo *TRI,
                                     const MachineRegisterInfo *MRI,
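setScoreByInterval above makes the score bookkeeping's flat index space explicit: indices below NUM_ALL_VGPRS land in the per-counter VgprScores table, while anything at or above it is an SGPR slot stored at RegNo - NUM_ALL_VGPRS (legal only for the SMEM access counter, as the assert enforces). A small sketch of just that index split, with illustrative sizes rather than the file's real constants:

#include <algorithm>
#include <cassert>

// Illustrative sizes only; the real constants live in SIInsertWaitcnts.cpp.
constexpr int NUM_ALL_VGPRS = 512;
constexpr int NUM_SGPR_SLOTS = 256;

struct ScoreTable {
  unsigned VgprScores[NUM_ALL_VGPRS] = {};
  unsigned SgprScores[NUM_SGPR_SLOTS] = {};
  int VgprUB = -1, SgprUB = -1; // highest slot written so far

  void set(int RegNo, unsigned Score) {
    if (RegNo < NUM_ALL_VGPRS) {
      VgprUB = std::max(VgprUB, RegNo);
      VgprScores[RegNo] = Score;
    } else {
      // SGPR slots follow the VGPR block and are rebased to zero.
      int SgprNo = RegNo - NUM_ALL_VGPRS;
      assert(SgprNo < NUM_SGPR_SLOTS);
      SgprUB = std::max(SgprUB, SgprNo);
      SgprScores[SgprNo] = Score;
    }
  }
};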
@@ -806,57 +829,61 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
       // All GDS operations must protect their address register (same as
       // export.)
       if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
-        setExpScore(&Inst, TRI, MRI, *AddrOp, CurrScore);
+        setScoreByOperand(&Inst, TRI, MRI, *AddrOp, EXP_CNT, CurrScore);
 
       if (Inst.mayStore()) {
         if (const auto *Data0 =
                 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
-          setExpScore(&Inst, TRI, MRI, *Data0, CurrScore);
+          setScoreByOperand(&Inst, TRI, MRI, *Data0, EXP_CNT, CurrScore);
         if (const auto *Data1 =
                 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
-          setExpScore(&Inst, TRI, MRI, *Data1, CurrScore);
+          setScoreByOperand(&Inst, TRI, MRI, *Data1, EXP_CNT, CurrScore);
       } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
                  Inst.getOpcode() != AMDGPU::DS_APPEND &&
                  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
         for (const MachineOperand &Op : Inst.all_uses()) {
           if (TRI->isVectorRegister(*MRI, Op.getReg()))
-            setExpScore(&Inst, TRI, MRI, Op, CurrScore);
+            setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
         }
       }
     } else if (TII->isFLAT(Inst)) {
       if (Inst.mayStore()) {
-        setExpScore(&Inst, TRI, MRI,
-                    *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
-                    CurrScore);
+        setScoreByOperand(&Inst, TRI, MRI,
+                          *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
+                          EXP_CNT, CurrScore);
       } else if (SIInstrInfo::isAtomicRet(Inst)) {
-        setExpScore(&Inst, TRI, MRI,
-                    *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
-                    CurrScore);
+        setScoreByOperand(&Inst, TRI, MRI,
+                          *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
+                          EXP_CNT, CurrScore);
       }
     } else if (TII->isMIMG(Inst)) {
       if (Inst.mayStore()) {
-        setExpScore(&Inst, TRI, MRI, Inst.getOperand(0), CurrScore);
+        setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
+                          CurrScore);
       } else if (SIInstrInfo::isAtomicRet(Inst)) {
-        setExpScore(&Inst, TRI, MRI,
-                    *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
-                    CurrScore);
+        setScoreByOperand(&Inst, TRI, MRI,
+                          *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
+                          EXP_CNT, CurrScore);
       }
     } else if (TII->isMTBUF(Inst)) {
       if (Inst.mayStore())
-        setExpScore(&Inst, TRI, MRI, Inst.getOperand(0), CurrScore);
+        setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
+                          CurrScore);
     } else if (TII->isMUBUF(Inst)) {
       if (Inst.mayStore()) {
-        setExpScore(&Inst, TRI, MRI, Inst.getOperand(0), CurrScore);
+        setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
+                          CurrScore);
       } else if (SIInstrInfo::isAtomicRet(Inst)) {
-        setExpScore(&Inst, TRI, MRI,
-                    *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
-                    CurrScore);
+        setScoreByOperand(&Inst, TRI, MRI,
+                          *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
+                          EXP_CNT, CurrScore);
       }
     } else if (TII->isLDSDIR(Inst)) {
       // LDSDIR instructions attach the score to the destination.
-      setExpScore(&Inst, TRI, MRI,
-                  *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst), CurrScore);
+      setScoreByOperand(&Inst, TRI, MRI,
+                        *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
+                        EXP_CNT, CurrScore);
     } else {
       if (TII->isEXP(Inst)) {
         // For export the destination registers are really temps that
@@ -865,15 +892,13 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
         // score.
         for (MachineOperand &DefMO : Inst.all_defs()) {
           if (TRI->isVGPR(*MRI, DefMO.getReg())) {
-            setRegScore(
-                TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
-                EXP_CNT, CurrScore);
+            setScoreByOperand(&Inst, TRI, MRI, DefMO, EXP_CNT, CurrScore);
           }
         }
       }
       for (const MachineOperand &Op : Inst.all_uses()) {
         if (TRI->isVectorRegister(*MRI, Op.getReg()))
-          setExpScore(&Inst, TRI, MRI, Op, CurrScore);
+          setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
       }
     }
   } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
@@ -901,9 +926,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
             VgprVmemTypes[RegNo] |= 1 << V;
         }
       }
-      for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-        setRegScore(RegNo, T, CurrScore);
-      }
+      setScoreByInterval(Interval, T, CurrScore);
     }
     if (Inst.mayStore() &&
         (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
@@ -1034,31 +1057,34 @@ void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
     Count = ~0u;
 }
 
-void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
+void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
                                     AMDGPU::Waitcnt &Wait) const {
-  unsigned ScoreToWait = getRegScore(RegNo, T);
-
-  // If the score of src_operand falls within the bracket, we need an
-  // s_waitcnt instruction.
   const unsigned LB = getScoreLB(T);
   const unsigned UB = getScoreUB(T);
-  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
-    if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
-        !ST->hasFlatLgkmVMemCountInOrder()) {
-      // If there is a pending FLAT operation, and this is a VMem or LGKM
-      // waitcnt and the target can report early completion, then we need
-      // to force a waitcnt 0.
-      addWait(Wait, T, 0);
-    } else if (counterOutOfOrder(T)) {
-      // Counter can get decremented out-of-order when there
-      // are multiple types event in the bracket. Also emit an s_wait counter
-      // with a conservative value of 0 for the counter.
-      addWait(Wait, T, 0);
-    } else {
-      // If a counter has been maxed out avoid overflow by waiting for
-      // MAX(CounterType) - 1 instead.
-      unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
-      addWait(Wait, T, NeededWait);
+  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+    unsigned ScoreToWait = getRegScore(RegNo, T);
+
+    // If the score of src_operand falls within the bracket, we need an
+    // s_waitcnt instruction.
+    if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
+      if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
+          !ST->hasFlatLgkmVMemCountInOrder()) {
+        // If there is a pending FLAT operation, and this is a VMem or LGKM
+        // waitcnt and the target can report early completion, then we need
+        // to force a waitcnt 0.
+        addWait(Wait, T, 0);
+      } else if (counterOutOfOrder(T)) {
+        // Counter can get decremented out-of-order when there
+        // are multiple types event in the bracket. Also emit an s_wait counter
+        // with a conservative value of 0 for the counter.
+        addWait(Wait, T, 0);
+      } else {
+        // If a counter has been maxed out avoid overflow by waiting for
+        // MAX(CounterType) - 1 instead.
+        unsigned NeededWait =
+            std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
+        addWait(Wait, T, NeededWait);
+      }
     }
   }
 }
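The refactored determineWait loops over the interval, but the per-register arithmetic is unchanged: a score inside the bracket (LB, UB] requires a wait of UB - ScoreToWait, clamped to getWaitCountMax(T) - 1 so the encoded field cannot overflow. The sketch below renders only that arithmetic, deliberately omitting the pending-FLAT and out-of-order cases that force a conservative wait of 0; the parameters stand in for the pass's real accessors:

#include <algorithm>

// Core wait computation only; LB, UB and WaitCountMax are illustrative
// inputs rather than the bracket's real accessors.
unsigned neededWait(unsigned LB, unsigned UB, unsigned ScoreToWait,
                    unsigned WaitCountMax) {
  const unsigned NoWait = ~0u; // sentinel: no wait required
  if (ScoreToWait <= LB || ScoreToWait > UB)
    return NoWait; // score is outside the bracket, already resolved
  // Waiting until only (UB - ScoreToWait) events remain is sufficient; clamp
  // so a saturated counter still encodes at most WaitCountMax - 1.
  return std::min(UB - ScoreToWait, WaitCountMax - 1);
}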
@@ -1670,18 +1696,16 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         RegInterval CallAddrOpInterval =
             ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOp);
 
-        for (int RegNo = CallAddrOpInterval.first;
-             RegNo < CallAddrOpInterval.second; ++RegNo)
-          ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
+        ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,
+                                    Wait);
 
         if (const auto *RtnAddrOp =
                 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
           RegInterval RtnAddrOpInterval =
               ScoreBrackets.getRegInterval(&MI, MRI, TRI, *RtnAddrOp);
 
-          for (int RegNo = RtnAddrOpInterval.first;
-               RegNo < RtnAddrOpInterval.second; ++RegNo)
-            ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
+          ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,
+                                      Wait);
         }
       }
     } else {
@@ -1750,36 +1774,34 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, Op);
 
       const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
-      for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-        if (IsVGPR) {
-          // Implicit VGPR defs and uses are never a part of the memory
-          // instructions description and usually present to account for
-          // super-register liveness.
-          // TODO: Most of the other instructions also have implicit uses
-          // for the liveness accounting only.
-          if (Op.isImplicit() && MI.mayLoadOrStore())
-            continue;
-
-          // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
-          // previous write and this write are the same type of VMEM
-          // instruction, in which case they are (in some architectures)
-          // guaranteed to write their results in order anyway.
-          if (Op.isUse() || !updateVMCntOnly(MI) ||
-              ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
-                                                     getVmemType(MI)) ||
-              !ST->hasVmemWriteVgprInOrder()) {
-            ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
-            ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
-            ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
-            ScoreBrackets.clearVgprVmemTypes(RegNo);
-          }
-          if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
-            ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
-          }
-          ScoreBrackets.determineWait(DS_CNT, RegNo, Wait);
-        } else {
-          ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
+      if (IsVGPR) {
+        // Implicit VGPR defs and uses are never a part of the memory
+        // instructions description and usually present to account for
+        // super-register liveness.
+        // TODO: Most of the other instructions also have implicit uses
+        // for the liveness accounting only.
+        if (Op.isImplicit() && MI.mayLoadOrStore())
+          continue;
+
+        // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
+        // previous write and this write are the same type of VMEM
+        // instruction, in which case they are (in some architectures)
+        // guaranteed to write their results in order anyway.
+        if (Op.isUse() || !updateVMCntOnly(MI) ||
+            ScoreBrackets.hasOtherPendingVmemTypes(Interval,
+                                                   getVmemType(MI)) ||
+            !ST->hasVmemWriteVgprInOrder()) {
+          ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait);
+          ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait);
+          ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
+          ScoreBrackets.clearVgprVmemTypes(Interval);
+        }
+        if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
+          ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
         }
+        ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
+      } else {
+        ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
       }
     }
   }
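The comment carried over in this last hunk states the ordering rule the interval checks implement: a read-after-write always needs a wait, while a write-after-write can be skipped only when this instruction and every pending write to the interval are the same VMEM type and the target retires such VGPR writes in order. A compact, hedged restatement of that predicate; the boolean parameters are stand-ins for the checks made in generateWaitcntInstBefore:

// Returns true if a VMEM-counter wait must be emitted for this operand.
// Only the shape of the decision is reproduced here.
bool needsVmemWait(bool OpIsUse, bool UpdatesVmCntOnly,
                   bool OtherVmemTypePending, bool VmemWritesInOrder) {
  if (OpIsUse)
    return true; // RAW: reading a still-pending result always waits.
  // WAW: skippable only for a same-type VMEM write on an in-order target.
  return !UpdatesVmCntOnly || OtherVmemTypePending || !VmemWritesInOrder;
}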