@@ -2216,7 +2216,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
  assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
         "unreserved scratch RSRC register");

-  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2445,7 +2445,299 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
    MI->eraseFromParent();
    return true;
  }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
+    unsigned OtherOpIdx =
+        FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
+
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
+      // is not live, we could use a scalar add + vector add instead of 2
+      // vector adds.
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+          .add(*OtherOp)
+          .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+          .addReg(MaterializedReg, MaterializedRegFlags)
+          .add(*OtherOp);
+      }
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= 2; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }

+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
+  case AMDGPU::S_ADD_I32: {
+    // TODO: Handle s_or_b32, s_and_b32.
+    unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
+    MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
+
+    assert(FrameReg || MFI->isBottomOfStack());
+
+    MachineOperand &DstOp = MI->getOperand(0);
+    const DebugLoc &DL = MI->getDebugLoc();
+    Register MaterializedReg = FrameReg;
+
+    // Defend against live scc, which should never happen in practice.
+    bool DeadSCC = MI->getOperand(3).isDead();
+
+    Register TmpReg;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // FIXME: In the common case where the add does not also read its result
+      // (i.e. this isn't a reg += fi), it's not finding the dest reg as
+      // available.
+      TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, MI,
+                                             false, 0);
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
+          .addDef(TmpReg, RegState::Renamable)
+          .addReg(FrameReg)
+          .addImm(ST.getWavefrontSizeLog2())
+          .setOperandDead(3); // Set SCC dead
+      MaterializedReg = TmpReg;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp.isImm()) {
+      OtherOp.setImm(OtherOp.getImm() + Offset);
+      Offset = 0;
+
+      if (MaterializedReg)
+        FIOp->ChangeToRegister(MaterializedReg, false);
+      else
+        FIOp->ChangeToImmediate(0);
+    } else if (MaterializedReg) {
+      // If we can't fold the other operand, do another increment.
+      Register DstReg = DstOp.getReg();
+
+      if (!TmpReg && MaterializedReg == FrameReg) {
+        TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
+                                               MI, false, 0);
+        DstReg = TmpReg;
+      }
+
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
+                        .addDef(DstReg, RegState::Renamable)
+                        .addReg(MaterializedReg, RegState::Kill)
+                        .add(OtherOp);
+      if (DeadSCC)
+        AddI32.setOperandDead(3);
+
+      MaterializedReg = DstReg;
+
+      OtherOp.ChangeToRegister(MaterializedReg, false);
+      OtherOp.setIsKill(true);
+      OtherOp.setIsRenamable(true);
+      FIOp->ChangeToImmediate(Offset);
+    } else {
+      // If we don't have any other offset to apply, we can just directly
+      // interpret the frame index as the offset.
+      FIOp->ChangeToImmediate(Offset);
+    }
+
+    if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
+      assert(Offset == 0);
+      MI->removeOperand(3);
+      MI->removeOperand(OtherOpIdx);
+      MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
+      assert(Offset == 0);
+      MI->removeOperand(3);
+      MI->removeOperand(FIOperandNum);
+      MI->setDesc(
+          TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    }
+
+    assert(!FIOp->isFI());
+    return true;
+  }
  default: {
    // Other access to frame index
    const DebugLoc &DL = MI->getDebugLoc();
@@ -2459,7 +2751,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,

    // The offset is always swizzled, just replace it
    if (FrameReg)
-      FIOp.ChangeToRegister(FrameReg, false);
+      FIOp->ChangeToRegister(FrameReg, false);

    MachineOperand *OffsetOp =
        TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2512,18 +2804,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
    }

    if (!FrameReg) {
-      FIOp.ChangeToImmediate(Offset);
-      if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+      FIOp->ChangeToImmediate(Offset);
+      if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
        return false;
    }

    // We need to use register here. Check if we can use an SGPR or need
    // a VGPR.
-    FIOp.ChangeToRegister(AMDGPU::M0, false);
-    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+    FIOp->ChangeToRegister(AMDGPU::M0, false);
+    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);

    if (!Offset && FrameReg && UseSGPR) {
-      FIOp.setReg(FrameReg);
+      FIOp->setReg(FrameReg);
      return false;
    }
@@ -2532,8 +2824,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,

    Register TmpReg =
        RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
-    FIOp.setReg(TmpReg);
-    FIOp.setIsKill();
+    FIOp->setReg(TmpReg);
+    FIOp->setIsKill();

    if ((!FrameReg || !Offset) && TmpReg) {
      unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2562,8 +2854,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
      if (!TmpSReg) {
        // Use frame register and restore it after.
        TmpSReg = FrameReg;
-        FIOp.setReg(FrameReg);
-        FIOp.setIsKill(false);
+        FIOp->setReg(FrameReg);
+        FIOp->setIsKill(false);
      }

      if (NeedSaveSCC) {
@@ -2802,7 +3094,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
        MI->eraseFromParent();
        return true;
      }
-      FIOp.ChangeToRegister(ResultReg, false, false, true);
+      FIOp->ChangeToRegister(ResultReg, false, false, true);
      return false;
    }

@@ -2833,13 +3125,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
      // If the offset is simply too big, don't convert to a scratch wave offset
      // relative index.

-      FIOp.ChangeToImmediate(Offset);
-      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+      FIOp->ChangeToImmediate(Offset);
+      if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
        Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                        MI, false, 0);
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
            .addImm(Offset);
-        FIOp.ChangeToRegister(TmpReg, false, false, true);
+        FIOp->ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }