@@ -3549,6 +3549,176 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3549
3549
return true ;
3550
3550
}
3551
3551
3552
+ // Match BITOP3 operation and return a number of matched instructions plus
3553
+ // truth table.
3554
+ static std::pair<unsigned , uint8_t > BitOp3_Op (SDValue In,
3555
+ SmallVectorImpl<SDValue> &Src) {
3556
+ unsigned NumOpcodes = 0 ;
3557
+ uint8_t LHSBits, RHSBits;
3558
+
3559
+ auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
3560
+ // Define truth table given Src0, Src1, Src2 bits permutations:
3561
+ // 0 0 0
3562
+ // 0 0 1
3563
+ // 0 1 0
3564
+ // 0 1 1
3565
+ // 1 0 0
3566
+ // 1 0 1
3567
+ // 1 1 0
3568
+ // 1 1 1
3569
+ const uint8_t SrcBits[3 ] = { 0xf0 , 0xcc , 0xaa };
3570
+
3571
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
3572
+ if (C->isAllOnes ()) {
3573
+ Bits = 0xff ;
3574
+ return true ;
3575
+ }
3576
+ if (C->isZero ()) {
3577
+ Bits = 0 ;
3578
+ return true ;
3579
+ }
3580
+ }
3581
+
3582
+ for (unsigned I = 0 ; I < Src.size (); ++I) {
3583
+ // Try to find existing reused operand
3584
+ if (Src[I] == Op) {
3585
+ Bits = SrcBits[I];
3586
+ return true ;
3587
+ }
3588
+ // Try to replace parent operator
3589
+ if (Src[I] == In) {
3590
+ Bits = SrcBits[I];
3591
+ Src[I] = Op;
3592
+ return true ;
3593
+ }
3594
+ }
3595
+
3596
+ if (Src.size () == 3 ) {
3597
+ // No room left for operands. Try one last time, there can be a 'not' of
3598
+ // one of our source operands. In this case we can compute the bits
3599
+ // without growing Src vector.
3600
+ if (Op.getOpcode () == ISD::XOR) {
3601
+ if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand (1 ))) {
3602
+ if (C->isAllOnes ()) {
3603
+ SDValue LHS = Op.getOperand (0 );
3604
+ for (unsigned I = 0 ; I < Src.size (); ++I) {
3605
+ if (Src[I] == LHS) {
3606
+ Bits = ~SrcBits[I];
3607
+ return true ;
3608
+ }
3609
+ }
3610
+ }
3611
+ }
3612
+ }
3613
+
3614
+ return false ;
3615
+ }
3616
+
3617
+ Bits = SrcBits[Src.size ()];
3618
+ Src.push_back (Op);
3619
+ return true ;
3620
+ };
3621
+
3622
+ switch (In.getOpcode ()) {
3623
+ case ISD::AND:
3624
+ case ISD::OR:
3625
+ case ISD::XOR: {
3626
+ SDValue LHS = In.getOperand (0 );
3627
+ SDValue RHS = In.getOperand (1 );
3628
+
3629
+ SmallVector<SDValue, 3 > Backup (Src.begin (), Src.end ());
3630
+ if (!getOperandBits (LHS, LHSBits) ||
3631
+ !getOperandBits (RHS, RHSBits)) {
3632
+ Src = Backup;
3633
+ return std::make_pair (0 , 0 );
3634
+ }
3635
+
3636
+ // Recursion is naturally limited by the size of the operand vector.
3637
+ auto Op = BitOp3_Op (LHS, Src);
3638
+ if (Op.first ) {
3639
+ NumOpcodes += Op.first ;
3640
+ LHSBits = Op.second ;
3641
+ }
3642
+
3643
+ Op = BitOp3_Op (RHS, Src);
3644
+ if (Op.first ) {
3645
+ NumOpcodes += Op.first ;
3646
+ RHSBits = Op.second ;
3647
+ }
3648
+ break ;
3649
+ }
3650
+ default :
3651
+ return std::make_pair (0 , 0 );
3652
+ }
3653
+
3654
+ uint8_t TTbl;
3655
+ switch (In.getOpcode ()) {
3656
+ case ISD::AND:
3657
+ TTbl = LHSBits & RHSBits;
3658
+ break ;
3659
+ case ISD::OR:
3660
+ TTbl = LHSBits | RHSBits;
3661
+ break ;
3662
+ case ISD::XOR:
3663
+ TTbl = LHSBits ^ RHSBits;
3664
+ break ;
3665
+ default :
3666
+ break ;
3667
+ }
3668
+
3669
+ return std::make_pair (NumOpcodes + 1 , TTbl);
3670
+ }
3671
+
3672
+ bool AMDGPUDAGToDAGISel::SelectBITOP3 (SDValue In, SDValue &Src0, SDValue &Src1,
3673
+ SDValue &Src2, SDValue &Tbl) const {
3674
+ SmallVector<SDValue, 3 > Src;
3675
+ uint8_t TTbl;
3676
+ unsigned NumOpcodes;
3677
+
3678
+ std::tie (NumOpcodes, TTbl) = BitOp3_Op (In, Src);
3679
+
3680
+ // Src.empty() case can happen if all operands are all zero or all ones.
3681
+ // Normally it shall be optimized out before reaching this.
3682
+ if (NumOpcodes < 2 || Src.empty ())
3683
+ return false ;
3684
+
3685
+ // For a uniform case threshold should be higher to account for moves between
3686
+ // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
3687
+ // and a readtfirstlane after.
3688
+ if (NumOpcodes < 4 && !In->isDivergent ())
3689
+ return false ;
3690
+
3691
+ if (NumOpcodes == 2 && In.getValueType () == MVT::i32) {
3692
+ // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3693
+ // asm more readable. This cannot be modeled with AddedComplexity because
3694
+ // selector does not know how many operations did we match.
3695
+ if ((In.getOpcode () == ISD::XOR || In.getOpcode () == ISD::OR) &&
3696
+ (In.getOperand (0 ).getOpcode () == In.getOpcode () ||
3697
+ In.getOperand (1 ).getOpcode () == In.getOpcode ()))
3698
+ return false ;
3699
+
3700
+ if (In.getOpcode () == ISD::OR &&
3701
+ (In.getOperand (0 ).getOpcode () == ISD::AND ||
3702
+ In.getOperand (1 ).getOpcode () == ISD::AND))
3703
+ return false ;
3704
+ }
3705
+
3706
+ // Last operand can be ignored, turning a ternary operation into a binary.
3707
+ // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3708
+ // 'c' with 'a' here without changing the answer. In some pathological
3709
+ // cases it should be possible to get an operation with a single operand
3710
+ // too if optimizer would not catch it.
3711
+ while (Src.size () < 3 )
3712
+ Src.push_back (Src[0 ]);
3713
+
3714
+ Src0 = Src[0 ];
3715
+ Src1 = Src[1 ];
3716
+ Src2 = Src[2 ];
3717
+
3718
+ Tbl = CurDAG->getTargetConstant (TTbl, SDLoc (In), MVT::i32);
3719
+ return true ;
3720
+ }
3721
+
3552
3722
SDValue AMDGPUDAGToDAGISel::getHi16Elt (SDValue In) const {
3553
3723
if (In.isUndef ())
3554
3724
return CurDAG->getUNDEF (MVT::i32);
0 commit comments