@@ -3552,6 +3552,176 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3552
3552
return true ;
3553
3553
}
3554
3554
3555
+ // Match BITOP3 operation and return a number of matched instructions plus
3556
+ // truth table.
3557
+ static std::pair<unsigned , uint8_t > BitOp3_Op (SDValue In,
3558
+ SmallVectorImpl<SDValue> &Src) {
3559
+ unsigned NumOpcodes = 0 ;
3560
+ uint8_t LHSBits, RHSBits;
3561
+
3562
+ auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
3563
+ // Define truth table given Src0, Src1, Src2 bits permutations:
3564
+ // 0 0 0
3565
+ // 0 0 1
3566
+ // 0 1 0
3567
+ // 0 1 1
3568
+ // 1 0 0
3569
+ // 1 0 1
3570
+ // 1 1 0
3571
+ // 1 1 1
3572
+ const uint8_t SrcBits[3 ] = { 0xf0 , 0xcc , 0xaa };
3573
+
3574
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
3575
+ if (C->isAllOnes ()) {
3576
+ Bits = 0xff ;
3577
+ return true ;
3578
+ }
3579
+ if (C->isZero ()) {
3580
+ Bits = 0 ;
3581
+ return true ;
3582
+ }
3583
+ }
3584
+
3585
+ for (unsigned I = 0 ; I < Src.size (); ++I) {
3586
+ // Try to find existing reused operand
3587
+ if (Src[I] == Op) {
3588
+ Bits = SrcBits[I];
3589
+ return true ;
3590
+ }
3591
+ // Try to replace parent operator
3592
+ if (Src[I] == In) {
3593
+ Bits = SrcBits[I];
3594
+ Src[I] = Op;
3595
+ return true ;
3596
+ }
3597
+ }
3598
+
3599
+ if (Src.size () == 3 ) {
3600
+ // No room left for operands. Try one last time, there can be a 'not' of
3601
+ // one of our source operands. In this case we can compute the bits
3602
+ // without growing Src vector.
3603
+ if (Op.getOpcode () == ISD::XOR) {
3604
+ if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand (1 ))) {
3605
+ if (C->isAllOnes ()) {
3606
+ SDValue LHS = Op.getOperand (0 );
3607
+ for (unsigned I = 0 ; I < Src.size (); ++I) {
3608
+ if (Src[I] == LHS) {
3609
+ Bits = ~SrcBits[I];
3610
+ return true ;
3611
+ }
3612
+ }
3613
+ }
3614
+ }
3615
+ }
3616
+
3617
+ return false ;
3618
+ }
3619
+
3620
+ Bits = SrcBits[Src.size ()];
3621
+ Src.push_back (Op);
3622
+ return true ;
3623
+ };
3624
+
3625
+ switch (In.getOpcode ()) {
3626
+ case ISD::AND:
3627
+ case ISD::OR:
3628
+ case ISD::XOR: {
3629
+ SDValue LHS = In.getOperand (0 );
3630
+ SDValue RHS = In.getOperand (1 );
3631
+
3632
+ SmallVector<SDValue, 3 > Backup (Src.begin (), Src.end ());
3633
+ if (!getOperandBits (LHS, LHSBits) ||
3634
+ !getOperandBits (RHS, RHSBits)) {
3635
+ Src = Backup;
3636
+ return std::make_pair (0 , 0 );
3637
+ }
3638
+
3639
+ // Recursion is naturally limited by the size of the operand vector.
3640
+ auto Op = BitOp3_Op (LHS, Src);
3641
+ if (Op.first ) {
3642
+ NumOpcodes += Op.first ;
3643
+ LHSBits = Op.second ;
3644
+ }
3645
+
3646
+ Op = BitOp3_Op (RHS, Src);
3647
+ if (Op.first ) {
3648
+ NumOpcodes += Op.first ;
3649
+ RHSBits = Op.second ;
3650
+ }
3651
+ break ;
3652
+ }
3653
+ default :
3654
+ return std::make_pair (0 , 0 );
3655
+ }
3656
+
3657
+ uint8_t TTbl;
3658
+ switch (In.getOpcode ()) {
3659
+ case ISD::AND:
3660
+ TTbl = LHSBits & RHSBits;
3661
+ break ;
3662
+ case ISD::OR:
3663
+ TTbl = LHSBits | RHSBits;
3664
+ break ;
3665
+ case ISD::XOR:
3666
+ TTbl = LHSBits ^ RHSBits;
3667
+ break ;
3668
+ default :
3669
+ break ;
3670
+ }
3671
+
3672
+ return std::make_pair (NumOpcodes + 1 , TTbl);
3673
+ }
3674
+
3675
+ bool AMDGPUDAGToDAGISel::SelectBITOP3 (SDValue In, SDValue &Src0, SDValue &Src1,
3676
+ SDValue &Src2, SDValue &Tbl) const {
3677
+ SmallVector<SDValue, 3 > Src;
3678
+ uint8_t TTbl;
3679
+ unsigned NumOpcodes;
3680
+
3681
+ std::tie (NumOpcodes, TTbl) = BitOp3_Op (In, Src);
3682
+
3683
+ // Src.empty() case can happen if all operands are all zero or all ones.
3684
+ // Normally it shall be optimized out before reaching this.
3685
+ if (NumOpcodes < 2 || Src.empty ())
3686
+ return false ;
3687
+
3688
+ // For a uniform case threshold should be higher to account for moves between
3689
+ // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
3690
+ // and a readtfirstlane after.
3691
+ if (NumOpcodes < 4 && !In->isDivergent ())
3692
+ return false ;
3693
+
3694
+ if (NumOpcodes == 2 && In.getValueType () == MVT::i32) {
3695
+ // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3696
+ // asm more readable. This cannot be modeled with AddedComplexity because
3697
+ // selector does not know how many operations did we match.
3698
+ if ((In.getOpcode () == ISD::XOR || In.getOpcode () == ISD::OR) &&
3699
+ (In.getOperand (0 ).getOpcode () == In.getOpcode () ||
3700
+ In.getOperand (1 ).getOpcode () == In.getOpcode ()))
3701
+ return false ;
3702
+
3703
+ if (In.getOpcode () == ISD::OR &&
3704
+ (In.getOperand (0 ).getOpcode () == ISD::AND ||
3705
+ In.getOperand (1 ).getOpcode () == ISD::AND))
3706
+ return false ;
3707
+ }
3708
+
3709
+ // Last operand can be ignored, turning a ternary operation into a binary.
3710
+ // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3711
+ // 'c' with 'a' here without changing the answer. In some pathological
3712
+ // cases it should be possible to get an operation with a single operand
3713
+ // too if optimizer would not catch it.
3714
+ while (Src.size () < 3 )
3715
+ Src.push_back (Src[0 ]);
3716
+
3717
+ Src0 = Src[0 ];
3718
+ Src1 = Src[1 ];
3719
+ Src2 = Src[2 ];
3720
+
3721
+ Tbl = CurDAG->getTargetConstant (TTbl, SDLoc (In), MVT::i32);
3722
+ return true ;
3723
+ }
3724
+
3555
3725
SDValue AMDGPUDAGToDAGISel::getHi16Elt (SDValue In) const {
3556
3726
if (In.isUndef ())
3557
3727
return CurDAG->getUNDEF (MVT::i32);
0 commit comments