
Commit 46b86f9

[VP] IR expansion for bitreverse/bswap
1 parent 32f7197
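In effect, the ExpandVectorPredication pass learns to lower llvm.vp.bswap and llvm.vp.bitreverse to their unpredicated counterparts. Dropping the mask and EVL operands is sound here because disabled lanes of a VP intrinsic's result are unspecified per the LangRef, and these bit/byte permutations have no side effects. A minimal before/after sketch in LLVM IR (operand names match the test added below):

    ; Before expansion: vector-predicated byte swap.
    %v = call <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)

    ; After expansion: the plain intrinsic over the whole vector;
    ; %m and %evl are simply discarded.
    %v = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %va)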

File tree: 2 files changed, +116 -1 lines

llvm/lib/CodeGen/ExpandVectorPredication.cpp (+12 -1)
@@ -299,6 +299,15 @@ Value *CachingVPExpander::expandPredicationToIntCall(
     replaceOperation(*NewOp, VPI);
     return NewOp;
   }
+  case Intrinsic::bswap:
+  case Intrinsic::bitreverse: {
+    Value *Op = VPI.getOperand(0);
+    Function *Fn = Intrinsic::getDeclaration(
+        VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()});
+    Value *NewOp = Builder.CreateCall(Fn, {Op}, VPI.getName());
+    replaceOperation(*NewOp, VPI);
+    return NewOp;
+  }
   }
   return nullptr;
 }
@@ -702,13 +711,15 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
   case Intrinsic::vp_fneg: {
     Value *NewNegOp = Builder.CreateFNeg(VPI.getOperand(0), VPI.getName());
     replaceOperation(*NewNegOp, VPI);
-    return NewNegOp;
+    return NewNegOp;
   }
   case Intrinsic::vp_abs:
   case Intrinsic::vp_smax:
   case Intrinsic::vp_smin:
   case Intrinsic::vp_umax:
   case Intrinsic::vp_umin:
+  case Intrinsic::vp_bswap:
+  case Intrinsic::vp_bitreverse:
     return expandPredicationToIntCall(Builder, VPI,
                                       VPI.getFunctionalIntrinsicID().value());
   case Intrinsic::vp_fabs:

llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll (+104)
@@ -956,3 +956,107 @@ define void @vp_umin_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun
   ret void
 }
 declare <4 x i32> @llvm.vp.umin.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vp_bitreverse_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; SSE-LABEL: vp_bitreverse_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $4, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: psllw $4, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $2, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: psllw $2, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $1, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: paddb %xmm0, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vp_bitreverse_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vp_bitreverse_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vp_bitreverse_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: retq
+  %v = call <4 x i32> @llvm.vp.bitreverse.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+declare <4 x i32> @llvm.vp.bitreverse.v4i32(<4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vp_bswap_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; SSE-LABEL: vp_bswap_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vp_bswap_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX-NEXT: retq
+  %v = call <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
+  ret <4 x i32> %v
+}
+declare <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32>, <4 x i1>, i32)
