@@ -635,6 +635,56 @@ while.end71: ; preds = %while.body38, %whil
635
635
ret void
636
636
}
637
637
638
+ ; FIXME: This should not be vectorizing (further) with expensive shuffles.
639
+ ; The old cost of the or+extract should be 2*1 (or) + 4*2 (extract). The new
640
+ ; cost should be 1*1 (or) + 2*2 (extract) + at least 4 (shuffles).
641
+ define i1 @tryMapToRange (ptr %values , ptr %result , <2 x i64 > %hi , <2 x i64 > %lo ) { ; returns true iff no lane of the loaded vector is > %hi or < %lo (signed); always stores (lane - %lo + 1) to %result
642
+ ; CHECK-LABEL: @tryMapToRange(
643
+ ; CHECK-NEXT: [[L:%.*]] = load <2 x i64>, ptr [[VALUES:%.*]], align 8
644
+ ; CHECK-NEXT: [[C1:%.*]] = icmp sgt <2 x i64> [[L]], [[HI:%.*]]
645
+ ; CHECK-NEXT: [[S1:%.*]] = sext <2 x i1> [[C1]] to <2 x i64>
646
+ ; CHECK-NEXT: [[BC1:%.*]] = bitcast <2 x i64> [[S1]] to <16 x i8>
647
+ ; CHECK-NEXT: [[A1:%.*]] = and <16 x i8> [[BC1]], <i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
648
+ ; CHECK-NEXT: [[C2:%.*]] = icmp slt <2 x i64> [[L]], [[LO:%.*]]
649
+ ; CHECK-NEXT: [[S2:%.*]] = sext <2 x i1> [[C2]] to <2 x i64>
650
+ ; CHECK-NEXT: [[BC2:%.*]] = bitcast <2 x i64> [[S2]] to <16 x i8>
651
+ ; CHECK-NEXT: [[A2:%.*]] = and <16 x i8> [[BC2]], <i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
652
+ ; CHECK-NEXT: [[REASS_SUB:%.*]] = sub <2 x i64> [[L]], [[LO]]
653
+ ; CHECK-NEXT: [[ADD_I_I_I_I_I_I:%.*]] = add <2 x i64> [[REASS_SUB]], splat (i64 1)
654
+ ; CHECK-NEXT: store <2 x i64> [[ADD_I_I_I_I_I_I]], ptr [[RESULT:%.*]], align 8
655
+ ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> <i32 8, i32 24>
656
+ ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> <i32 0, i32 16>
657
+ ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i8> [[TMP1]], [[TMP2]]
658
+ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0
659
+ ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1
660
+ ; CHECK-NEXT: [[O3:%.*]] = or i8 [[TMP4]], [[TMP5]]
661
+ ; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[O3]], 0
662
+ ; CHECK-NEXT: ret i1 [[C]]
663
+ ;
664
+ %l = load <2 x i64 >, ptr %values , align 8 ; both input lanes, loaded as one vector
665
+ %c1 = icmp sgt <2 x i64 > %l , %hi ; per-lane: value above the upper bound (signed)?
666
+ %s1 = sext <2 x i1 > %c1 to <2 x i64 > ; widen each i1 to an all-ones / all-zeros i64 lane
667
+ %bc1 = bitcast <2 x i64 > %s1 to <16 x i8 > ; reinterpret the two i64 lanes as sixteen bytes
668
+ %a1 = and <16 x i8 > %bc1 , <i8 1 , i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1 , i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison> ; only mask elements 0 and 8 are defined (1); those are the only elements read below
669
+ %e1 = extractelement <16 x i8 > %a1 , i64 0 ; 0/1 flag taken from byte element 0
670
+ %e2 = extractelement <16 x i8 > %a1 , i64 8 ; 0/1 flag taken from byte element 8
671
+ %c2 = icmp slt <2 x i64 > %l , %lo ; per-lane: value below the lower bound (signed)?
672
+ %s2 = sext <2 x i1 > %c2 to <2 x i64 > ; same widening as %s1, for the lower-bound compare
673
+ %bc2 = bitcast <2 x i64 > %s2 to <16 x i8 > ; same byte reinterpretation as %bc1
674
+ %a2 = and <16 x i8 > %bc2 , <i8 1 , i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1 , i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison> ; same mask shape as %a1: elements 0 and 8 only
675
+ %e3 = extractelement <16 x i8 > %a2 , i64 0 ; 0/1 flag taken from byte element 0
676
+ %e4 = extractelement <16 x i8 > %a2 , i64 8 ; 0/1 flag taken from byte element 8
677
+ %reass.sub = sub <2 x i64 > %l , %lo ; offset of each lane from the lower bound
678
+ %add.i.i.i.i.i.i = add <2 x i64 > %reass.sub , splat (i64 1 ) ; plus one per lane
679
+ store <2 x i64 > %add.i.i.i.i.i.i , ptr %result , align 8 ; written unconditionally, independent of the range-check result
680
+ %o1 = or i8 %e2 , %e1 ; any lane above %hi?
681
+ %o2 = or i8 %e4 , %e3 ; any lane below %lo?
682
+ %o3 = or i8 %o1 , %o2 ; any lane out of range at all? (%o1/%o2 are the pair the expected output combines into one <2 x i8> or)
683
+ %c = icmp eq i8 %o3 , 0 ; true when every flag is zero
684
+ ret i1 %c
685
+ }
686
+
687
+
638
688
declare <16 x i8 > @llvm.ctpop.v16i8 (<16 x i8 >) #1
639
689
declare <8 x i16 > @llvm.aarch64.neon.uaddlp.v8i16.v16i8 (<16 x i8 >) #2
640
690
declare <4 x i32 > @llvm.aarch64.neon.uaddlp.v4i32.v8i16 (<8 x i16 >) #2
0 commit comments