@@ -762,7 +762,64 @@ for.exit:
ret void
}
+ ;; Make sure the histogram intrinsic uses the active lane mask when tail folding.
+ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
+ ; CHECK-LABEL: define void @simple_histogram_tailfold(
+ ; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+ ; CHECK-NEXT: entry:
+ ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+ ; CHECK: vector.ph:
+ ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+ ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+ ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+ ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
+ ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP5]])
+ ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+ ; CHECK: vector.body:
+ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+ ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+ ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
+ ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+ ; CHECK-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+ ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
+ ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+ ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+ ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
+ ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+ ; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP14:![0-9]+]]
+ ; CHECK: middle.block:
+ ; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+ ; CHECK: scalar.ph:
+ ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+ ; CHECK: for.body:
+ ; CHECK-NEXT: br i1 poison, label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+ ; CHECK: for.exit:
+ ; CHECK-NEXT: ret void
+ ;
+ entry:
+ br label %for.body
+
+ for.body:
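+ ; Each iteration loads an index and does buckets[indices[iv]] += 1.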
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %indices, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %idxprom1 = zext i32 %0 to i64
+ %arrayidx2 = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
+ %1 = load i32, ptr %arrayidx2, align 4
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, ptr %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %N
+ br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !2
+
+ for.exit:
+ ret void
+ }
+
attributes #0 = { "target-features"="+sve2" vscale_range(1,16) }
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.interleave.count", i32 2}
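+ ; Loop metadata for the new test: !3 enables predicated vectorization, so the scalar tail is folded into the vector loop.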
+ !2 = distinct !{!2, !3}
+ !3 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}