@@ -681,3 +681,127 @@ define <1 x i32> @test_builtin_column_major_load_multiuse(ptr %src, <4 x i32> %b
681
681
}
682
682
683
683
declare void @use.v4i32 (<4 x i32 >)
684
+
685
+ ; Lowering test: a column-major matrix load with 1 row and 4 columns whose
+ ; stride is the runtime value %stride, multiplied by the 4x1 operand %a to
+ ; give a single-element result.
+ ; The expectations below require four separate <1 x i32> column loads from
+ ; %src at element offsets 0*stride, 1*stride, 2*stride and 3*stride, followed
+ ; by a per-column mul/add chain that accumulates the one result element.
+ define <1 x i32 > @test_builtin_column_major_variable_stride (ptr %src , <4 x i32 > %a , i64 %stride ) {
686
+ ; CHECK-LABEL: @test_builtin_column_major_variable_stride(
687
+ ; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
688
+ ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[VEC_START]]
689
+ ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x i32>, ptr [[VEC_GEP]], align 4
690
+ ; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]]
691
+ ; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[VEC_START1]]
692
+ ; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x i32>, ptr [[VEC_GEP2]], align 4
693
+ ; CHECK-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE]]
694
+ ; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[VEC_START4]]
695
+ ; CHECK-NEXT: [[COL_LOAD6:%.*]] = load <1 x i32>, ptr [[VEC_GEP5]], align 4
696
+ ; CHECK-NEXT: [[VEC_START7:%.*]] = mul i64 3, [[STRIDE]]
697
+ ; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[VEC_START7]]
698
+ ; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x i32>, ptr [[VEC_GEP8]], align 4
699
+ ; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
700
+ ; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <1 x i32> [[COL_LOAD]], <1 x i32> poison, <1 x i32> zeroinitializer
701
+ ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 0
702
+ ; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> poison, i32 [[TMP1]], i64 0
703
+ ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> poison, <1 x i32> zeroinitializer
704
+ ; CHECK-NEXT: [[TMP2:%.*]] = mul <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]]
705
+ ; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <1 x i32> [[COL_LOAD3]], <1 x i32> poison, <1 x i32> zeroinitializer
706
+ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 1
707
+ ; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x i32> poison, i32 [[TMP3]], i64 0
708
+ ; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> poison, <1 x i32> zeroinitializer
709
+ ; CHECK-NEXT: [[TMP4:%.*]] = mul <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]]
710
+ ; CHECK-NEXT: [[TMP5:%.*]] = add <1 x i32> [[TMP2]], [[TMP4]]
711
+ ; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <1 x i32> [[COL_LOAD6]], <1 x i32> poison, <1 x i32> zeroinitializer
712
+ ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 2
713
+ ; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> poison, i32 [[TMP6]], i64 0
714
+ ; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> poison, <1 x i32> zeroinitializer
715
+ ; CHECK-NEXT: [[TMP7:%.*]] = mul <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]]
716
+ ; CHECK-NEXT: [[TMP8:%.*]] = add <1 x i32> [[TMP5]], [[TMP7]]
717
+ ; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <1 x i32> [[COL_LOAD9]], <1 x i32> poison, <1 x i32> zeroinitializer
718
+ ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 3
719
+ ; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> poison, i32 [[TMP9]], i64 0
720
+ ; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> poison, <1 x i32> zeroinitializer
721
+ ; CHECK-NEXT: [[TMP10:%.*]] = mul <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]]
722
+ ; CHECK-NEXT: [[TMP11:%.*]] = add <1 x i32> [[TMP8]], [[TMP10]]
723
+ ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> poison, <1 x i32> zeroinitializer
724
+ ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <1 x i32> undef, <1 x i32> [[TMP12]], <1 x i32> <i32 1>
725
+ ; CHECK-NEXT: ret <1 x i32> [[TMP13]]
726
+ ;
727
+ ; Input IR: non-volatile load (i1 false) with dimension arguments 1 and 4;
+ ; the load result is used only by the multiply below.
+ %l = call <4 x i32 > @llvm.matrix.column.major.load.v4i32.i64 (ptr %src , i64 %stride , i1 false , i32 1 , i32 4 )
728
+ %r = call <1 x i32 > @llvm.matrix.multiply.v1i32.v4i32.v4i32 (<4 x i32 > %l , <4 x i32 > %a , i32 1 , i32 4 , i32 1 )
729
+ ret <1 x i32 > %r
730
+ }
731
+
732
+ ; Lowering test: a column-major matrix load with 1 row and 5 columns and a
+ ; runtime stride, whose result %l has two users: a transpose that is passed
+ ; to @use.v5i32, and a multiply with the operand %a.
+ ; The expectations below require five <1 x i32> column loads at element
+ ; offsets 0*stride .. 4*stride, then a <5 x i32> built from element 0 of each
+ ; loaded column and passed to @use.v5i32, and finally a mul/add chain that
+ ; reads the same five column loads again rather than loading a second time.
+ define <1 x i32 > @test_builtin_column_major_variable_stride_multiuse (ptr %src , <5 x i32 > %a , i64 %stride ) {
733
+ ; CHECK-LABEL: @test_builtin_column_major_variable_stride_multiuse(
734
+ ; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
735
+ ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[VEC_START]]
736
+ ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x i32>, ptr [[VEC_GEP]], align 4
737
+ ; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]]
738
+ ; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[VEC_START1]]
739
+ ; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x i32>, ptr [[VEC_GEP2]], align 4
740
+ ; CHECK-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE]]
741
+ ; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[VEC_START4]]
742
+ ; CHECK-NEXT: [[COL_LOAD6:%.*]] = load <1 x i32>, ptr [[VEC_GEP5]], align 4
743
+ ; CHECK-NEXT: [[VEC_START7:%.*]] = mul i64 3, [[STRIDE]]
744
+ ; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[VEC_START7]]
745
+ ; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x i32>, ptr [[VEC_GEP8]], align 4
746
+ ; CHECK-NEXT: [[VEC_START10:%.*]] = mul i64 4, [[STRIDE]]
747
+ ; CHECK-NEXT: [[VEC_GEP11:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[VEC_START10]]
748
+ ; CHECK-NEXT: [[COL_LOAD12:%.*]] = load <1 x i32>, ptr [[VEC_GEP11]], align 4
749
+ ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i32> [[COL_LOAD]], i64 0
750
+ ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x i32> poison, i32 [[TMP1]], i64 0
751
+ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <1 x i32> [[COL_LOAD3]], i64 0
752
+ ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x i32> [[TMP2]], i32 [[TMP3]], i64 1
753
+ ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i32> [[COL_LOAD6]], i64 0
754
+ ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x i32> [[TMP4]], i32 [[TMP5]], i64 2
755
+ ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i32> [[COL_LOAD9]], i64 0
756
+ ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <5 x i32> [[TMP6]], i32 [[TMP7]], i64 3
757
+ ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <1 x i32> [[COL_LOAD12]], i64 0
758
+ ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <5 x i32> [[TMP8]], i32 [[TMP9]], i64 4
759
+ ; CHECK-NEXT: call void @use.v5i32(<5 x i32> [[TMP10]])
760
+ ; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <5 x i32> [[A:%.*]], <5 x i32> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
761
+ ; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <1 x i32> [[COL_LOAD]], <1 x i32> poison, <1 x i32> zeroinitializer
762
+ ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <5 x i32> [[SPLIT]], i64 0
763
+ ; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> poison, i32 [[TMP11]], i64 0
764
+ ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> poison, <1 x i32> zeroinitializer
765
+ ; CHECK-NEXT: [[TMP12:%.*]] = mul <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]]
766
+ ; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <1 x i32> [[COL_LOAD3]], <1 x i32> poison, <1 x i32> zeroinitializer
767
+ ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <5 x i32> [[SPLIT]], i64 1
768
+ ; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> poison, i32 [[TMP13]], i64 0
769
+ ; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> poison, <1 x i32> zeroinitializer
770
+ ; CHECK-NEXT: [[TMP14:%.*]] = mul <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]]
771
+ ; CHECK-NEXT: [[TMP15:%.*]] = add <1 x i32> [[TMP12]], [[TMP14]]
772
+ ; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <1 x i32> [[COL_LOAD6]], <1 x i32> poison, <1 x i32> zeroinitializer
773
+ ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <5 x i32> [[SPLIT]], i64 2
774
+ ; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> poison, i32 [[TMP16]], i64 0
775
+ ; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> poison, <1 x i32> zeroinitializer
776
+ ; CHECK-NEXT: [[TMP17:%.*]] = mul <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]]
777
+ ; CHECK-NEXT: [[TMP18:%.*]] = add <1 x i32> [[TMP15]], [[TMP17]]
778
+ ; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <1 x i32> [[COL_LOAD9]], <1 x i32> poison, <1 x i32> zeroinitializer
779
+ ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <5 x i32> [[SPLIT]], i64 3
780
+ ; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x i32> poison, i32 [[TMP19]], i64 0
781
+ ; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> poison, <1 x i32> zeroinitializer
782
+ ; CHECK-NEXT: [[TMP20:%.*]] = mul <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]]
783
+ ; CHECK-NEXT: [[TMP21:%.*]] = add <1 x i32> [[TMP18]], [[TMP20]]
784
+ ; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <1 x i32> [[COL_LOAD12]], <1 x i32> poison, <1 x i32> zeroinitializer
785
+ ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <5 x i32> [[SPLIT]], i64 4
786
+ ; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> poison, i32 [[TMP22]], i64 0
787
+ ; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> poison, <1 x i32> zeroinitializer
788
+ ; CHECK-NEXT: [[TMP23:%.*]] = mul <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]]
789
+ ; CHECK-NEXT: [[TMP24:%.*]] = add <1 x i32> [[TMP21]], [[TMP23]]
790
+ ; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <1 x i32> [[TMP24]], <1 x i32> poison, <1 x i32> zeroinitializer
791
+ ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x i32> undef, <1 x i32> [[TMP25]], <1 x i32> <i32 1>
792
+ ; CHECK-NEXT: ret <1 x i32> [[TMP26]]
793
+ ;
794
+ ; Input IR: %l feeds both the transpose (whose result goes to @use.v5i32)
+ ; and the multiply, giving the load more than one user.
+ %l = call <5 x i32 > @llvm.matrix.column.major.load.v5i32.i64 (ptr %src , i64 %stride , i1 false , i32 1 , i32 5 )
795
+ %t = call <5 x i32 > @llvm.matrix.transpose.v5i32 (<5 x i32 > %l , i32 1 , i32 5 )
796
+ call void @use.v5i32 (<5 x i32 > %t )
797
+ %r = call <1 x i32 > @llvm.matrix.multiply.v1i32.v5i32.v5i32 (<5 x i32 > %l , <5 x i32 > %a , i32 1 , i32 5 , i32 1 )
798
+ ret <1 x i32 > %r
799
+ }
800
+
801
+ ; External function taking a <5 x i32>; only declared here, with no body.
+ declare void @use.v5i32 (<5 x i32 >)
802
+
803
+ ; Declarations of the matrix intrinsics that are called with <5 x i32>
+ ; operands in this file.
+ ; NOTE(review): attribute groups #0 and #1 are referenced below, but their
+ ; definitions are not visible in this part of the file - confirm they exist.
+ declare <1 x i32 > @llvm.matrix.multiply.v1i32.v5i32.v5i32 (<5 x i32 >, <5 x i32 >, i32 immarg, i32 immarg, i32 immarg) #0
804
+
805
+ declare <5 x i32 > @llvm.matrix.column.major.load.v5i32.i64 (ptr nocapture , i64 , i1 immarg, i32 immarg, i32 immarg) #1
806
+
807
+ declare <5 x i32 > @llvm.matrix.transpose.v5i32 (<5 x i32 >, i32 immarg, i32 immarg) #0
0 commit comments