Skip to content

Commit 677b0d3

Browse files
committed
[Matrix] Add dot product tests with builtin loads with variable strides
Extra tests for D147330.
1 parent eee024b commit 677b0d3

File tree

1 file changed

+124
-0
lines changed

1 file changed

+124
-0
lines changed

llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-int.ll

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -681,3 +681,127 @@ define <1 x i32> @test_builtin_column_major_load_multiuse(ptr %src, <4 x i32> %b
681681
}
682682

683683
; External function taking a <4 x i32>; only declared here, never defined.
; NOTE(review): presumably the extra user in
; @test_builtin_column_major_load_multiuse above -- that body is not fully
; visible in this view, confirm.
declare void @use.v4i32(<4 x i32>)
684+
685+
; Dot product whose LHS comes from @llvm.matrix.column.major.load with a
; runtime stride: the load is non-volatile with 1 row and 4 columns, and the
; multiply is (1 x 4) * (4 x 1) with %a as the RHS. The load's only user is
; the multiply.
;
; The expected output below has one <1 x i32> load per column, addressed at
; (column index * %stride) i32 elements from %src, followed by a per-column
; extract/splat/mul sequence whose products are summed with a chain of adds.
define <1 x i32> @test_builtin_column_major_variable_stride(ptr %src, <4 x i32> %a, i64 %stride) {
686+
; CHECK-LABEL: @test_builtin_column_major_variable_stride(
687+
; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
688+
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[VEC_START]]
689+
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x i32>, ptr [[VEC_GEP]], align 4
690+
; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]]
691+
; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[VEC_START1]]
692+
; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x i32>, ptr [[VEC_GEP2]], align 4
693+
; CHECK-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE]]
694+
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[VEC_START4]]
695+
; CHECK-NEXT: [[COL_LOAD6:%.*]] = load <1 x i32>, ptr [[VEC_GEP5]], align 4
696+
; CHECK-NEXT: [[VEC_START7:%.*]] = mul i64 3, [[STRIDE]]
697+
; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[VEC_START7]]
698+
; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x i32>, ptr [[VEC_GEP8]], align 4
699+
; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
700+
; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <1 x i32> [[COL_LOAD]], <1 x i32> poison, <1 x i32> zeroinitializer
701+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 0
702+
; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> poison, i32 [[TMP1]], i64 0
703+
; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> poison, <1 x i32> zeroinitializer
704+
; CHECK-NEXT: [[TMP2:%.*]] = mul <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]]
705+
; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <1 x i32> [[COL_LOAD3]], <1 x i32> poison, <1 x i32> zeroinitializer
706+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 1
707+
; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x i32> poison, i32 [[TMP3]], i64 0
708+
; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT11]], <1 x i32> poison, <1 x i32> zeroinitializer
709+
; CHECK-NEXT: [[TMP4:%.*]] = mul <1 x i32> [[BLOCK10]], [[SPLAT_SPLAT12]]
710+
; CHECK-NEXT: [[TMP5:%.*]] = add <1 x i32> [[TMP2]], [[TMP4]]
711+
; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <1 x i32> [[COL_LOAD6]], <1 x i32> poison, <1 x i32> zeroinitializer
712+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 2
713+
; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> poison, i32 [[TMP6]], i64 0
714+
; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> poison, <1 x i32> zeroinitializer
715+
; CHECK-NEXT: [[TMP7:%.*]] = mul <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]]
716+
; CHECK-NEXT: [[TMP8:%.*]] = add <1 x i32> [[TMP5]], [[TMP7]]
717+
; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <1 x i32> [[COL_LOAD9]], <1 x i32> poison, <1 x i32> zeroinitializer
718+
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 3
719+
; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> poison, i32 [[TMP9]], i64 0
720+
; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> poison, <1 x i32> zeroinitializer
721+
; CHECK-NEXT: [[TMP10:%.*]] = mul <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]]
722+
; CHECK-NEXT: [[TMP11:%.*]] = add <1 x i32> [[TMP8]], [[TMP10]]
723+
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> poison, <1 x i32> zeroinitializer
724+
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <1 x i32> undef, <1 x i32> [[TMP12]], <1 x i32> <i32 1>
725+
; CHECK-NEXT: ret <1 x i32> [[TMP13]]
726+
;
727+
%l = call <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(ptr %src, i64 %stride, i1 false, i32 1, i32 4)
728+
%r = call <1 x i32> @llvm.matrix.multiply.v1i32.v4i32.v4i32(<4 x i32> %l, <4 x i32> %a, i32 1, i32 4, i32 1)
729+
ret <1 x i32> %r
730+
}
731+
732+
; Variable-stride dot product where the loaded matrix has more than one user.
; %l is a non-volatile 1-row, 5-column @llvm.matrix.column.major.load with a
; runtime %stride. It feeds both a transpose, whose result is passed to
; @use.v5i32, and the (1 x 5) * (5 x 1) multiply with %a.
;
; The expected output below has five <1 x i32> column loads at
; (column index * %stride) i32 elements from %src. For the @use.v5i32 call,
; the loaded elements are gathered into a single <5 x i32> with
; extractelement/insertelement pairs. The same column loads are then reused
; by the per-column extract/splat/mul sequence and its chain of adds.
define <1 x i32> @test_builtin_column_major_variable_stride_multiuse(ptr %src, <5 x i32> %a, i64 %stride) {
733+
; CHECK-LABEL: @test_builtin_column_major_variable_stride_multiuse(
734+
; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
735+
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[VEC_START]]
736+
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x i32>, ptr [[VEC_GEP]], align 4
737+
; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]]
738+
; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[VEC_START1]]
739+
; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x i32>, ptr [[VEC_GEP2]], align 4
740+
; CHECK-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE]]
741+
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[VEC_START4]]
742+
; CHECK-NEXT: [[COL_LOAD6:%.*]] = load <1 x i32>, ptr [[VEC_GEP5]], align 4
743+
; CHECK-NEXT: [[VEC_START7:%.*]] = mul i64 3, [[STRIDE]]
744+
; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[VEC_START7]]
745+
; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x i32>, ptr [[VEC_GEP8]], align 4
746+
; CHECK-NEXT: [[VEC_START10:%.*]] = mul i64 4, [[STRIDE]]
747+
; CHECK-NEXT: [[VEC_GEP11:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[VEC_START10]]
748+
; CHECK-NEXT: [[COL_LOAD12:%.*]] = load <1 x i32>, ptr [[VEC_GEP11]], align 4
749+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i32> [[COL_LOAD]], i64 0
750+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x i32> poison, i32 [[TMP1]], i64 0
751+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <1 x i32> [[COL_LOAD3]], i64 0
752+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x i32> [[TMP2]], i32 [[TMP3]], i64 1
753+
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i32> [[COL_LOAD6]], i64 0
754+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x i32> [[TMP4]], i32 [[TMP5]], i64 2
755+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i32> [[COL_LOAD9]], i64 0
756+
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <5 x i32> [[TMP6]], i32 [[TMP7]], i64 3
757+
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <1 x i32> [[COL_LOAD12]], i64 0
758+
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <5 x i32> [[TMP8]], i32 [[TMP9]], i64 4
759+
; CHECK-NEXT: call void @use.v5i32(<5 x i32> [[TMP10]])
760+
; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <5 x i32> [[A:%.*]], <5 x i32> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
761+
; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <1 x i32> [[COL_LOAD]], <1 x i32> poison, <1 x i32> zeroinitializer
762+
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <5 x i32> [[SPLIT]], i64 0
763+
; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> poison, i32 [[TMP11]], i64 0
764+
; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> poison, <1 x i32> zeroinitializer
765+
; CHECK-NEXT: [[TMP12:%.*]] = mul <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]]
766+
; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <1 x i32> [[COL_LOAD3]], <1 x i32> poison, <1 x i32> zeroinitializer
767+
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <5 x i32> [[SPLIT]], i64 1
768+
; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x i32> poison, i32 [[TMP13]], i64 0
769+
; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT14]], <1 x i32> poison, <1 x i32> zeroinitializer
770+
; CHECK-NEXT: [[TMP14:%.*]] = mul <1 x i32> [[BLOCK13]], [[SPLAT_SPLAT15]]
771+
; CHECK-NEXT: [[TMP15:%.*]] = add <1 x i32> [[TMP12]], [[TMP14]]
772+
; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <1 x i32> [[COL_LOAD6]], <1 x i32> poison, <1 x i32> zeroinitializer
773+
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <5 x i32> [[SPLIT]], i64 2
774+
; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x i32> poison, i32 [[TMP16]], i64 0
775+
; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT17]], <1 x i32> poison, <1 x i32> zeroinitializer
776+
; CHECK-NEXT: [[TMP17:%.*]] = mul <1 x i32> [[BLOCK16]], [[SPLAT_SPLAT18]]
777+
; CHECK-NEXT: [[TMP18:%.*]] = add <1 x i32> [[TMP15]], [[TMP17]]
778+
; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <1 x i32> [[COL_LOAD9]], <1 x i32> poison, <1 x i32> zeroinitializer
779+
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <5 x i32> [[SPLIT]], i64 3
780+
; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x i32> poison, i32 [[TMP19]], i64 0
781+
; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT20]], <1 x i32> poison, <1 x i32> zeroinitializer
782+
; CHECK-NEXT: [[TMP20:%.*]] = mul <1 x i32> [[BLOCK19]], [[SPLAT_SPLAT21]]
783+
; CHECK-NEXT: [[TMP21:%.*]] = add <1 x i32> [[TMP18]], [[TMP20]]
784+
; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <1 x i32> [[COL_LOAD12]], <1 x i32> poison, <1 x i32> zeroinitializer
785+
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <5 x i32> [[SPLIT]], i64 4
786+
; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x i32> poison, i32 [[TMP22]], i64 0
787+
; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT23]], <1 x i32> poison, <1 x i32> zeroinitializer
788+
; CHECK-NEXT: [[TMP23:%.*]] = mul <1 x i32> [[BLOCK22]], [[SPLAT_SPLAT24]]
789+
; CHECK-NEXT: [[TMP24:%.*]] = add <1 x i32> [[TMP21]], [[TMP23]]
790+
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <1 x i32> [[TMP24]], <1 x i32> poison, <1 x i32> zeroinitializer
791+
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x i32> undef, <1 x i32> [[TMP25]], <1 x i32> <i32 1>
792+
; CHECK-NEXT: ret <1 x i32> [[TMP26]]
793+
;
794+
%l = call <5 x i32> @llvm.matrix.column.major.load.v5i32.i64(ptr %src, i64 %stride, i1 false, i32 1, i32 5)
795+
%t = call <5 x i32> @llvm.matrix.transpose.v5i32(<5 x i32> %l, i32 1, i32 5)
796+
call void @use.v5i32(<5 x i32> %t)
797+
%r = call <1 x i32> @llvm.matrix.multiply.v1i32.v5i32.v5i32(<5 x i32> %l, <5 x i32> %a, i32 1, i32 5, i32 1)
798+
ret <1 x i32> %r
799+
}
800+
801+
; External function taking a <5 x i32>; called by
; @test_builtin_column_major_variable_stride_multiuse so that the loaded
; matrix has a user besides the multiply.
declare void @use.v5i32(<5 x i32>)
802+
803+
; <5 x i32> variants of the matrix intrinsics called by
; @test_builtin_column_major_variable_stride_multiuse.
; NOTE(review): attribute groups #0 and #1 are referenced here but their
; definitions are not in the visible part of the file -- confirm they exist.
declare <1 x i32> @llvm.matrix.multiply.v1i32.v5i32.v5i32(<5 x i32>, <5 x i32>, i32 immarg, i32 immarg, i32 immarg) #0
804+
805+
declare <5 x i32> @llvm.matrix.column.major.load.v5i32.i64(ptr nocapture, i64, i1 immarg, i32 immarg, i32 immarg) #1
806+
807+
declare <5 x i32> @llvm.matrix.transpose.v5i32(<5 x i32>, i32 immarg, i32 immarg) #0

0 commit comments

Comments
 (0)