Skip to content

Commit 16092ea

Browse files
authored
Merge pull request #9693 from fhahn/matrix-crash
Pick fix for matrix lowering crashes
2 parents da9d5f9 + 41551d3 commit 16092ea

File tree

5 files changed

+192
-33
lines changed

5 files changed

+192
-33
lines changed

llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp

Lines changed: 54 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -97,19 +97,6 @@ static DISubprogram *getSubprogram(DIScope *Scope) {
9797
return cast<DILocalScope>(Scope)->getSubprogram();
9898
}
9999

100-
/// Erase \p V from \p BB and move \p II forward to avoid invalidating
101-
/// iterators.
102-
static void eraseFromParentAndMove(Value *V, BasicBlock::reverse_iterator &II,
103-
BasicBlock &BB) {
104-
auto *Inst = cast<Instruction>(V);
105-
// Still used, don't erase.
106-
if (!Inst->use_empty())
107-
return;
108-
if (II != BB.rend() && Inst == &*II)
109-
++II;
110-
Inst->eraseFromParent();
111-
}
112-
113100
/// Return true if V is a splat of a value (which is used when multiplying a
114101
/// matrix with a scalar).
115102
static bool isSplat(Value *V) {
@@ -259,7 +246,7 @@ static bool isUniformShape(Value *V) {
259246
/// Return the ShapeInfo for the result of \p I, if it can be determined.
260247
static std::optional<ShapeInfo>
261248
computeShapeInfoForInst(Instruction *I,
262-
const ValueMap<Value *, ShapeInfo> &ShapeMap) {
249+
const DenseMap<Value *, ShapeInfo> &ShapeMap) {
263250
Value *M;
264251
Value *N;
265252
Value *K;
@@ -492,10 +479,16 @@ class LowerMatrixIntrinsics {
492479
/// the result value of the instruction, with the only exceptions being store
493480
/// instructions and the matrix_column_major_store intrinsics. For those, the
494481
/// shape information indicates that those instructions should be lowered
495-
/// using shape information as well. A ValueMap is used so that when
496-
/// sub-passes like optimizeTransposes performs RAUW the map stays
497-
/// up-to-date.
498-
ValueMap<Value *, ShapeInfo> ShapeMap;
482+
/// using shape information as well. Note that extra care is needed when
483+
/// erasing or RAUW'ing a value that is present in ShapeMap. If the
484+
/// replacement is also a matrix operation, use
485+
/// updateShapeAndReplaceAllUsesWith to make sure the replacement is added to
486+
/// ShapeMap. We don't use ValueMap, as there are also cases where we do not
487+
/// want to add shape information for a replacement instruction. When directly
488+
/// erasing a value with an entry in ShapeMap, use
489+
/// eraseFromParentAndRemoveFromShapeMap to make sure ShapeMap is also updated
490+
/// accordingly.
491+
DenseMap<Value *, ShapeInfo> ShapeMap;
499492

500493
/// List of instructions to remove. While lowering, we are not replacing all
501494
/// users of a lowered instruction, if shape information is available and
@@ -759,6 +752,30 @@ class LowerMatrixIntrinsics {
759752
return Operation(T0, Shape0.t(), T1, Shape1.t());
760753
}
761754

755+
/// Erase \p Inst from both ShapeMap (if an entry exists) and erase \p Inst
756+
/// itself.
757+
void eraseFromParentAndRemoveFromShapeMap(Instruction *Inst) {
758+
auto Iter = ShapeMap.find(Inst);
759+
if (Iter != ShapeMap.end())
760+
ShapeMap.erase(Iter);
761+
Inst->eraseFromParent();
762+
}
763+
764+
/// Erase \p V from \p BB and move \p II forward to avoid invalidating
765+
/// iterators.
766+
void eraseFromParentAndMove(Value *V, BasicBlock::reverse_iterator &II,
767+
BasicBlock &BB) {
768+
auto *Inst = cast<Instruction>(V);
769+
// Still used, don't erase.
770+
if (!Inst->use_empty())
771+
return;
772+
if (II != BB.rend() && Inst == &*II)
773+
++II;
774+
eraseFromParentAndRemoveFromShapeMap(Inst);
775+
}
776+
777+
/// Add a new entry to ShapeMap for \p New with \p Old's shape info, erase the
778+
/// entry for \p Old and replace all uses of \p Old with \p New.
762779
void updateShapeAndReplaceAllUsesWith(Instruction &Old, Value *New) {
763780
// We need to remove Old from the ShapeMap otherwise RAUW will replace it
764781
// with New. We should only add New if it supportsShapeInfo so we insert
@@ -872,13 +889,13 @@ class LowerMatrixIntrinsics {
872889

873890
void liftTranspose(Instruction &I) {
874891
// Erase dead Instructions after lifting transposes from binops.
875-
auto CleanupBinOp = [](Instruction &T, Value *A, Value *B) {
892+
auto CleanupBinOp = [this](Instruction &T, Value *A, Value *B) {
876893
if (T.use_empty())
877-
T.eraseFromParent();
894+
eraseFromParentAndRemoveFromShapeMap(&T);
878895
if (A->use_empty())
879-
cast<Instruction>(A)->eraseFromParent();
896+
eraseFromParentAndRemoveFromShapeMap(cast<Instruction>(A));
880897
if (A != B && B->use_empty())
881-
cast<Instruction>(B)->eraseFromParent();
898+
eraseFromParentAndRemoveFromShapeMap(cast<Instruction>(B));
882899
};
883900

884901
Value *A, *B, *AT, *BT;
@@ -908,8 +925,7 @@ class LowerMatrixIntrinsics {
908925
match(B, m_Intrinsic<Intrinsic::matrix_transpose>(
909926
m_Value(BT), m_ConstantInt(), m_ConstantInt()))) {
910927
IRBuilder<> Builder(&I);
911-
auto *Add = cast<Instruction>(Builder.CreateFAdd(AT, BT, "mfadd"));
912-
setShapeInfo(Add, {R, C});
928+
auto *Add = Builder.CreateFAdd(AT, BT, "mfadd");
913929
MatrixBuilder MBuilder(Builder);
914930
Instruction *NewInst = MBuilder.CreateMatrixTranspose(
915931
Add, R->getZExtValue(), C->getZExtValue(), "mfadd_t");
@@ -918,9 +934,13 @@ class LowerMatrixIntrinsics {
918934
computeShapeInfoForInst(&I, ShapeMap) &&
919935
"Shape of new instruction doesn't match original shape.");
920936
CleanupBinOp(I, A, B);
921-
assert(computeShapeInfoForInst(Add, ShapeMap).value_or(ShapeMap[Add]) ==
922-
ShapeMap[Add] &&
923-
"Shape of updated addition doesn't match cached shape.");
937+
if (auto *AddI = dyn_cast<Instruction>(Add)) {
938+
setShapeInfo(AddI, {R, C});
939+
assert(
940+
computeShapeInfoForInst(AddI, ShapeMap).value_or(ShapeMap[AddI]) ==
941+
ShapeMap[AddI] &&
942+
"Shape of updated addition doesn't match cached shape.");
943+
}
924944
}
925945
}
926946

@@ -1014,7 +1034,8 @@ class LowerMatrixIntrinsics {
10141034

10151035
// Third, try to fuse candidates.
10161036
for (CallInst *CI : MaybeFusableInsts)
1017-
LowerMatrixMultiplyFused(CI, FusedInsts, LifetimeEnds);
1037+
if (!FusedInsts.contains(CI))
1038+
LowerMatrixMultiplyFused(CI, FusedInsts, LifetimeEnds);
10181039

10191040
Changed = !FusedInsts.empty();
10201041

@@ -1475,7 +1496,7 @@ class LowerMatrixIntrinsics {
14751496
m_Value(Arg)))) {
14761497
auto *NewLoad = Builder.CreateLoad(Op->getType(), Arg);
14771498
Op->replaceAllUsesWith(NewLoad);
1478-
cast<Instruction>(Op)->eraseFromParent();
1499+
eraseFromParentAndRemoveFromShapeMap(cast<Instruction>(Op));
14791500
return;
14801501
} else if (match(Op, m_Intrinsic<Intrinsic::matrix_transpose>(
14811502
m_Value(Arg)))) {
@@ -1844,15 +1865,15 @@ class LowerMatrixIntrinsics {
18441865
// Mark eliminated instructions as fused and remove them.
18451866
FusedInsts.insert(Store);
18461867
FusedInsts.insert(MatMul);
1847-
Store->eraseFromParent();
1848-
MatMul->eraseFromParent();
1868+
eraseFromParentAndRemoveFromShapeMap(Store);
1869+
eraseFromParentAndRemoveFromShapeMap(MatMul);
18491870
if (LoadOp0->hasNUses(0)) {
18501871
FusedInsts.insert(LoadOp0);
1851-
LoadOp0->eraseFromParent();
1872+
eraseFromParentAndRemoveFromShapeMap(LoadOp0);
18521873
}
18531874
if (LoadOp1 != LoadOp0 && LoadOp1->hasNUses(0)) {
18541875
FusedInsts.insert(LoadOp1);
1855-
LoadOp1->eraseFromParent();
1876+
eraseFromParentAndRemoveFromShapeMap(LoadOp1);
18561877
}
18571878
}
18581879

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -p lower-matrix-intrinsics -S %s | FileCheck %s
3+
4+
define void @test(ptr %p, <8 x i32> %x) {
5+
; CHECK-LABEL: define void @test(
6+
; CHECK-SAME: ptr [[P:%.*]], <8 x i32> [[X:%.*]]) {
7+
; CHECK-NEXT: [[L:%.*]] = load <8 x i32>, ptr [[P]], align 4
8+
; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <1 x i32> zeroinitializer
9+
; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <1 x i32> <i32 1>
10+
; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <1 x i32> <i32 2>
11+
; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <1 x i32> <i32 3>
12+
; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <1 x i32> <i32 4>
13+
; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <1 x i32> <i32 5>
14+
; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <1 x i32> <i32 6>
15+
; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <1 x i32> <i32 7>
16+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i32> [[SPLIT]], i64 0
17+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0
18+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <1 x i32> [[SPLIT1]], i64 0
19+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP3]], i64 1
20+
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i32> [[SPLIT2]], i64 0
21+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP5]], i64 2
22+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i32> [[SPLIT3]], i64 0
23+
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP7]], i64 3
24+
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <1 x i32> [[SPLIT4]], i64 0
25+
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP9]], i64 4
26+
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <1 x i32> [[SPLIT5]], i64 0
27+
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP11]], i64 5
28+
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i32> [[SPLIT6]], i64 0
29+
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[TMP13]], i64 6
30+
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i32> [[SPLIT7]], i64 0
31+
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[TMP15]], i64 7
32+
; CHECK-NEXT: [[TMP17:%.*]] = mul <8 x i32> [[L]], [[TMP16]]
33+
; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP17]])
34+
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <1 x i32> poison, i32 [[TMP18]], i64 0
35+
; CHECK-NEXT: [[E:%.*]] = extractelement <1 x i32> [[TMP19]], i64 0
36+
; CHECK-NEXT: store i32 [[E]], ptr [[P]], align 4
37+
; CHECK-NEXT: ret void
38+
;
39+
%l = load <8 x i32>, ptr %p, align 4
40+
%t = tail call <8 x i32> @llvm.matrix.transpose.v8i32(<8 x i32> %x, i32 1, i32 8)
41+
%m = tail call <1 x i32> @llvm.matrix.multiply.v1i32.v8i32.v8i32(<8 x i32> %l, <8 x i32> %t, i32 1, i32 8, i32 1)
42+
%e = extractelement <1 x i32> %m, i64 0
43+
store i32 %e, ptr %p, align 4
44+
ret void
45+
}
46+
47+
declare <8 x i32> @llvm.matrix.transpose.v8i32(<8 x i32>, i32 immarg, i32 immarg)
48+
49+
declare <1 x i32> @llvm.matrix.multiply.v1i32.v8i32.v8i32(<8 x i32>, <8 x i32>, i32 immarg, i32 immarg, i32 immarg)

llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-transpose-int.ll

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,3 +190,33 @@ declare <1 x i32> @llvm.matrix.multiply.v1i32.v5i32.v5i32(<5 x i32>, <5 x i32>,
190190
declare <5 x i32> @llvm.matrix.column.major.load.v5i32.i64(ptr nocapture, i64, i1 immarg, i32 immarg, i32 immarg) #1
191191

192192
declare <5 x i32> @llvm.matrix.transpose.v5i32(<5 x i32>, i32 immarg, i32 immarg) #0
193+
194+
define <1 x i32> @test_dot_product_with_transposed_shuffle_op(<4 x i32> %a, <2 x i32> %b) {
195+
; CHECK-LABEL: @test_dot_product_with_transposed_shuffle_op(
196+
; CHECK-NEXT: entry:
197+
; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
198+
; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
199+
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[SPLIT]], i64 0
200+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
201+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SPLIT1]], i64 0
202+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP2]], i64 1
203+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[SPLIT]], i64 1
204+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i64 0
205+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[SPLIT1]], i64 1
206+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP6]], i64 1
207+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
208+
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
209+
; CHECK-NEXT: [[TMP9:%.*]] = mul <2 x i32> [[SHUFFLE]], [[B:%.*]]
210+
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP9]])
211+
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <1 x i32> poison, i32 [[TMP10]], i64 0
212+
; CHECK-NEXT: ret <1 x i32> [[TMP11]]
213+
;
214+
entry:
215+
%t.a = tail call <4 x i32> @llvm.matrix.transpose.v4i32(<4 x i32> %a, i32 2, i32 2)
216+
%shuffle = shufflevector <4 x i32> %t.a, <4 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
217+
%t.shuffle = call <2 x i32> @llvm.matrix.transpose.v2i32(<2 x i32> %shuffle, i32 2, i32 1)
218+
%m = call <1 x i32> @llvm.matrix.multiply.v1i32.v2i32.v2i32(<2 x i32> %t.shuffle, <2 x i32> %b, i32 1, i32 2, i32 1)
219+
ret <1 x i32> %m
220+
}
221+
222+
declare <2 x i32> @llvm.matrix.transpose.v2i32(<2 x i32>, i32 immarg, i32 immarg)
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -p lower-matrix-intrinsics -S %s | FileCheck %s
3+
4+
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
5+
6+
define <8 x float> @transpose_constant_fold_fadd_AT_BT() {
7+
; CHECK-LABEL: define <8 x float> @transpose_constant_fold_fadd_AT_BT() {
8+
; CHECK-NEXT: [[ENTRY:.*:]]
9+
; CHECK-NEXT: ret <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
10+
;
11+
entry:
12+
%t = tail call <8 x float> @llvm.matrix.transpose.v8f32(<8 x float> splat (float 1.0), i32 8, i32 1)
13+
%f = fadd <8 x float> %t, %t
14+
ret <8 x float> %f
15+
}
16+
17+
define <8 x float> @transpose_constant_fold_fmul_A_k() {
18+
; CHECK-LABEL: define <8 x float> @transpose_constant_fold_fmul_A_k() {
19+
; CHECK-NEXT: [[ENTRY:.*:]]
20+
; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>, <8 x float> poison, <8 x i32> zeroinitializer
21+
; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x float> [[SPLAT]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
22+
; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <8 x float> [[SPLAT]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
23+
; CHECK-NEXT: [[TMP0:%.*]] = fmul <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[SPLIT]]
24+
; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[SPLIT1]]
25+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
26+
; CHECK-NEXT: ret <8 x float> [[TMP2]]
27+
;
28+
entry:
29+
%t.1 = tail call <8 x float> @llvm.matrix.transpose.v8f32(<8 x float> splat (float 1.0), i32 4, i32 2)
30+
%splat = shufflevector <8 x float> splat (float 3.0), <8 x float> poison, <8 x i32> zeroinitializer
31+
%m = fmul <8 x float> %t.1, %splat
32+
%t.2 = tail call <8 x float> @llvm.matrix.transpose.v8f32(<8 x float> %m, i32 2, i32 4)
33+
ret <8 x float> %t.2
34+
}
35+
36+
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
37+
declare <8 x float> @llvm.matrix.transpose.v8f32(<8 x float>, i32 immarg, i32 immarg) #0
38+
39+
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

llvm/test/Transforms/LowerMatrixIntrinsics/transpose-opts-lifting.ll

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,28 @@ entry:
144144
ret <6 x double> %mul
145145
}
146146

147+
define void @test_remove_entries_from_shape_map(<3 x float> %a, <2 x float> %b, <6 x float> %c, ptr %dst) {
148+
; CHECK-LABEL: define void @test_remove_entries_from_shape_map(
149+
; CHECK-SAME: <3 x float> [[A:%.*]], <2 x float> [[B:%.*]], <6 x float> [[C:%.*]], ptr [[DST:%.*]]) {
150+
; CHECK-NEXT: [[ENTRY:.*:]]
151+
; CHECK-NEXT: [[TMP0:%.*]] = call <6 x float> @llvm.matrix.multiply.v6f32.v3f32.v2f32(<3 x float> [[A]], <2 x float> [[B]], i32 3, i32 1, i32 2)
152+
; CHECK-NEXT: [[MFADD:%.*]] = fadd <6 x float> [[C]], [[TMP0]]
153+
; CHECK-NEXT: [[MFADD_T:%.*]] = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> [[MFADD]], i32 3, i32 2)
154+
; CHECK-NEXT: store <6 x float> [[MFADD_T]], ptr [[DST]], align 4
155+
; CHECK-NEXT: ret void
156+
;
157+
entry:
158+
%m = tail call <6 x float> @llvm.matrix.multiply.v6f32.v3f32.v2f32(<3 x float> %a, <2 x float> %b, i32 3, i32 1, i32 2)
159+
%add = fadd <6 x float> %c, %m
160+
%t = tail call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %add, i32 3, i32 2)
161+
store <6 x float> %t, ptr %dst, align 4
162+
ret void
163+
}
164+
147165
declare <6 x double> @llvm.matrix.transpose.v6f64.v6f64(<6 x double>, i32, i32)
148166
declare <4 x double> @llvm.matrix.transpose.v4f64.v4f64(<4 x double>, i32, i32)
149167
declare <9 x double> @llvm.matrix.multiply.v9f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32)
150168
declare <6 x double> @llvm.matrix.multiply.v6f64.v6f64.v4f64(<6 x double>, <4 x double>, i32, i32, i32)
151169
declare <6 x double> @llvm.matrix.multiply.v6f64.v6f64.v6f64(<6 x double>, <4 x double>, i32, i32, i32)
170+
declare <6 x float> @llvm.matrix.transpose.v6f32(<6 x float>, i32 immarg, i32 immarg)
171+
declare <6 x float> @llvm.matrix.multiply.v6f32.v3f32.v2f32(<3 x float>, <2 x float>, i32 immarg, i32 immarg, i32 immarg)

0 commit comments

Comments
 (0)