@@ -391,13 +391,18 @@ void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
                        /*posTupleNum=*/Value(), std::nullopt, 0);
   if (dimGetter && !isSynTensor(tid)) {
     for (Level l = 0; l < lvlRank; l++) {
-      dependentLvlMap[tid][l] = dimGetter(tid, l);
+      std::vector<std::pair<LoopId, unsigned>> deps = dimGetter(tid, l);
+      // Sort the dependencies by loop order.
+      std::sort(deps.begin(), deps.end(),
+                [](auto &lhs, auto &rhs) { return lhs.first < rhs.first; });
+
+      dependentLvlMap[tid][l] = std::move(deps);
       unsigned depends = dependentLvlMap[tid][l].size();
       if (depends == 0)
         continue;
-      sliceMeta[tid][l].assign(depends, std::make_pair(nullptr, 0));
+      sliceMeta[tid][l].reserve(depends);
       // We need `depends - 1` slices to fully reduce the affine expression.
-      slicePosBuffer[tid][l].assign(depends - 1, nullptr);
+      slicePosBuffer[tid][l].reserve(depends - 1);
     }
   }
 }
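Note that the hunk above only reserves capacity for sliceMeta and slicePosBuffer; the entries are now filled in later by initSliceDriven (second hunk below). A minimal standalone sketch of the new sorting step, using a stand-in LoopId typedef and a hypothetical dimGetter result rather than the real MLIR types, is:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

using LoopId = unsigned; // stand-in for the sparse tensor LoopId typedef

int main() {
  // Hypothetical dimGetter(tid, l) result: (loop, coefficient) pairs in
  // arbitrary order, e.g. for an affine level index 2 * i1 + i0.
  std::vector<std::pair<LoopId, unsigned>> deps = {{1, 2}, {0, 1}};

  // Same comparator as the patch: order dependencies by loop id so they
  // are reduced in loop order.
  std::sort(deps.begin(), deps.end(),
            [](auto &lhs, auto &rhs) { return lhs.first < rhs.first; });

  for (auto [loop, coeff] : deps)
    std::printf("loop %u, coefficient %u\n", loop, coeff);
  // With deps.size() == 2, a single slice (deps.size() - 1) suffices to
  // fully reduce the expression, which is what slicePosBuffer reserves.
}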
@@ -487,35 +492,70 @@ void LoopEmitter::initializeLoopEmit(
     // hoist the code ouside if-conditions.
   }
 
-  Type indexType = builder.getIndexType();
-  Value c0 = constantZero(builder, loc, indexType);
+  initSliceDriven(builder, loc);
+}
+
+void LoopEmitter::initSliceDriven(OpBuilder &builder, Location loc) {
+  Value c0 = C_IDX(0);
   for (TensorId t = 0, e = tensors.size(); t < e; t++) {
     auto rtp = dyn_cast<RankedTensorType>(tensors[t].getType());
     if (!rtp)
       continue;
 
     Level lvlRank = SparseTensorType(rtp).getLvlRank();
+
+    // Compute the dependency reduction order.
+    auto remDepStack = dependentLvlMap;
+    std::vector<std::tuple<LoopId, TensorId, Level>> depRedOrder;
     for (Level lvl = 0; lvl < lvlRank; lvl++) {
-      if (!dependentLvlMap[t][lvl].empty()) {
-        ArrayRef<std::pair<TensorLevel, unsigned>> depLvls =
-            dependentLvlMap[t][lvl];
-        // Needs at least two operands to form a non-trivial affine expression.
-        assert(depLvls.size() == sliceMeta[t][lvl].size());
-
-        Value size = c0;
-        for (int e = depLvls.size() - 1; e >= 0; e--) {
-          auto [dt, dl] = unpackTensorLevel(depLvls[e].first);
-          unsigned stride = depLvls[e].second;
-          Value stridedSize = lvlSizes[dt][dl];
-          if (stride != 1)
-            stridedSize = MULI(stridedSize, C_IDX(stride));
-          size = ADDI(size, stridedSize);
-          sliceMeta[t][lvl][e] = std::make_pair(size, stride);
-        }
+      // Reverse the queue into a stack.
+      std::reverse(remDepStack[t][lvl].begin(), remDepStack[t][lvl].end());
+      for (auto [loop, coeff] : dependentLvlMap[t][lvl])
+        depRedOrder.emplace_back(std::make_tuple(loop, t, lvl));
+    }
+
+    if (depRedOrder.empty())
+      continue;
+    std::sort(depRedOrder.begin(), depRedOrder.end(),
+              [](auto &l, auto &r) { return std::get<0>(l) < std::get<0>(r); });
+
+    for (auto [loop, t, lvl] : depRedOrder) {
+      std::pair<LoopId, unsigned> curDep = remDepStack[t][lvl].back();
+      assert(curDep.first == loop);
+      Value size = c0;
+      for (auto [loop, stride] : remDepStack[t][lvl]) {
+        // The synthetic tensor high defines the loop upper bound.
+        Value loopHi = highs[getSynTensorId()][loop];
+        size = ADDI(size, MULI(loopHi, C_IDX(stride)));
       }
+      sliceMeta[t][lvl].emplace_back(size, curDep.second);
+      remDepStack[t][lvl].pop_back();
+
+      // Generate the caches required to quickly compute the next non-empty
+      // slice with increasing offset for slice-based loops.
+      // We do not need caches for dense levels.
+      if (!remDepStack[t][lvl].empty() && !isDenseLT(lvls[t][lvl]->getLT())) {
+        Value cnt = C_IDX(1);
+        for (int preLvl = lvl - 1; preLvl >= 0; preLvl--) {
+          if (remDepStack[t][preLvl].empty())
+            break;
+          assert(remDepStack[t][preLvl].size() == 1 && "Not implemented");
+          auto [loop, stride] = remDepStack[t][preLvl].back();
+          assert(stride == 1 && "Not yet implemented");
+          // Accumulate the size required to cache the pLo for the slice.
+          // E.g., if we want to cache the pIdx for slice<d0xd1xf64> on the
+          // second level, we need at most a memref<d0xindex>.
+          //
+          // NOTE: this is apparently an over-approximation when the previous
+          // level is compressed, and we could compute a precise memory size
+          // inside the loops. But that would also require us to allocate/free
+          // memory in loops.
+          cnt = MULI(highs[getSynTensorId()][loop], cnt);
+        }
+        slicePosBuffer[t][lvl].push_back(allocSlicePosBuf(builder, loc, cnt));
+      } // else fully resolved.
     }
   }
-  localInsertPos = builder.getInsertionPoint()->getPrevNode();
 }
 
 void LoopEmitter::categorizeLoopCondition(
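For intuition, the two quantities initSliceDriven materializes as MLIR index arithmetic can be modeled with plain integers: the slice bound is the sum of loopHi * stride over the remaining dependencies, and the position-cache size is the product of the bounds of the still-unresolved previous levels. A plain-C++ sketch with hypothetical loop bounds (not part of the patch) is:

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Hypothetical loop upper bounds, i.e. the values the emitter reads from
  // highs[getSynTensorId()][loop].
  std::vector<unsigned> loopHi = {8, 5};

  // Remaining (loop, stride) dependencies for one sparse level.
  std::vector<std::pair<unsigned, unsigned>> remDep = {{0, 2}, {1, 1}};

  // Slice bound: mirrors size = ADDI(size, MULI(loopHi, C_IDX(stride))).
  unsigned size = 0;
  for (auto [loop, stride] : remDep)
    size += loopHi[loop] * stride;
  std::printf("slice size bound = %u\n", size); // 8 * 2 + 5 * 1 = 21

  // Position-cache size: mirrors cnt = MULI(highs[...][loop], cnt) over the
  // previous, still-unresolved levels (here a single level driven by loop 0).
  unsigned cnt = 1;
  cnt *= loopHi[0];
  std::printf("slicePosBuffer tuple count = %u\n", cnt); // 8
}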
@@ -1878,9 +1918,6 @@ void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc,
   // simple dim expression in between).
   assert(lvl == *sliceStack[tid].back().slicedOnLvl + 1);
 
-  // Check slice stack integrity.
-  assert(slicePosBuffer[tid][lvl - 1].size() == sliceStack[tid].back().depth);
-
   SmallVector<const SliceInfo *> unResSlices;
   std::optional<std::pair<TensorId, Level>> firstResLvl;
   for (Level curLvl = lvl; curLvl >= 1; curLvl--) {
@@ -2006,37 +2043,6 @@ bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
   if (baseEnc.isSlice())
     llvm_unreachable("TODO: not yet implemented");
 
-  // Generate caches required to fast compute next-non-empty slices with
-  // increasing offset for slice-base loop.
-  // We do not need cache for dense levels.
-  if (slicePosBuffer[tid][lvl][0] == nullptr && !isDenseLT(lvlType)) {
-    OpBuilder::InsertionGuard guard(builder);
-    // The buffer can be reused, and the size is loop invariant: it only
-    // depends on the iteration graph's toposort.
-    builder.setInsertionPointAfter(localInsertPos);
-    Value tupleCnt = C_IDX(1);
-    // Accumlates the size required to cache the pLo for the slice.
-    // E.g., if we want to cache the pIdx for slice<d0xd1xf64> on the second
-    // level. We at most need to a memref<d0xindex>.
-    // NOTE: this is apperantly an over-approximation when the previous
-    // level is compressed, and we can compute a precise memory size
-    // inside the loops. But that would also requires us to allocate/free
-    // memorys in loops.
-    // TODO: Maybe using allocaScopeOp inside the loop to resolve the issue?
-    for (Level curLevel = lvl;
-         curLevel >= 1 && !lvlFullyResolved(tid, curLevel - 1); curLevel--) {
-      // We only handle cases when all the previously unresolved levels are
-      // fully reduced.
-      assert(depFullyReduced(tid, curLevel - 1));
-      assert(!sliceMeta[tid][curLevel - 1].empty());
-      auto [sz, stride] = sliceMeta[tid][curLevel - 1].back();
-      assert(stride == 1 && "Not yet implemented");
-      tupleCnt = MULI(tupleCnt, sz);
-    }
-    for (Value &cache : slicePosBuffer[tid][lvl])
-      cache = allocSlicePosBuf(builder, loc, tupleCnt);
-  }
-
   if (sliceInfo.isInitialTensor() ||
       (lvl >= 1 && lvlFullyResolved(tid, lvl - 1))) {
     // First level or previous level has been full resolved.