[MLIR][Linalg] Scalable Vectorization of Reduction on the Trailing Dimension #97788

Merged · 8 commits · Jul 24, 2024
55 changes: 47 additions & 8 deletions mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -586,6 +586,14 @@ static SmallVector<bool> getDimsToReduce(LinalgOp linalgOp) {
llvm::map_range(linalgOp.getIteratorTypesArray(), isReductionIterator));
}

/// Check if `op` is a linalg.reduce or a linalg.generic that has at least one
/// reduction iterator.
static bool hasReductionIterator(LinalgOp &op) {
return isa<linalg::ReduceOp>(op) ||
(isa<linalg::GenericOp>(op) &&
llvm::any_of(op.getIteratorTypesArray(), isReductionIterator));
}
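For illustration, the predicate above also fires for a plain linalg.generic reduction, not only linalg.reduce. A minimal sketch (hypothetical values %in and %acc, not part of this patch):

%sum = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                                        affine_map<(d0, d1) -> (d0)>],
                       iterator_types = ["parallel", "reduction"]}
       ins(%in : tensor<8x16xf32>) outs(%acc : tensor<8xf32>) {
^bb0(%a: f32, %b: f32):
  // One reduction iterator (d1), so hasReductionIterator returns true.
  %0 = arith.addf %a, %b : f32
  linalg.yield %0 : f32
} -> tensor<8xf32>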

/// Build a vector.transfer_write of `value` into `outputOperand` at indices set
/// to all `0`; where `outputOperand` is an output operand of the LinalgOp
/// currently being vectorized. If `dest` has null rank, build a memref.store.
@@ -1787,6 +1795,9 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op,
if (isa<ConvolutionOpInterface>(op.getOperation()))
return vectorizeDynamicConvOpPrecondition(op, flatten1DDepthwiseConv);

if (hasReductionIterator(op))
return reductionPreconditions(op);

// TODO: Masking only supports dynamic element-wise ops, linalg.generic ops,
// linalg.copy ops and ops that implement ContractionOpInterface for now.
if (!isElementwise(op) &&
@@ -1976,6 +1987,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
// 1. exactly 1 dim is scalable and that's the _last_ parallel dim
// 2. exactly 2 dims are scalable and those are the _last two adjacent_
// parallel dims
// 3. exactly 1 reduction dim is scalable and that's the last (innermost) dim
// The 2nd restriction above means that only Matmul-like Ops are supported
// when 2 dims are scalable, e.g. :
// * iterators = [parallel, parallel, reduction]
@@ -1992,19 +2004,45 @@ vectorizeScalableVectorPrecondition(Operation *op,
scalableFlags.pop_back();
}

-  // TODO: Support scalable vectorisation for reduction dims
-  if (iterators.back() == utils::IteratorType::reduction)
-    return failure();
-
-  // If this is not the _last_ parallel dim, 1. above is not met
-  if (seenParalell)
-    return failure();
+  switch (iterators.back()) {
+  case utils::IteratorType::reduction: {
+    // Check 3. above is met.
+    if (iterators.size() != inputVectorSizes.size()) {
+      LDBG("Non-trailing reduction dim requested for scalable "
+           "vectorization\n");
+      return failure();
+    }
+    if (isa<linalg::MatmulOp>(op) || isa<linalg::MatmulTransposeAOp>(op)) {
+      LDBG("Scalable vectorization of the reduction dim in Matmul-like ops "
+           "is not supported\n");
+      return failure();
+    }
+    break;
+  }
+  case utils::IteratorType::parallel: {
+    // Check 1. and 2. above are met.
+    if (seenParalell) {
+      LDBG("Inner parallel dim not requested for scalable "
+           "vectorization\n");
+      return failure();
+    }
+    break;
+  }
+  }

  // If present, check the 2nd scalable dim. ATM, only Matmul-like Ops are
  // supported, for which we expect the following config:
// * iterators = [parallel, parallel, reduction]
// * scalable flags = [true, true, false]
if (numOfScalableDims == 2) {
// Disallow below case which breaks 3. above:
// * iterators = [..., parallel, reduction]
// * scalable flags = [..., true, true]
if (iterators.back() == utils::IteratorType::reduction) {
LDBG("Higher dim than the trailing reduction dim requested for scalable "
"vectorization\n");
return failure();
}
scalableFlags.pop_back();
iterators.pop_back();

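To make the disallowed case concrete, a hypothetical sketch (sizes illustrative; %reduction_op stands for a matched op with iterators = [parallel, reduction]): requesting both dims scalable puts scalable flags [true, true] on a trailing reduction dim, which the check above rejects:

// Fails the precondition: 2 scalable dims ending on a reduction dim.
transform.structured.vectorize %reduction_op vector_sizes [[4], [4]] : !transform.any_op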
@@ -2017,7 +2055,8 @@ vectorizeScalableVectorPrecondition(Operation *op,
// presence of scalable vectors
return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
isa<linalg::MatmulTransposeAOp>(op) ||
-                 isa<linalg::DepthwiseConv1DNwcWcOp>(op));
+                 isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
+                 isa<linalg::MatvecOp>(op) || hasReductionIterator(linalgOp));
}

LogicalResult mlir::linalg::vectorizeOpPrecondition(
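Even with this patch, Matmul-like ops cannot make their reduction (K) dim scalable. A hypothetical sketch of a request that still fails the precondition (%matmul and %module are illustrative, following the transform-dialect usage in the tests below):

%matmul = transform.structured.match ops{["linalg.matmul"]} in %module
    : (!transform.any_op) -> !transform.any_op
// Fails: the trailing dim is a reduction and the op is Matmul-like.
transform.structured.vectorize %matmul vector_sizes [4, 4, [4]] : !transform.any_op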
165 changes: 165 additions & 0 deletions mlir/test/Dialect/Linalg/vectorization-scalable.mlir
@@ -189,3 +189,168 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}

// -----

func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor<?xf32>,
%arg1: tensor<f32>) -> tensor<f32> {

%0 = linalg.reduce ins(%arg0 : tensor<?xf32>) outs(%arg1 : tensor<f32>) dimensions = [0]
(%in: f32, %init: f32) {
%0 = arith.addf %in, %init : f32
linalg.yield %0 : f32
}
return %0 : tensor<f32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_1d(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?xf32>, %[[ARG_1:.*]]: tensor<f32>) -> tensor<f32> {
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<[4]xi1>
// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
// CHECK: %[[C0_F32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[VEC_RD_1:.*]] = vector.transfer_read %[[ARG_1]][], %[[C0_F32]] : tensor<f32>, vector<f32>
// CHECK: %[[ACC_f32:.*]] = vector.extractelement %[[VEC_RD_1]][] : vector<f32>
// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK]] { vector.multi_reduction <add>, %[[VEC_RD_0]], %[[ACC_f32]] [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
// CHECK: %[[VEC_f32:.*]] = vector.broadcast %[[REDUCE]] : f32 to vector<f32>
// CHECK: %{{.*}} = vector.transfer_write %[[VEC_f32]], %[[ARG_1]][] : vector<f32>, tensor<f32>

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
transform.yield
}
}
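These cases run under the transform interpreter. The RUN line of vectorization-scalable.mlir is not shown in this diff; assuming the usual setup, it is along these lines:

// RUN: mlir-opt %s -transform-interpreter -split-input-file | FileCheck %s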

// -----

// Note: this is the scalable version of `vectorize_dynamic_reduction` in test/Dialect/Linalg/vectorization.mlir.
func.func @vectorize_dynamic_reduction_scalable_2d(%arg0: tensor<?x?xf32>,
%arg1: tensor<?xf32>) -> tensor<?xf32> {
%0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"] }
ins(%arg0 : tensor<?x?xf32>)
outs(%arg1 : tensor<?xf32>) {
^bb(%in: f32, %out: f32) :
%0 = arith.addf %in, %out : f32
linalg.yield %0 : f32
} -> tensor<?xf32>
return %0 : tensor<?xf32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_2d(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>) -> tensor<?xf32> {
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index
// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<4x[8]xi1>
// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x[8]xf32> } : vector<4x[8]xi1> -> vector<4x[8]xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_1d:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<4xi1>
// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_1d]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[VEC_RD_0]], %[[VEC_RD_1]] [1] : vector<4x[8]xf32> to vector<4xf32> } : vector<4x[8]xi1> -> vector<4xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %{{.*}} = vector.mask %[[MASK_1d]] { vector.transfer_write %[[REDUCE]], %[[ARG_1]][%[[C0_idx]]] {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 vector_sizes [4, [8]] : !transform.any_op
transform.yield
}
}
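For reference, vector_sizes [4, [8]] keeps the parallel dim fixed at 4 and marks the reduction dim scalable, so the masked reads and the multi_reduction operate on vector<4x[8]xf32>, i.e. 4 x (8 x vscale) elements per tile; at vscale = 2, for example, that is 4 rows of 16 f32 elements.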

// -----

func.func @vectorize_dynamic_matvec_trailing_reduction_dim(%arg0: tensor<?x?xf32>,
%arg1: tensor<?xf32>,
%arg2: tensor<?xf32>) {
linalg.matvec ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?xf32>)
outs(%arg2 : tensor<?xf32>) -> tensor<?xf32>
return
}

// CHECK-LABEL: func.func @vectorize_dynamic_matvec_trailing_reduction_dim(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>, %[[ARG_2:.*]]: tensor<?xf32>) {
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index
// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<4x[4]xi1>
// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x[4]xf32> } : vector<4x[4]xi1> -> vector<4x[4]xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_d1:.*]] = vector.create_mask %[[DIM_A0_1]] : vector<[4]xi1>
// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_d1]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true], permutation_map = #map} : tensor<?xf32>, vector<4x[4]xf32> } : vector<[4]xi1> -> vector<4x[4]xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_d2:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<4xi1>
// CHECK: %[[VEC_RD_2:.*]] = vector.mask %[[MASK_d2]] { vector.transfer_read %[[ARG_2]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_RD_0:.*]], %[[VEC_RD_1:.*]] : vector<4x[4]xf32>
// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[MUL]], %[[VEC_RD_2]] [1] : vector<4x[4]xf32> to vector<4xf32> } : vector<4x[4]xi1> -> vector<4xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %{{.*}} = vector.mask %[[MASK_d2]] { vector.transfer_write %[[REDUCE]], %[[ARG_2]][%[[C0_idx]]] {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.matvec"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 vector_sizes [4, [4]] : !transform.any_op
transform.yield
}
}

// -----

func.func @vectorize_dynamic_generic_matvec_leading_parallel_dim(%arg0: tensor<?x?xf32>,
%arg1: tensor<?xf32>,
%arg2: tensor<?xf32>) -> tensor<?xf32> {
%0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"] }
ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?xf32>)
outs(%arg2 : tensor<?xf32>) {
^bb(%mat: f32, %vec: f32, %res: f32) :
%0 = arith.mulf %mat, %vec : f32
%1 = arith.addf %res, %0 : f32
linalg.yield %1 : f32
} -> tensor<?xf32>
return %0 : tensor<?xf32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_generic_matvec_leading_parallel_dim(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>, %[[ARG_2:.*]]: tensor<?xf32>) -> tensor<?xf32> {
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index
// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<[4]x4xi1>
// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<[4]x4xf32> } : vector<[4]x4xi1> -> vector<[4]x4xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_d1:.*]] = vector.create_mask %[[DIM_A0_1]] : vector<4xi1>
// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_d1]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true], permutation_map = #map} : tensor<?xf32>, vector<[4]x4xf32> } : vector<4xi1> -> vector<[4]x4xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_d2:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<[4]xi1>
// CHECK: %[[VEC_RD_2:.*]] = vector.mask %[[MASK_d2]] { vector.transfer_read %[[ARG_2]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_RD_0:.*]], %[[VEC_RD_1:.*]] : vector<[4]x4xf32>
// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[MUL]], %[[VEC_RD_2]] [1] : vector<[4]x4xf32> to vector<[4]xf32> } : vector<[4]x4xi1> -> vector<[4]xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %{{.*}} = vector.mask %[[MASK_d2]] { vector.transfer_write %[[REDUCE]], %[[ARG_2]][%[[C0_idx]]] {in_bounds = [true]} : vector<[4]xf32>, tensor<?xf32> } : vector<[4]xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 vector_sizes [[4], 4] : !transform.any_op
transform.yield
}
}
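Unlike the matvec cases above, this last test makes the parallel dim scalable ([[4], 4]) and keeps the trailing reduction dim fixed, so it exercises restriction 1 (the scalable dim is the last parallel dim) rather than the new restriction 3 (trailing reduction dim scalable).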