Skip to content

Commit 3f37df5

Browse files
authored
[reland][mlir][amdgpu] Shared memory access optimization pass (#79164)
- Reland: #75627 - Reproduced then fixed the build issue
1 parent 66c710e commit 3f37df5

File tree

10 files changed

+465
-1
lines changed

10 files changed

+465
-1
lines changed

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,23 @@ def AMDGPU_Dialect : Dialect {
2929
"gpu::GPUDialect"
3030
];
3131
let useDefaultAttributePrinterParser = 1;
32+
33+
let extraClassDeclaration = [{
34+
/// Return true if the given MemRefType has an integer address
35+
/// space that matches the ROCDL shared memory address space or
36+
/// is a gpu::AddressSpaceAttr attribute with value `workgroup`.
37+
static bool hasSharedMemoryAddressSpace(MemRefType type);
38+
39+
/// Return true if the given Attribute is an integer address
40+
/// space that matches the ROCDL shared memory address space or
41+
/// is a gpu::AddressSpaceAttr attribute with value `workgroup`.
42+
static bool isSharedMemoryAddressSpace(Attribute type);
43+
44+
/// Defines the MemRef memory space attribute numeric value that indicates
45+
/// a memref is located in shared memory. This should correspond to the
46+
/// value used in ROCDL.
47+
static constexpr unsigned kSharedMemoryAddressSpace = 3;
48+
}];
3249
}
3350

3451
//===----------------------------------------------------------------------===//

mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ namespace mlir {
2020
class ConversionTarget;
2121
namespace amdgpu {
2222

23-
#define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
23+
#define GEN_PASS_DECL
2424
#define GEN_PASS_REGISTRATION
2525
#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
2626

mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,17 @@ def AmdgpuEmulateAtomicsPass : Pass<"amdgpu-emulate-atomics"> {
3030
"Chipset that these operations will run on">];
3131
}
3232

33+
def OptimizeSharedMemory : Pass<"amdgpu-optimize-shared-memory"> {
34+
let summary = "Optimizes accesses to shared memory memrefs in order to reduce bank conflicts.";
35+
let description = [{
36+
This pass adds a transformation and pass to the AMDGPU dialect that
37+
attempts to optimize reads/writes from a memref representing GPU shared
38+
memory in order to avoid bank conflicts.
39+
}];
40+
41+
let dependentDialects = [
42+
"memref::MemRefDialect", "vector::VectorDialect"
43+
];
44+
}
45+
3346
#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
//===- Transforms.h - AMDGPU Dialect transformations --------------*-
2+
// C++-*-===//
3+
//
4+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
//
8+
//===----------------------------------------------------------------------===//
9+
//
10+
// This file declares functions that assist transformations for the amdgpu
11+
// dialect.
12+
//
13+
//===----------------------------------------------------------------------===//
14+
#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_
15+
#define MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_
16+
17+
#include "mlir/IR/Operation.h"
18+
#include "mlir/Support/LogicalResult.h"
19+
20+
namespace mlir {
21+
class RewriterBase;
22+
23+
namespace amdgpu {
24+
25+
///
26+
/// Passes
27+
///
28+
29+
/// Optimizes vectorized accesses to a shared memory buffer specified by
30+
/// memrefValue. This transformation assumes the following:
31+
/// 1) All relevant accesses to `memrefValue` are contained within `parentOp`.
32+
/// 2) The function will fail precondition checks if any subviews are
33+
/// taken of `memrefValue`. All reads/writes to `memrefValue` should occur
34+
/// through `memrefValue` directly.
35+
///
36+
/// Shared memory bank conflicts occur when multiple threads attempt to read or
37+
/// write locations assigned to the same shared memory bank. For `2^N` byte
38+
/// vectorized accesses, we need to be concerned with conflicts among threads
39+
/// identified as `(tid) -> tid.floordiv(2^{7-N})`. As such, this transformation
40+
/// changes any indexed memory access (vector.load, memref.load, etc.)
41+
/// so that the final dimension's index value is permuted according to
42+
/// `newColIndex = oldColIndex % vectorSize +
43+
/// perm[rowIndex](oldColIndex/vectorSize, rowIndex)` where `rowIndex` is the
44+
/// index for the second-to-last dimension and `perm[rowIndex]` is a permutation
45+
/// function that depends on the row index. The permutation function is chosen
46+
/// to ensure that sequential distributed+vectorized reads/writes down a single
47+
/// dimension of the memref have minimal conflicts.
48+
mlir::LogicalResult optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
49+
Value memrefValue);
50+
51+
} // namespace amdgpu
52+
} // namespace mlir
53+
54+
#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
//===- Utils.h - Transform utilities -----------------------------*- C++-*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "mlir/IR/Operation.h"
10+
11+
namespace mlir {
12+
namespace amdgpu {
13+
14+
/// Get and set the indices that the given load/store operation is operating on.
15+
/// Preconditions:
16+
/// - The Op must have memory effects.
17+
/// - Considers memref::LoadOp, vector::LoadOp, and vector::TransferReadOp.
18+
/// - Considers memref::StoreOp, vector::StoreOp, and vector::TransferWriteOp.
19+
/// - Excludes subview op.
20+
std::optional<Operation::operand_range> getIndices(Operation *op);
21+
void setIndices(Operation *op, ArrayRef<Value> indices);
22+
23+
} // namespace amdgpu
24+
} // namespace mlir

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,21 @@ void AMDGPUDialect::initialize() {
4343
>();
4444
}
4545

46+
bool amdgpu::AMDGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) {
47+
if (!memorySpace)
48+
return false;
49+
if (auto intAttr = llvm::dyn_cast<IntegerAttr>(memorySpace))
50+
return intAttr.getInt() == AMDGPUDialect::kSharedMemoryAddressSpace;
51+
if (auto gpuAttr = llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
52+
return gpuAttr.getValue() == gpu::AddressSpace::Workgroup;
53+
return false;
54+
}
55+
56+
bool amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) {
57+
Attribute memorySpace = type.getMemorySpace();
58+
return isSharedMemoryAddressSpace(memorySpace);
59+
}
60+
4661
//===----------------------------------------------------------------------===//
4762
// 8-bit float ops
4863
//===----------------------------------------------------------------------===//

mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
add_mlir_dialect_library(MLIRAMDGPUTransforms
22
EmulateAtomics.cpp
3+
OptimizeSharedMemory.cpp
4+
Utils.cpp
35

46
ADDITIONAL_HEADER_DIRS
57
{$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms
@@ -16,4 +18,5 @@ add_mlir_dialect_library(MLIRAMDGPUTransforms
1618
MLIRPass
1719
MLIRTransforms
1820
MLIRTransformUtils
21+
MLIRVectorDialect
1922
)

0 commit comments

Comments
 (0)