Skip to content

[mlir][amdgpu] Shared memory access optimization pass #75627

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,23 @@ def AMDGPU_Dialect : Dialect {
"gpu::GPUDialect"
];
let useDefaultAttributePrinterParser = 1;

let extraClassDeclaration = [{
/// Return true if the given MemRefType has a memory space that is either
/// an integer matching the ROCDL shared memory address space or a
/// gpu::AddressSpaceAttr attribute with value `workgroup`.
static bool hasSharedMemoryAddressSpace(MemRefType type);

/// Return true if the given Attribute is either an integer matching the
/// ROCDL shared memory address space or a gpu::AddressSpaceAttr attribute
/// with value `workgroup`.
static bool isSharedMemoryAddressSpace(Attribute type);

/// Defines the MemRef memory space attribute numeric value that indicates
/// a memref is located in shared memory. This should correspond to the
/// value used in ROCDL.
static constexpr unsigned kSharedMemoryAddressSpace = 3;
}];
}

//===----------------------------------------------------------------------===//
Expand Down
3 changes: 2 additions & 1 deletion mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ namespace mlir {
class ConversionTarget;
namespace amdgpu {

#define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
#define GEN_PASS_DECL

#define GEN_PASS_REGISTRATION
#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"

Expand Down
13 changes: 13 additions & 0 deletions mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,17 @@ def AmdgpuEmulateAtomicsPass : Pass<"amdgpu-emulate-atomics"> {
"Chipset that these operations will run on">];
}

// Declarative definition of the `amdgpu-optimize-shared-memory` pass. Per its
// summary and description, the pass attempts to optimize reads/writes of
// memrefs that represent GPU shared memory so that bank conflicts are reduced.
def OptimizeSharedMemory : Pass<"amdgpu-optimize-shared-memory"> {
let summary = "Optimizes accesses to shared memory memrefs in order to reduce bank conflicts.";
let description = [{
This pass adds a transformation and pass to the AMDGPU dialect that
attempts to optimize reads/writes from a memref representing GPU shared
memory in order to avoid bank conflicts.
}];

// Dialects this pass declares as dependencies.
let dependentDialects = [
"memref::MemRefDialect", "vector::VectorDialect"
];
}

#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
54 changes: 54 additions & 0 deletions mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
//===- Transforms.h - AMDGPU Dialect transformations --------------*-
// C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares functions that assist transformations for the amdgpu
// dialect.
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_
#define MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_

#include "mlir/IR/Operation.h"
#include "mlir/Support/LogicalResult.h"

namespace mlir {
class RewriterBase;

namespace amdgpu {

///
/// Passes
///

/// Optimizes vectorized accesses to the shared memory buffer `memrefValue` in
/// order to reduce shared memory bank conflicts.
///
/// This transformation assumes the following:
/// 1) All relevant accesses to `memrefValue` are contained within `parentOp`.
/// 2) No subviews are taken of `memrefValue`; the function fails its
///    precondition checks otherwise. All reads/writes to `memrefValue` should
///    occur through `memrefValue` directly.
///
/// Shared memory bank conflicts occur when multiple threads attempt to read or
/// write locations assigned to the same shared memory bank. For `2^N` byte
/// vectorized accesses, we need to be concerned with conflicts among threads
/// identified as `(tid) -> tid.floordiv(2^{7-N})`. As such, this transformation
/// changes every indexed memory access (vector.load, memref.load, etc.) so
/// that the index value of the final dimension is permuted as
///   `newColIndex = oldColIndex % vectorSize +
///                  perm[rowIndex](oldColIndex / vectorSize, rowIndex)`
/// where `rowIndex` is the index of the second-to-last dimension and
/// `perm[rowIndex]` is a permutation function that depends on the row index.
/// The permutation function is chosen to ensure that sequential
/// distributed+vectorized reads/writes down a single dimension of the memref
/// have minimal conflicts.
///
/// \param parentOp    Operation expected to contain all relevant accesses to
///                    `memrefValue`.
/// \param memrefValue Shared memory memref whose accesses are to be optimized.
/// \returns A LogicalResult; per the contract above, failure is expected when
///          the precondition checks do not hold (the implementation is not
///          part of this header).
mlir::LogicalResult optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
Value memrefValue);

} // namespace amdgpu
} // namespace mlir

#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_
24 changes: 24 additions & 0 deletions mlir/include/mlir/Dialect/AMDGPU/Transforms/Utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
//===- Utils.h - Transform utilities -----------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMS_UTILS_H_
#define MLIR_DIALECT_AMDGPU_TRANSFORMS_UTILS_H_

#include "mlir/IR/Operation.h"

#include <optional>

namespace mlir {
namespace amdgpu {

/// Get the indices that the given load/store operation is operating on.
/// Preconditions:
/// - The Op must have memory effects
/// - Considers memref::LoadOp, vector::LoadOp, vector::TransferReadOp
/// - Considers memref::StoreOp, vector::StoreOp, vector::TransferWriteOp
/// - Excludes subview op
///
/// NOTE(review): the optional return type suggests std::nullopt is returned
/// for operations outside the list above -- confirm against the definition.
std::optional<Operation::operand_range> getIndices(Operation *op);

/// Set the indices that the given load/store operation is operating on. The
/// same preconditions and set of considered operations as for `getIndices`
/// apply.
void setIndices(Operation *op, ArrayRef<Value> indices);

} // namespace amdgpu
} // namespace mlir

#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_UTILS_H_
15 changes: 15 additions & 0 deletions mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,21 @@ void AMDGPUDialect::initialize() {
>();
}

/// Returns true when `memorySpace` designates shared memory, i.e. it is either
/// an IntegerAttr equal to `kSharedMemoryAddressSpace` or a
/// gpu::AddressSpaceAttr whose value is `Workgroup`. A null attribute or any
/// other attribute kind yields false.
bool amdgpu::AMDGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) {
  // A null attribute never matches.
  if (!memorySpace)
    return false;

  // Numeric form: compare against the dialect's shared memory constant.
  auto numericSpace = llvm::dyn_cast<IntegerAttr>(memorySpace);
  if (numericSpace)
    return numericSpace.getInt() == AMDGPUDialect::kSharedMemoryAddressSpace;

  // Symbolic form: only the GPU dialect's workgroup address space qualifies.
  auto symbolicSpace = llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace);
  if (!symbolicSpace)
    return false;
  return symbolicSpace.getValue() == gpu::AddressSpace::Workgroup;
}

/// Returns true when the memory space attribute of `type` designates shared
/// memory, as decided by `isSharedMemoryAddressSpace`.
bool amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) {
  return isSharedMemoryAddressSpace(type.getMemorySpace());
}

//===----------------------------------------------------------------------===//
// 8-bit float ops
//===----------------------------------------------------------------------===//
Expand Down
2 changes: 2 additions & 0 deletions mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
add_mlir_dialect_library(MLIRAMDGPUTransforms
EmulateAtomics.cpp
OptimizeSharedMemory.cpp
Utils.cpp

ADDITIONAL_HEADER_DIRS
{$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms
Expand Down
Loading