@@ -50,12 +50,12 @@ constexpr int64_t kDefaultVectorSizeBits = 64;
 static Value permuteVectorOffset(OpBuilder &b, Location loc,
                                  ArrayRef<Value> indices, MemRefType memrefTy,
                                  int64_t srcDim, int64_t tgtDim) {
-  // Adjust the src index to change how often the permutation changes
-  // if necessary.
+  /// Adjust the src index to change how often the permutation changes
+  /// if necessary.
   Value src = indices[srcDim];

-  // We only want to permute every N iterations of the target dim where N is
-  // ceil(sharedMemoryLineSizeBytes / dimSizeBytes(tgtDim)).
+  /// We only want to permute every N iterations of the target dim where N is
+  /// ceil(sharedMemoryLineSizeBytes / dimSizeBytes(tgtDim)).
   const int64_t permuteEveryN = std::max<int64_t>(
       1, kSharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) *
                                         memrefTy.getElementTypeBitWidth()) /
@@ -81,8 +81,8 @@ static Value permuteVectorOffset(OpBuilder &b, Location loc,
   Value srcBits = b.create<arith::ConstantIndexOp>(loc, mask);
   srcBits = b.create<arith::AndIOp>(loc, src, srcBits);

-  // Use the src bits to permute the target bits b[N:M] containing the
-  // vector offset.
+  /// Use the src bits to permute the target bits b[N:M] containing the
+  /// vector offset.
   if (permuteEveryN > 1) {
     int64_t shlBits = n - llvm::Log2_64(permuteEveryN);
     if (shlBits > 0) {
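
The two hunks above implement an XOR swizzle on the innermost index. Below is a minimal scalar model of that arithmetic for readers skimming the diff; the 64-byte line size, the 64-bit vector width, the helper names, and the use of plain integers instead of `arith` ops are illustrative assumptions, not part of this change.

```cpp
#include <algorithm>
#include <cstdint>

// Log2 of a power-of-two value (the pass uses llvm::Log2_64).
static int64_t ilog2(int64_t x) {
  int64_t r = 0;
  while (x > 1) {
    x >>= 1;
    ++r;
  }
  return r;
}

// Scalar model of permuteVectorOffset: XOR selected bits of the row index
// into the vector-offset bits [n, m) of the column index.
static int64_t swizzledColumn(int64_t row, int64_t col, int64_t tgtDimSize,
                              int64_t elemBits) {
  constexpr int64_t kLineBytes = 64; // assumed kSharedMemoryLineSizeBytes
  constexpr int64_t kVecBits = 64;   // assumed kDefaultVectorSizeBits
  // Change the permutation only every N rows, where N rows share one line.
  const int64_t permuteEveryN =
      std::max<int64_t>(1, kLineBytes / (tgtDimSize * elemBits / 8));
  // Bits [0, n) of the column are the sub-vector offset; bits [n, m) are the
  // vector index within the row.
  const int64_t n = ilog2(kVecBits / elemBits);
  const int64_t m = ilog2(tgtDimSize);
  // Select (m - n) bits of the row index, skipping the low log2(N) bits.
  int64_t mask = (int64_t{1} << (m - n)) - 1;
  if (permuteEveryN > 1)
    mask <<= ilog2(permuteEveryN);
  int64_t srcBits = row & mask;
  // Shift the selected bits into the [n, m) range of the column index.
  const int64_t shl = n - (permuteEveryN > 1 ? ilog2(permuteEveryN) : 0);
  srcBits = shl >= 0 ? srcBits << shl : srcBits >> -shl;
  // XOR keeps the access inside the row while spreading it across banks.
  return col ^ srcBits;
}
```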
@@ -131,8 +131,8 @@ getShmReadAndWriteOps(Operation *parentOp, Value shmMemRef,
       writeOps.push_back(op);
   });

-  // Restrict to a supported set of ops. We also require at least 2D access,
-  // although this could be relaxed.
+  /// Restrict to a supported set of ops. We also require at least 2D access,
+  /// although this could be relaxed.
   if (llvm::any_of(readOps, [](Operation *op) {
         return !isa<memref::LoadOp, vector::LoadOp, vector::TransferReadOp>(
                    op) ||
@@ -157,15 +157,15 @@ mlir::amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
       !amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(memRefType))
     return failure();

-  // Abort if the given value has any sub-views; we do not do any alias
-  // analysis.
+  /// Abort if the given value has any sub-views; we do not do any alias
+  /// analysis.
   bool hasSubView = false;
   parentOp->walk([&](memref::SubViewOp subView) { hasSubView = true; });
   if (hasSubView)
     return failure();

-  // Check if this is necessary given the assumption of 128b accesses:
-  // If dim[rank-1] is small enough to fit 8 rows in a 128B line.
+  /// Check if this is necessary given the assumption of 128b accesses:
+  /// If dim[rank-1] is small enough to fit 8 rows in a 128B line.
   const int64_t rowSize = memRefType.getDimSize(memRefType.getRank() - 1);
   const int64_t rowsPerLine =
       (8 * kSharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) /
@@ -175,8 +175,8 @@ mlir::amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
   if (rowsPerLine >= threadGroupSize)
     return failure();

-  // Get sets of operations within the function that read/write to shared
-  // memory.
+  /// Get sets of operations within the function that read/write to shared
+  /// memory.
   SmallVector<Operation *, 16> shmReadOps;
   SmallVector<Operation *, 16> shmWriteOps;
   if (failed(getShmReadAndWriteOps(parentOp, memrefValue, shmReadOps,
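
A quick sanity check of the `rowsPerLine` guard in the hunks above, with an assumed 64-byte shared-memory line for illustration (the code uses `kSharedMemoryLineSizeBytes`): a memref of 16-bit elements holds 8 * 64 / 16 = 32 elements per line, so an innermost dimension of 128 gives `rowsPerLine` = 32 / 128 = 0 under integer division, which is below any thread-group size and the rewrite proceeds. A very small innermost dimension makes `rowsPerLine` large instead, and the pass bails out because enough rows already fit in one line for the accesses not to conflict.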
@@ -191,7 +191,7 @@ mlir::amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
   int64_t tgtDim = memRefType.getRank() - 1;
   int64_t srcDim = memRefType.getRank() - 2;

-  // Transform indices for the ops writing to shared memory.
+  /// Transform indices for the ops writing to shared memory.
   while (!shmWriteOps.empty()) {
     Operation *shmWriteOp = shmWriteOps.pop_back_val();
     builder.setInsertionPoint(shmWriteOp);
@@ -203,7 +203,7 @@ mlir::amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
     amdgpu::setIndices(shmWriteOp, transformedIndices);
   }

-  // Transform indices for the ops reading from shared memory.
+  /// Transform indices for the ops reading from shared memory.
   while (!shmReadOps.empty()) {
     Operation *shmReadOp = shmReadOps.pop_back_val();
     builder.setInsertionPoint(shmReadOp);
@@ -218,8 +218,7 @@ mlir::amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
   return success();
 }

-void amdgpu::optimizeSharedMemoryReadsAndWritesOp(
-    func::FuncOp funcOp) {
+void amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp) {
   SmallVector<memref::AllocOp> shmAllocOps;
   funcOp.walk([&](memref::AllocOp allocOp) {
     if (!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(allocOp.getType()))
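
As a usage note, the `func::FuncOp` entry point reformatted in the last hunk can be driven from surrounding code in the usual way. A minimal, hypothetical fragment, assuming `moduleOp` is an already-loaded `mlir::ModuleOp` and that the listed headers are available; only the call itself comes from the API shown in this diff:

```cpp
#include "mlir/Dialect/AMDGPU/Transforms/Transforms.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/BuiltinOps.h"

// Hypothetical driver: run the shared-memory swizzle on every function in a
// module.
void runOnModule(mlir::ModuleOp moduleOp) {
  moduleOp.walk([](mlir::func::FuncOp funcOp) {
    mlir::amdgpu::optimizeSharedMemoryReadsAndWritesOp(funcOp);
  });
}
```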