@@ -241,24 +241,19 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
241
241
map.map (operand.value (), entryBlock.getArgument (operand.index ()));
242
242
243
243
// Clone the region of the gpu.launch operation into the gpu.func operation.
244
- // TODO: If cloneInto can be modified such that if a mapping for
245
- // a block exists, that block will be used to clone operations into (at the
246
- // end of the block), instead of creating a new block, this would be much
247
- // cleaner.
248
244
launchOpBody.cloneInto (&outlinedFuncBody, map);
249
245
250
- // Branch from entry of the gpu.func operation to the block that is cloned
251
- // from the entry block of the gpu.launch operation.
252
- Block &launchOpEntry = launchOpBody.front ();
253
- Block *clonedLaunchOpEntry = map.lookup (&launchOpEntry);
254
- builder.setInsertionPointToEnd (&entryBlock);
255
- builder.create <cf::BranchOp>(loc, clonedLaunchOpEntry);
256
-
257
- outlinedFunc.walk ([](gpu::TerminatorOp op) {
258
- OpBuilder replacer (op);
259
- replacer.create <gpu::ReturnOp>(op.getLoc ());
260
- op.erase ();
261
- });
246
+ // Now splice the cloned entry block of the gpu.launch operation onto the end of the
247
+ // gpu.func entry block and erase the redundant block.
248
+ Block *clonedLaunchOpEntry = map.lookup (&launchOpBody.front ());
249
+ Operation *terminator = clonedLaunchOpEntry->getTerminator ();
250
+ OpBuilder replacer (terminator);
251
+ replacer.create <gpu::ReturnOp>(terminator->getLoc ());
252
+ terminator->erase ();
253
+ entryBlock.getOperations ().splice (entryBlock.getOperations ().end (),
254
+ clonedLaunchOpEntry->getOperations ());
255
+ clonedLaunchOpEntry->erase ();
256
+
262
257
return outlinedFunc;
263
258
}
264
259
0 commit comments